/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.escape;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;

import java.util.Map;

/**
 * A {@link CharEscaper} backed by an array that is indexed by {@code char}
 * value, giving a fast lookup of the replacement for each character. A "safe
 * range" complements the array: a character with no array entry is left
 * untouched when it falls inside the range, and is escaped in a general way
 * when it falls outside it.
 *
 * <p>Java source code escaping is a typical use: the replacement array holds
 * the special ASCII characters such as {@code \\t} and {@code \\n}, and
 * {@link #escapeUnsafe} is overridden to produce the general
 * {@code \\uxxxx} form.
 *
 * <p>The data structure used by {@link ArrayBasedCharEscaper} grows in
 * proportion to the highest valued character that requires escaping. A
 * replacement map holding only the character
 * '{@code \}{@code u1000}' will, for example, need approximately 16K of
 * memory. When several escaper instances share one character replacement
 * mapping, consider using {@link ArrayBasedEscaperMap}.
 *
 * @author Sven Mawson
 * @author David Beaumont
 * @since 15.0
 */
@Beta
@GwtCompatible
public abstract class ArrayBasedCharEscaper extends CharEscaper {
  // Replacement table indexed by char value (see ArrayBasedEscaperMap).
  private final char[][] replacements;
  // Cached length of the replacement table.
  private final int replacementsLength;
  // Lower bound (inclusive) of the safe range.
  private final char safeMin;
  // Upper bound (inclusive) of the safe range.
  private final char safeMax;

  /**
   * Builds an escaper from a replacement map and a safe range. When
   * {@code safeMax < safeMin} the safe range is empty and no character is
   * treated as safe.
   *
   * <p>A character without a mapped replacement is compared with the safe
   * range: inside the range it is left unescaped, outside the range it is
   * passed to {@link #escapeUnsafe}.
   *
   * @param replacementMap a map of characters to their escaped representations
   * @param safeMin the lowest character value in the safe range
   * @param safeMax the highest character value in the safe range
   */
  protected ArrayBasedCharEscaper(Map<Character, String> replacementMap,
      char safeMin, char safeMax) {
    this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax);
  }

  /**
   * Builds an escaper from an existing {@link ArrayBasedEscaperMap} and a safe
   * range. When {@code safeMax < safeMin} the safe range is empty and no
   * character is treated as safe. Use this form when explicit
   * ArrayBasedEscaperMap instances are kept so that a large replacement
   * mapping can be shared.
   *
   * <p>A character without a mapped replacement is compared with the safe
   * range: inside the range it is left unescaped, outside the range it is
   * passed to {@link #escapeUnsafe}.
   *
   * @param escaperMap the mapping of characters to be escaped
   * @param safeMin the lowest character value in the safe range
   * @param safeMax the highest character value in the safe range
   */
  protected ArrayBasedCharEscaper(ArrayBasedEscaperMap escaperMap,
      char safeMin, char safeMax) {
    checkNotNull(escaperMap);  // GWT specific check (do not optimize)
    this.replacements = escaperMap.getReplacementArray();
    this.replacementsLength = this.replacements.length;
    if (safeMax < safeMin) {
      // Empty safe range: store the bounds at opposite extremes so that the
      // first comparison against either bound will (almost certainly) fail.
      this.safeMin = Character.MAX_VALUE;
      this.safeMax = Character.MIN_VALUE;
    } else {
      this.safeMin = safeMin;
      this.safeMax = safeMax;
    }
  }

  /*
   * Overridden for performance: the input is scanned for the first character
   * that needs attention and handed to escapeSlow from that index; an input
   * with no such character is returned as is. Rough benchmarking shows that
   * this almost doubles the speed when processing strings that do not require
   * any escaping.
   */
  @Override
  public final String escape(String s) {
    checkNotNull(s);  // GWT specific check (do not optimize).
    final int length = s.length();
    for (int index = 0; index < length; index++) {
      final char current = s.charAt(index);
      final boolean hasReplacement =
          current < replacementsLength && replacements[current] != null;
      if (hasReplacement || current < safeMin || current > safeMax) {
        return escapeSlow(s, index);
      }
    }
    return s;
  }

  /**
   * Escapes one character by consulting the replacement array first and the
   * safe range second. A character that has no explicit replacement and lies
   * outside the safe range is delegated to {@link #escapeUnsafe}; a character
   * inside the safe range yields {@code null}.
   */
  @Override
  protected final char[] escape(char c) {
    final char[] replacement =
        (c < replacementsLength) ? replacements[c] : null;
    if (replacement != null) {
      return replacement;
    }
    if (c < safeMin || c > safeMax) {
      return escapeUnsafe(c);
    }
    return null;
  }

  /**
   * Escapes a {@code char} value that has no direct explicit value in the
   * replacement array and lies outside the stated safe range. Subclasses
   * override this method to supply generalized escaping for such characters.
   *
   * <p>Arrays returned by this method must not be modified after they have
   * been returned. Returning the same array more than once (even for
   * different input characters) is acceptable.
   *
   * @param c the character to escape
   * @return the replacement characters, or {@code null} if no escaping was
   *     required
   */
  // TODO(user,cpovirk): Rename this something better once refactoring done
  protected abstract char[] escapeUnsafe(char c);
}