001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.escape; 018 019import static com.google.common.base.Preconditions.checkNotNull; 020 021import com.google.common.annotations.Beta; 022import com.google.common.annotations.GwtCompatible; 023 024import java.util.Map; 025 026import javax.annotation.Nullable; 027 028/** 029 * A {@link UnicodeEscaper} that uses an array to quickly look up replacement 030 * characters for a given code point. An additional safe range is provided that 031 * determines whether code points without specific replacements are to be 032 * considered safe and left unescaped or should be escaped in a general way. 033 * 034 * <p>A good example of usage of this class is for HTML escaping where the 035 * replacement array contains information about the named HTML entities 036 * such as {@code &} and {@code "} while {@link #escapeUnsafe} is 037 * overridden to handle general escaping of the form {@code &#NNNNN;}. 038 * 039 * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is 040 * proportional to the highest valued code point that requires escaping. 041 * For example a replacement map containing the single character 042 * '{@code \}{@code u1000}' will require approximately 16K of memory. If you 043 * need to create multiple escaper instances that have the same character 044 * replacement mapping consider using {@link ArrayBasedEscaperMap}. 045 * 046 * @author David Beaumont 047 * @since 15.0 048 */ 049@Beta 050@GwtCompatible 051public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper { 052 // The replacement array (see ArrayBasedEscaperMap). 053 private final char[][] replacements; 054 // The number of elements in the replacement array. 055 private final int replacementsLength; 056 // The first code point in the safe range. 057 private final int safeMin; 058 // The last code point in the safe range. 059 private final int safeMax; 060 061 // Cropped values used in the fast path range checks. 062 private final char safeMinChar; 063 private final char safeMaxChar; 064 065 /** 066 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement 067 * map and specified safe range. If {@code safeMax < safeMin} then no code 068 * points are considered safe. 069 * 070 * <p>If a code point has no mapped replacement then it is checked against the 071 * safe range. If it lies outside that, then {@link #escapeUnsafe} is 072 * called, otherwise no escaping is performed. 073 * 074 * @param replacementMap a map of characters to their escaped representations 075 * @param safeMin the lowest character value in the safe range 076 * @param safeMax the highest character value in the safe range 077 * @param unsafeReplacement the default replacement for unsafe characters or 078 * null if no default replacement is required 079 */ 080 protected ArrayBasedUnicodeEscaper(Map<Character, String> replacementMap, 081 int safeMin, int safeMax, @Nullable String unsafeReplacement) { 082 083 this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax, 084 unsafeReplacement); 085 } 086 087 /** 088 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement 089 * map and specified safe range. If {@code safeMax < safeMin} then no code 090 * points are considered safe. This initializer is useful when explicit 091 * instances of ArrayBasedEscaperMap are used to allow the sharing of large 092 * replacement mappings. 093 * 094 * <p>If a code point has no mapped replacement then it is checked against the 095 * safe range. If it lies outside that, then {@link #escapeUnsafe} is 096 * called, otherwise no escaping is performed. 097 * 098 * @param escaperMap the map of replacements 099 * @param safeMin the lowest character value in the safe range 100 * @param safeMax the highest character value in the safe range 101 * @param unsafeReplacement the default replacement for unsafe characters or 102 * null if no default replacement is required 103 */ 104 protected ArrayBasedUnicodeEscaper(ArrayBasedEscaperMap escaperMap, 105 int safeMin, int safeMax, @Nullable String unsafeReplacement) { 106 107 checkNotNull(escaperMap); // GWT specific check (do not optimize) 108 this.replacements = escaperMap.getReplacementArray(); 109 this.replacementsLength = replacements.length; 110 if (safeMax < safeMin) { 111 // If the safe range is empty, set the range limits to opposite extremes 112 // to ensure the first test of either value will fail. 113 safeMax = -1; 114 safeMin = Integer.MAX_VALUE; 115 } 116 this.safeMin = safeMin; 117 this.safeMax = safeMax; 118 119 // This is a bit of a hack but lets us do quicker per-character checks in 120 // the fast path code. The safe min/max values are very unlikely to extend 121 // into the range of surrogate characters, but if they do we must not test 122 // any values in that range. To see why, consider the case where: 123 // safeMin <= {hi,lo} <= safeMax 124 // where {hi,lo} are characters forming a surrogate pair such that: 125 // codePointOf(hi, lo) > safeMax 126 // which would result in the surrogate pair being (wrongly) considered safe. 127 // If we clip the safe range used during the per-character tests so it is 128 // below the values of characters in surrogate pairs, this cannot occur. 129 // This approach does mean that we break out of the fast path code in cases 130 // where we don't strictly need to, but this situation will almost never 131 // occur in practice. 132 if (safeMin >= Character.MIN_HIGH_SURROGATE) { 133 // The safe range is empty or the all safe code points lie in or above the 134 // surrogate range. Either way the character range is empty. 135 this.safeMinChar = Character.MAX_VALUE; 136 this.safeMaxChar = 0; 137 } else { 138 // The safe range is non empty and contains values below the surrogate 139 // range but may extend above it. We may need to clip the maximum value. 140 this.safeMinChar = (char) safeMin; 141 this.safeMaxChar = (char) Math.min(safeMax, 142 Character.MIN_HIGH_SURROGATE - 1); 143 } 144 } 145 146 /* 147 * This is overridden to improve performance. Rough benchmarking shows that 148 * this almost doubles the speed when processing strings that do not require 149 * any escaping. 150 */ 151 @Override 152 public final String escape(String s) { 153 checkNotNull(s); // GWT specific check (do not optimize) 154 for (int i = 0; i < s.length(); i++) { 155 char c = s.charAt(i); 156 if ((c < replacementsLength && replacements[c] != null) || 157 c > safeMaxChar || c < safeMinChar) { 158 return escapeSlow(s, i); 159 } 160 } 161 return s; 162 } 163 164 /* Overridden for performance. */ 165 @Override 166 protected final int nextEscapeIndex(CharSequence csq, int index, int end) { 167 while (index < end) { 168 char c = csq.charAt(index); 169 if ((c < replacementsLength && replacements[c] != null) || 170 c > safeMaxChar || c < safeMinChar) { 171 break; 172 } 173 index++; 174 } 175 return index; 176 } 177 178 /** 179 * Escapes a single Unicode code point using the replacement array and safe 180 * range values. If the given character does not have an explicit replacement 181 * and lies outside the safe range then {@link #escapeUnsafe} is called. 182 */ 183 @Override 184 protected final char[] escape(int cp) { 185 if (cp < replacementsLength) { 186 char[] chars = replacements[cp]; 187 if (chars != null) { 188 return chars; 189 } 190 } 191 if (cp >= safeMin && cp <= safeMax) { 192 return null; 193 } 194 return escapeUnsafe(cp); 195 } 196 197 /** 198 * Escapes a code point that has no direct explicit value in the replacement 199 * array and lies outside the stated safe range. Subclasses should override 200 * this method to provide generalized escaping for code points if required. 201 * 202 * <p>Note that arrays returned by this method must not be modified once they 203 * have been returned. However it is acceptable to return the same array 204 * multiple times (even for different input characters). 205 * 206 * @param cp the Unicode code point to escape 207 * @return the replacement characters, or {@code null} if no escaping was 208 * required 209 */ 210 protected abstract char[] escapeUnsafe(int cp); 211}