001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.escape; 016 017import static com.google.common.base.Preconditions.checkNotNull; 018import static java.lang.Math.min; 019 020import com.google.common.annotations.GwtCompatible; 021import java.util.Map; 022import javax.annotation.CheckForNull; 023import org.checkerframework.checker.nullness.qual.Nullable; 024 025/** 026 * A {@link UnicodeEscaper} that uses an array to quickly look up replacement characters for a given 027 * code point. An additional safe range is provided that determines whether code points without 028 * specific replacements are to be considered safe and left unescaped or should be escaped in a 029 * general way. 030 * 031 * <p>A good example of usage of this class is for HTML escaping where the replacement array 032 * contains information about the named HTML entities such as {@code &} and {@code "} while 033 * {@link #escapeUnsafe} is overridden to handle general escaping of the form {@code &#NNNNN;}. 034 * 035 * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is proportional to the 036 * highest valued code point that requires escaping. For example a replacement map containing the 037 * single character '{@code \}{@code u1000}' will require approximately 16K of memory. If you need 038 * to create multiple escaper instances that have the same character replacement mapping consider 039 * using {@link ArrayBasedEscaperMap}. 040 * 041 * @author David Beaumont 042 * @since 15.0 043 */ 044@GwtCompatible 045@ElementTypesAreNonnullByDefault 046public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper { 047 // The replacement array (see ArrayBasedEscaperMap). 048 private final char[][] replacements; 049 // The number of elements in the replacement array. 050 private final int replacementsLength; 051 // The first code point in the safe range. 052 private final int safeMin; 053 // The last code point in the safe range. 054 private final int safeMax; 055 056 // Cropped values used in the fast path range checks. 057 private final char safeMinChar; 058 private final char safeMaxChar; 059 060 /** 061 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified 062 * safe range. If {@code safeMax < safeMin} then no code points are considered safe. 063 * 064 * <p>If a code point has no mapped replacement then it is checked against the safe range. If it 065 * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed. 066 * 067 * @param replacementMap a map of characters to their escaped representations 068 * @param safeMin the lowest character value in the safe range 069 * @param safeMax the highest character value in the safe range 070 * @param unsafeReplacement the default replacement for unsafe characters or null if no default 071 * replacement is required 072 */ 073 protected ArrayBasedUnicodeEscaper( 074 Map<Character, String> replacementMap, 075 int safeMin, 076 int safeMax, 077 @Nullable String unsafeReplacement) { 078 this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax, unsafeReplacement); 079 } 080 081 /** 082 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified 083 * safe range. If {@code safeMax < safeMin} then no code points are considered safe. This 084 * initializer is useful when explicit instances of ArrayBasedEscaperMap are used to allow the 085 * sharing of large replacement mappings. 086 * 087 * <p>If a code point has no mapped replacement then it is checked against the safe range. If it 088 * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed. 089 * 090 * @param escaperMap the map of replacements 091 * @param safeMin the lowest character value in the safe range 092 * @param safeMax the highest character value in the safe range 093 * @param unsafeReplacement the default replacement for unsafe characters or null if no default 094 * replacement is required 095 */ 096 protected ArrayBasedUnicodeEscaper( 097 ArrayBasedEscaperMap escaperMap, 098 int safeMin, 099 int safeMax, 100 @Nullable String unsafeReplacement) { 101 checkNotNull(escaperMap); // GWT specific check (do not optimize) 102 this.replacements = escaperMap.getReplacementArray(); 103 this.replacementsLength = replacements.length; 104 if (safeMax < safeMin) { 105 // If the safe range is empty, set the range limits to opposite extremes 106 // to ensure the first test of either value will fail. 107 safeMax = -1; 108 safeMin = Integer.MAX_VALUE; 109 } 110 this.safeMin = safeMin; 111 this.safeMax = safeMax; 112 113 // This is a bit of a hack but lets us do quicker per-character checks in 114 // the fast path code. The safe min/max values are very unlikely to extend 115 // into the range of surrogate characters, but if they do we must not test 116 // any values in that range. To see why, consider the case where: 117 // safeMin <= {hi,lo} <= safeMax 118 // where {hi,lo} are characters forming a surrogate pair such that: 119 // codePointOf(hi, lo) > safeMax 120 // which would result in the surrogate pair being (wrongly) considered safe. 121 // If we clip the safe range used during the per-character tests so it is 122 // below the values of characters in surrogate pairs, this cannot occur. 123 // This approach does mean that we break out of the fast path code in cases 124 // where we don't strictly need to, but this situation will almost never 125 // occur in practice. 126 if (safeMin >= Character.MIN_HIGH_SURROGATE) { 127 // The safe range is empty or the all safe code points lie in or above the 128 // surrogate range. Either way the character range is empty. 129 this.safeMinChar = Character.MAX_VALUE; 130 this.safeMaxChar = 0; 131 } else { 132 // The safe range is non-empty and contains values below the surrogate 133 // range but may extend above it. We may need to clip the maximum value. 134 this.safeMinChar = (char) safeMin; 135 this.safeMaxChar = (char) min(safeMax, Character.MIN_HIGH_SURROGATE - 1); 136 } 137 } 138 139 /* 140 * This is overridden to improve performance. Rough benchmarking shows that this almost doubles 141 * the speed when processing strings that do not require any escaping. 142 */ 143 @Override 144 public final String escape(String s) { 145 checkNotNull(s); // GWT specific check (do not optimize) 146 for (int i = 0; i < s.length(); i++) { 147 char c = s.charAt(i); 148 if ((c < replacementsLength && replacements[c] != null) 149 || c > safeMaxChar 150 || c < safeMinChar) { 151 return escapeSlow(s, i); 152 } 153 } 154 return s; 155 } 156 157 /** 158 * Escapes a single Unicode code point using the replacement array and safe range values. If the 159 * given character does not have an explicit replacement and lies outside the safe range then 160 * {@link #escapeUnsafe} is called. 161 * 162 * @return the replacement characters, or {@code null} if no escaping was required 163 */ 164 @Override 165 @CheckForNull 166 protected final char[] escape(int cp) { 167 if (cp < replacementsLength) { 168 char[] chars = replacements[cp]; 169 if (chars != null) { 170 return chars; 171 } 172 } 173 if (cp >= safeMin && cp <= safeMax) { 174 return null; 175 } 176 return escapeUnsafe(cp); 177 } 178 179 /* Overridden for performance. */ 180 @Override 181 protected final int nextEscapeIndex(CharSequence csq, int index, int end) { 182 while (index < end) { 183 char c = csq.charAt(index); 184 if ((c < replacementsLength && replacements[c] != null) 185 || c > safeMaxChar 186 || c < safeMinChar) { 187 break; 188 } 189 index++; 190 } 191 return index; 192 } 193 194 /** 195 * Escapes a code point that has no direct explicit value in the replacement array and lies 196 * outside the stated safe range. Subclasses should override this method to provide generalized 197 * escaping for code points if required. 198 * 199 * <p>Note that arrays returned by this method must not be modified once they have been returned. 200 * However it is acceptable to return the same array multiple times (even for different input 201 * characters). 202 * 203 * @param cp the Unicode code point to escape 204 * @return the replacement characters, or {@code null} if no escaping was required 205 */ 206 @CheckForNull 207 protected abstract char[] escapeUnsafe(int cp); 208}