001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.escape; 016 017import static com.google.common.base.Preconditions.checkNotNull; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.GwtCompatible; 021import java.util.Map; 022import javax.annotation.CheckForNull; 023import org.checkerframework.checker.nullness.qual.Nullable; 024 025/** 026 * A {@link UnicodeEscaper} that uses an array to quickly look up replacement characters for a given 027 * code point. An additional safe range is provided that determines whether code points without 028 * specific replacements are to be considered safe and left unescaped or should be escaped in a 029 * general way. 030 * 031 * <p>A good example of usage of this class is for HTML escaping where the replacement array 032 * contains information about the named HTML entities such as {@code &} and {@code "} while 033 * {@link #escapeUnsafe} is overridden to handle general escaping of the form {@code &#NNNNN;}. 034 * 035 * <p>The size of the data structure used by {@link ArrayBasedUnicodeEscaper} is proportional to the 036 * highest valued code point that requires escaping. For example a replacement map containing the 037 * single character '{@code \}{@code u1000}' will require approximately 16K of memory. If you need 038 * to create multiple escaper instances that have the same character replacement mapping consider 039 * using {@link ArrayBasedEscaperMap}. 040 * 041 * @author David Beaumont 042 * @since 15.0 043 */ 044@Beta 045@GwtCompatible 046@ElementTypesAreNonnullByDefault 047public abstract class ArrayBasedUnicodeEscaper extends UnicodeEscaper { 048 // The replacement array (see ArrayBasedEscaperMap). 049 private final char[][] replacements; 050 // The number of elements in the replacement array. 051 private final int replacementsLength; 052 // The first code point in the safe range. 053 private final int safeMin; 054 // The last code point in the safe range. 055 private final int safeMax; 056 057 // Cropped values used in the fast path range checks. 058 private final char safeMinChar; 059 private final char safeMaxChar; 060 061 /** 062 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified 063 * safe range. If {@code safeMax < safeMin} then no code points are considered safe. 064 * 065 * <p>If a code point has no mapped replacement then it is checked against the safe range. If it 066 * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed. 067 * 068 * @param replacementMap a map of characters to their escaped representations 069 * @param safeMin the lowest character value in the safe range 070 * @param safeMax the highest character value in the safe range 071 * @param unsafeReplacement the default replacement for unsafe characters or null if no default 072 * replacement is required 073 */ 074 protected ArrayBasedUnicodeEscaper( 075 Map<Character, String> replacementMap, 076 int safeMin, 077 int safeMax, 078 @Nullable String unsafeReplacement) { 079 this(ArrayBasedEscaperMap.create(replacementMap), safeMin, safeMax, unsafeReplacement); 080 } 081 082 /** 083 * Creates a new ArrayBasedUnicodeEscaper instance with the given replacement map and specified 084 * safe range. If {@code safeMax < safeMin} then no code points are considered safe. This 085 * initializer is useful when explicit instances of ArrayBasedEscaperMap are used to allow the 086 * sharing of large replacement mappings. 087 * 088 * <p>If a code point has no mapped replacement then it is checked against the safe range. If it 089 * lies outside that, then {@link #escapeUnsafe} is called, otherwise no escaping is performed. 090 * 091 * @param escaperMap the map of replacements 092 * @param safeMin the lowest character value in the safe range 093 * @param safeMax the highest character value in the safe range 094 * @param unsafeReplacement the default replacement for unsafe characters or null if no default 095 * replacement is required 096 */ 097 protected ArrayBasedUnicodeEscaper( 098 ArrayBasedEscaperMap escaperMap, 099 int safeMin, 100 int safeMax, 101 @Nullable String unsafeReplacement) { 102 checkNotNull(escaperMap); // GWT specific check (do not optimize) 103 this.replacements = escaperMap.getReplacementArray(); 104 this.replacementsLength = replacements.length; 105 if (safeMax < safeMin) { 106 // If the safe range is empty, set the range limits to opposite extremes 107 // to ensure the first test of either value will fail. 108 safeMax = -1; 109 safeMin = Integer.MAX_VALUE; 110 } 111 this.safeMin = safeMin; 112 this.safeMax = safeMax; 113 114 // This is a bit of a hack but lets us do quicker per-character checks in 115 // the fast path code. The safe min/max values are very unlikely to extend 116 // into the range of surrogate characters, but if they do we must not test 117 // any values in that range. To see why, consider the case where: 118 // safeMin <= {hi,lo} <= safeMax 119 // where {hi,lo} are characters forming a surrogate pair such that: 120 // codePointOf(hi, lo) > safeMax 121 // which would result in the surrogate pair being (wrongly) considered safe. 122 // If we clip the safe range used during the per-character tests so it is 123 // below the values of characters in surrogate pairs, this cannot occur. 124 // This approach does mean that we break out of the fast path code in cases 125 // where we don't strictly need to, but this situation will almost never 126 // occur in practice. 127 if (safeMin >= Character.MIN_HIGH_SURROGATE) { 128 // The safe range is empty or the all safe code points lie in or above the 129 // surrogate range. Either way the character range is empty. 130 this.safeMinChar = Character.MAX_VALUE; 131 this.safeMaxChar = 0; 132 } else { 133 // The safe range is non empty and contains values below the surrogate 134 // range but may extend above it. We may need to clip the maximum value. 135 this.safeMinChar = (char) safeMin; 136 this.safeMaxChar = (char) Math.min(safeMax, Character.MIN_HIGH_SURROGATE - 1); 137 } 138 } 139 140 /* 141 * This is overridden to improve performance. Rough benchmarking shows that this almost doubles 142 * the speed when processing strings that do not require any escaping. 143 */ 144 @Override 145 public final String escape(String s) { 146 checkNotNull(s); // GWT specific check (do not optimize) 147 for (int i = 0; i < s.length(); i++) { 148 char c = s.charAt(i); 149 if ((c < replacementsLength && replacements[c] != null) 150 || c > safeMaxChar 151 || c < safeMinChar) { 152 return escapeSlow(s, i); 153 } 154 } 155 return s; 156 } 157 158 /** 159 * Escapes a single Unicode code point using the replacement array and safe range values. If the 160 * given character does not have an explicit replacement and lies outside the safe range then 161 * {@link #escapeUnsafe} is called. 162 * 163 * @return the replacement characters, or {@code null} if no escaping was required 164 */ 165 @Override 166 @CheckForNull 167 protected final char[] escape(int cp) { 168 if (cp < replacementsLength) { 169 char[] chars = replacements[cp]; 170 if (chars != null) { 171 return chars; 172 } 173 } 174 if (cp >= safeMin && cp <= safeMax) { 175 return null; 176 } 177 return escapeUnsafe(cp); 178 } 179 180 /* Overridden for performance. */ 181 @Override 182 protected final int nextEscapeIndex(CharSequence csq, int index, int end) { 183 while (index < end) { 184 char c = csq.charAt(index); 185 if ((c < replacementsLength && replacements[c] != null) 186 || c > safeMaxChar 187 || c < safeMinChar) { 188 break; 189 } 190 index++; 191 } 192 return index; 193 } 194 195 /** 196 * Escapes a code point that has no direct explicit value in the replacement array and lies 197 * outside the stated safe range. Subclasses should override this method to provide generalized 198 * escaping for code points if required. 199 * 200 * <p>Note that arrays returned by this method must not be modified once they have been returned. 201 * However it is acceptable to return the same array multiple times (even for different input 202 * characters). 203 * 204 * @param cp the Unicode code point to escape 205 * @return the replacement characters, or {@code null} if no escaping was required 206 */ 207 @CheckForNull 208 protected abstract char[] escapeUnsafe(int cp); 209}