001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.escape; 016 017import static com.google.common.base.Preconditions.checkNotNull; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.GwtCompatible; 021import com.google.errorprone.annotations.CanIgnoreReturnValue; 022import java.util.HashMap; 023import java.util.Map; 024import javax.annotation.CheckForNull; 025import org.checkerframework.checker.nullness.qual.Nullable; 026 027/** 028 * Static utility methods pertaining to {@link Escaper} instances. 029 * 030 * @author Sven Mawson 031 * @author David Beaumont 032 * @since 15.0 033 */ 034@Beta 035@GwtCompatible 036@ElementTypesAreNonnullByDefault 037public final class Escapers { 038 private Escapers() {} 039 040 /** 041 * Returns an {@link Escaper} that does no escaping, passing all character data through unchanged. 042 */ 043 public static Escaper nullEscaper() { 044 return NULL_ESCAPER; 045 } 046 047 // An Escaper that efficiently performs no escaping. 048 // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier. 049 private static final Escaper NULL_ESCAPER = 050 new CharEscaper() { 051 @Override 052 public String escape(String string) { 053 return checkNotNull(string); 054 } 055 056 @Override 057 @CheckForNull 058 protected char[] escape(char c) { 059 // TODO: Fix tests not to call this directly and make it throw an error. 060 return null; 061 } 062 }; 063 064 /** 065 * Returns a builder for creating simple, fast escapers. A builder instance can be reused and each 066 * escaper that is created will be a snapshot of the current builder state. Builders are not 067 * thread safe. 068 * 069 * <p>The initial state of the builder is such that: 070 * 071 * <ul> 072 * <li>There are no replacement mappings 073 * <li>{@code safeMin == Character.MIN_VALUE} 074 * <li>{@code safeMax == Character.MAX_VALUE} 075 * <li>{@code unsafeReplacement == null} 076 * </ul> 077 * 078 * <p>For performance reasons escapers created by this builder are not Unicode aware and will not 079 * validate the well-formedness of their input. 080 */ 081 public static Builder builder() { 082 return new Builder(); 083 } 084 085 /** 086 * A builder for simple, fast escapers. 087 * 088 * <p>Typically an escaper needs to deal with the escaping of high valued characters or code 089 * points. In these cases it is necessary to extend either {@link ArrayBasedCharEscaper} or {@link 090 * ArrayBasedUnicodeEscaper} to provide the desired behavior. However this builder is suitable for 091 * creating escapers that replace a relative small set of characters. 092 * 093 * @author David Beaumont 094 * @since 15.0 095 */ 096 @Beta 097 public static final class Builder { 098 private final Map<Character, String> replacementMap = new HashMap<>(); 099 private char safeMin = Character.MIN_VALUE; 100 private char safeMax = Character.MAX_VALUE; 101 @CheckForNull private String unsafeReplacement = null; 102 103 // The constructor is exposed via the builder() method above. 104 private Builder() {} 105 106 /** 107 * Sets the safe range of characters for the escaper. Characters in this range that have no 108 * explicit replacement are considered 'safe' and remain unescaped in the output. If {@code 109 * safeMax < safeMin} then the safe range is empty. 110 * 111 * @param safeMin the lowest 'safe' character 112 * @param safeMax the highest 'safe' character 113 * @return the builder instance 114 */ 115 @CanIgnoreReturnValue 116 public Builder setSafeRange(char safeMin, char safeMax) { 117 this.safeMin = safeMin; 118 this.safeMax = safeMax; 119 return this; 120 } 121 122 /** 123 * Sets the replacement string for any characters outside the 'safe' range that have no explicit 124 * replacement. If {@code unsafeReplacement} is {@code null} then no replacement will occur, if 125 * it is {@code ""} then the unsafe characters are removed from the output. 126 * 127 * @param unsafeReplacement the string to replace unsafe characters 128 * @return the builder instance 129 */ 130 @CanIgnoreReturnValue 131 public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) { 132 this.unsafeReplacement = unsafeReplacement; 133 return this; 134 } 135 136 /** 137 * Adds a replacement string for the given input character. The specified character will be 138 * replaced by the given string whenever it occurs in the input, irrespective of whether it lies 139 * inside or outside the 'safe' range. 140 * 141 * @param c the character to be replaced 142 * @param replacement the string to replace the given character 143 * @return the builder instance 144 * @throws NullPointerException if {@code replacement} is null 145 */ 146 @CanIgnoreReturnValue 147 public Builder addEscape(char c, String replacement) { 148 checkNotNull(replacement); 149 // This can replace an existing character (the builder is re-usable). 150 replacementMap.put(c, replacement); 151 return this; 152 } 153 154 /** Returns a new escaper based on the current state of the builder. */ 155 public Escaper build() { 156 return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) { 157 @CheckForNull 158 private final char[] replacementChars = 159 unsafeReplacement != null ? unsafeReplacement.toCharArray() : null; 160 161 @Override 162 @CheckForNull 163 protected char[] escapeUnsafe(char c) { 164 return replacementChars; 165 } 166 }; 167 } 168 } 169 170 /** 171 * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance. If the escaper is 172 * already a UnicodeEscaper then it is simply returned, otherwise it is wrapped in a 173 * UnicodeEscaper. 174 * 175 * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires extra behavior with 176 * respect to the well-formedness of Unicode character sequences and will throw {@link 177 * IllegalArgumentException} when given bad input. 178 * 179 * @param escaper the instance to be wrapped 180 * @return a UnicodeEscaper with the same behavior as the given instance 181 * @throws NullPointerException if escaper is null 182 * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a CharEscaper 183 */ 184 static UnicodeEscaper asUnicodeEscaper(Escaper escaper) { 185 checkNotNull(escaper); 186 if (escaper instanceof UnicodeEscaper) { 187 return (UnicodeEscaper) escaper; 188 } else if (escaper instanceof CharEscaper) { 189 return wrap((CharEscaper) escaper); 190 } 191 // In practice this shouldn't happen because it would be very odd not to 192 // extend either CharEscaper or UnicodeEscaper for non trivial cases. 193 throw new IllegalArgumentException( 194 "Cannot create a UnicodeEscaper from: " + escaper.getClass().getName()); 195 } 196 197 /** 198 * Returns a string that would replace the given character in the specified escaper, or {@code 199 * null} if no replacement should be made. This method is intended for use in tests through the 200 * {@code EscaperAsserts} class; production users of {@link CharEscaper} should limit themselves 201 * to its public interface. 202 * 203 * @param c the character to escape if necessary 204 * @return the replacement string, or {@code null} if no escaping was needed 205 */ 206 @CheckForNull 207 public static String computeReplacement(CharEscaper escaper, char c) { 208 return stringOrNull(escaper.escape(c)); 209 } 210 211 /** 212 * Returns a string that would replace the given character in the specified escaper, or {@code 213 * null} if no replacement should be made. This method is intended for use in tests through the 214 * {@code EscaperAsserts} class; production users of {@link UnicodeEscaper} should limit 215 * themselves to its public interface. 216 * 217 * @param cp the Unicode code point to escape if necessary 218 * @return the replacement string, or {@code null} if no escaping was needed 219 */ 220 @CheckForNull 221 public static String computeReplacement(UnicodeEscaper escaper, int cp) { 222 return stringOrNull(escaper.escape(cp)); 223 } 224 225 @CheckForNull 226 private static String stringOrNull(@CheckForNull char[] in) { 227 return (in == null) ? null : new String(in); 228 } 229 230 /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */ 231 private static UnicodeEscaper wrap(final CharEscaper escaper) { 232 return new UnicodeEscaper() { 233 @Override 234 @CheckForNull 235 protected char[] escape(int cp) { 236 // If a code point maps to a single character, just escape that. 237 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 238 return escaper.escape((char) cp); 239 } 240 // Convert the code point to a surrogate pair and escape them both. 241 // Note: This code path is horribly slow and typically allocates 4 new 242 // char[] each time it is invoked. However this avoids any 243 // synchronization issues and makes the escaper thread safe. 244 char[] surrogateChars = new char[2]; 245 Character.toChars(cp, surrogateChars, 0); 246 char[] hiChars = escaper.escape(surrogateChars[0]); 247 char[] loChars = escaper.escape(surrogateChars[1]); 248 249 // If either hiChars or lowChars are non-null, the CharEscaper is trying 250 // to escape the characters of a surrogate pair separately. This is 251 // uncommon and applies only to escapers that assume UCS-2 rather than 252 // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2 253 if (hiChars == null && loChars == null) { 254 // We expect this to be the common code path for most escapers. 255 return null; 256 } 257 // Combine the characters and/or escaped sequences into a single array. 258 int hiCount = hiChars != null ? hiChars.length : 1; 259 int loCount = loChars != null ? loChars.length : 1; 260 char[] output = new char[hiCount + loCount]; 261 if (hiChars != null) { 262 // TODO: Is this faster than System.arraycopy() for small arrays? 263 for (int n = 0; n < hiChars.length; ++n) { 264 output[n] = hiChars[n]; 265 } 266 } else { 267 output[0] = surrogateChars[0]; 268 } 269 if (loChars != null) { 270 for (int n = 0; n < loChars.length; ++n) { 271 output[hiCount + n] = loChars[n]; 272 } 273 } else { 274 output[hiCount] = surrogateChars[1]; 275 } 276 return output; 277 } 278 }; 279 } 280}