001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.escape;
016
017import static com.google.common.base.Preconditions.checkNotNull;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.GwtCompatible;
021import com.google.errorprone.annotations.CanIgnoreReturnValue;
022import java.util.HashMap;
023import java.util.Map;
024import javax.annotation.CheckForNull;
025import org.checkerframework.checker.nullness.qual.Nullable;
026
027/**
028 * Static utility methods pertaining to {@link Escaper} instances.
029 *
030 * @author Sven Mawson
031 * @author David Beaumont
032 * @since 15.0
033 */
034@Beta
035@GwtCompatible
036@ElementTypesAreNonnullByDefault
037public final class Escapers {
038  private Escapers() {}
039
040  /**
041   * Returns an {@link Escaper} that does no escaping, passing all character data through unchanged.
042   */
043  public static Escaper nullEscaper() {
044    return NULL_ESCAPER;
045  }
046
047  // An Escaper that efficiently performs no escaping.
048  // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
049  private static final Escaper NULL_ESCAPER =
050      new CharEscaper() {
051        @Override
052        public String escape(String string) {
053          return checkNotNull(string);
054        }
055
056        @Override
057        @CheckForNull
058        protected char[] escape(char c) {
059          // TODO: Fix tests not to call this directly and make it throw an error.
060          return null;
061        }
062      };
063
064  /**
065   * Returns a builder for creating simple, fast escapers. A builder instance can be reused and each
066   * escaper that is created will be a snapshot of the current builder state. Builders are not
067   * thread safe.
068   *
069   * <p>The initial state of the builder is such that:
070   *
071   * <ul>
072   *   <li>There are no replacement mappings
073   *   <li>{@code safeMin == Character.MIN_VALUE}
074   *   <li>{@code safeMax == Character.MAX_VALUE}
075   *   <li>{@code unsafeReplacement == null}
076   * </ul>
077   *
078   * <p>For performance reasons escapers created by this builder are not Unicode aware and will not
079   * validate the well-formedness of their input.
080   */
081  public static Builder builder() {
082    return new Builder();
083  }
084
085  /**
086   * A builder for simple, fast escapers.
087   *
088   * <p>Typically an escaper needs to deal with the escaping of high valued characters or code
089   * points. In these cases it is necessary to extend either {@link ArrayBasedCharEscaper} or {@link
090   * ArrayBasedUnicodeEscaper} to provide the desired behavior. However this builder is suitable for
091   * creating escapers that replace a relative small set of characters.
092   *
093   * @author David Beaumont
094   * @since 15.0
095   */
096  @Beta
097  public static final class Builder {
098    private final Map<Character, String> replacementMap = new HashMap<>();
099    private char safeMin = Character.MIN_VALUE;
100    private char safeMax = Character.MAX_VALUE;
101    @CheckForNull private String unsafeReplacement = null;
102
103    // The constructor is exposed via the builder() method above.
104    private Builder() {}
105
106    /**
107     * Sets the safe range of characters for the escaper. Characters in this range that have no
108     * explicit replacement are considered 'safe' and remain unescaped in the output. If {@code
109     * safeMax < safeMin} then the safe range is empty.
110     *
111     * @param safeMin the lowest 'safe' character
112     * @param safeMax the highest 'safe' character
113     * @return the builder instance
114     */
115    @CanIgnoreReturnValue
116    public Builder setSafeRange(char safeMin, char safeMax) {
117      this.safeMin = safeMin;
118      this.safeMax = safeMax;
119      return this;
120    }
121
122    /**
123     * Sets the replacement string for any characters outside the 'safe' range that have no explicit
124     * replacement. If {@code unsafeReplacement} is {@code null} then no replacement will occur, if
125     * it is {@code ""} then the unsafe characters are removed from the output.
126     *
127     * @param unsafeReplacement the string to replace unsafe characters
128     * @return the builder instance
129     */
130    @CanIgnoreReturnValue
131    public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
132      this.unsafeReplacement = unsafeReplacement;
133      return this;
134    }
135
136    /**
137     * Adds a replacement string for the given input character. The specified character will be
138     * replaced by the given string whenever it occurs in the input, irrespective of whether it lies
139     * inside or outside the 'safe' range.
140     *
141     * @param c the character to be replaced
142     * @param replacement the string to replace the given character
143     * @return the builder instance
144     * @throws NullPointerException if {@code replacement} is null
145     */
146    @CanIgnoreReturnValue
147    public Builder addEscape(char c, String replacement) {
148      checkNotNull(replacement);
149      // This can replace an existing character (the builder is re-usable).
150      replacementMap.put(c, replacement);
151      return this;
152    }
153
154    /** Returns a new escaper based on the current state of the builder. */
155    public Escaper build() {
156      return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
157        @CheckForNull
158        private final char[] replacementChars =
159            unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;
160
161        @Override
162        @CheckForNull
163        protected char[] escapeUnsafe(char c) {
164          return replacementChars;
165        }
166      };
167    }
168  }
169
170  /**
171   * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance. If the escaper is
172   * already a UnicodeEscaper then it is simply returned, otherwise it is wrapped in a
173   * UnicodeEscaper.
174   *
175   * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires extra behavior with
176   * respect to the well-formedness of Unicode character sequences and will throw {@link
177   * IllegalArgumentException} when given bad input.
178   *
179   * @param escaper the instance to be wrapped
180   * @return a UnicodeEscaper with the same behavior as the given instance
181   * @throws NullPointerException if escaper is null
182   * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a CharEscaper
183   */
184  static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
185    checkNotNull(escaper);
186    if (escaper instanceof UnicodeEscaper) {
187      return (UnicodeEscaper) escaper;
188    } else if (escaper instanceof CharEscaper) {
189      return wrap((CharEscaper) escaper);
190    }
191    // In practice this shouldn't happen because it would be very odd not to
192    // extend either CharEscaper or UnicodeEscaper for non trivial cases.
193    throw new IllegalArgumentException(
194        "Cannot create a UnicodeEscaper from: " + escaper.getClass().getName());
195  }
196
197  /**
198   * Returns a string that would replace the given character in the specified escaper, or {@code
199   * null} if no replacement should be made. This method is intended for use in tests through the
200   * {@code EscaperAsserts} class; production users of {@link CharEscaper} should limit themselves
201   * to its public interface.
202   *
203   * @param c the character to escape if necessary
204   * @return the replacement string, or {@code null} if no escaping was needed
205   */
206  @CheckForNull
207  public static String computeReplacement(CharEscaper escaper, char c) {
208    return stringOrNull(escaper.escape(c));
209  }
210
211  /**
212   * Returns a string that would replace the given character in the specified escaper, or {@code
213   * null} if no replacement should be made. This method is intended for use in tests through the
214   * {@code EscaperAsserts} class; production users of {@link UnicodeEscaper} should limit
215   * themselves to its public interface.
216   *
217   * @param cp the Unicode code point to escape if necessary
218   * @return the replacement string, or {@code null} if no escaping was needed
219   */
220  @CheckForNull
221  public static String computeReplacement(UnicodeEscaper escaper, int cp) {
222    return stringOrNull(escaper.escape(cp));
223  }
224
225  @CheckForNull
226  private static String stringOrNull(@CheckForNull char[] in) {
227    return (in == null) ? null : new String(in);
228  }
229
230  /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
231  private static UnicodeEscaper wrap(final CharEscaper escaper) {
232    return new UnicodeEscaper() {
233      @Override
234      @CheckForNull
235      protected char[] escape(int cp) {
236        // If a code point maps to a single character, just escape that.
237        if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
238          return escaper.escape((char) cp);
239        }
240        // Convert the code point to a surrogate pair and escape them both.
241        // Note: This code path is horribly slow and typically allocates 4 new
242        // char[] each time it is invoked. However this avoids any
243        // synchronization issues and makes the escaper thread safe.
244        char[] surrogateChars = new char[2];
245        Character.toChars(cp, surrogateChars, 0);
246        char[] hiChars = escaper.escape(surrogateChars[0]);
247        char[] loChars = escaper.escape(surrogateChars[1]);
248
249        // If either hiChars or lowChars are non-null, the CharEscaper is trying
250        // to escape the characters of a surrogate pair separately. This is
251        // uncommon and applies only to escapers that assume UCS-2 rather than
252        // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
253        if (hiChars == null && loChars == null) {
254          // We expect this to be the common code path for most escapers.
255          return null;
256        }
257        // Combine the characters and/or escaped sequences into a single array.
258        int hiCount = hiChars != null ? hiChars.length : 1;
259        int loCount = loChars != null ? loChars.length : 1;
260        char[] output = new char[hiCount + loCount];
261        if (hiChars != null) {
262          // TODO: Is this faster than System.arraycopy() for small arrays?
263          for (int n = 0; n < hiChars.length; ++n) {
264            output[n] = hiChars[n];
265          }
266        } else {
267          output[0] = surrogateChars[0];
268        }
269        if (loChars != null) {
270          for (int n = 0; n < loChars.length; ++n) {
271            output[hiCount + n] = loChars[n];
272          }
273        } else {
274          output[hiCount] = surrogateChars[1];
275        }
276        return output;
277      }
278    };
279  }
280}