001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.escape;
016
017import static com.google.common.base.Preconditions.checkNotNull;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.GwtCompatible;
021import com.google.errorprone.annotations.CanIgnoreReturnValue;
022import java.util.HashMap;
023import java.util.Map;
024import javax.annotation.Nullable;
025
026/**
027 * Static utility methods pertaining to {@link Escaper} instances.
028 *
029 * @author Sven Mawson
030 * @author David Beaumont
031 * @since 15.0
032 */
033@Beta
034@GwtCompatible
035public final class Escapers {
036  private Escapers() {}
037
038  /**
039   * Returns an {@link Escaper} that does no escaping, passing all character data through unchanged.
040   */
041  public static Escaper nullEscaper() {
042    return NULL_ESCAPER;
043  }
044
045  // An Escaper that efficiently performs no escaping.
046  // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
047  private static final Escaper NULL_ESCAPER =
048      new CharEscaper() {
049        @Override
050        public String escape(String string) {
051          return checkNotNull(string);
052        }
053
054        @Override
055        protected char[] escape(char c) {
056          // TODO: Fix tests not to call this directly and make it throw an error.
057          return null;
058        }
059      };
060
061  /**
062   * Returns a builder for creating simple, fast escapers. A builder instance can be reused and each
063   * escaper that is created will be a snapshot of the current builder state. Builders are not
064   * thread safe.
065   *
066   * <p>The initial state of the builder is such that:
067   * <ul>
068   * <li>There are no replacement mappings
069   * <li>{@code safeMin == Character.MIN_VALUE}
070   * <li>{@code safeMax == Character.MAX_VALUE}
071   * <li>{@code unsafeReplacement == null}
072   * </ul>
073   * <p>For performance reasons escapers created by this builder are not Unicode aware and will not
074   * validate the well-formedness of their input.
075   */
076  public static Builder builder() {
077    return new Builder();
078  }
079
080  /**
081   * A builder for simple, fast escapers.
082   *
083   * <p>Typically an escaper needs to deal with the escaping of high valued characters or code
084   * points. In these cases it is necessary to extend either {@link ArrayBasedCharEscaper} or
085   * {@link ArrayBasedUnicodeEscaper} to provide the desired behavior. However this builder is
086   * suitable for creating escapers that replace a relative small set of characters.
087   *
088   * @author David Beaumont
089   * @since 15.0
090   */
091  @Beta
092  public static final class Builder {
093    private final Map<Character, String> replacementMap = new HashMap<Character, String>();
094    private char safeMin = Character.MIN_VALUE;
095    private char safeMax = Character.MAX_VALUE;
096    private String unsafeReplacement = null;
097
098    // The constructor is exposed via the builder() method above.
099    private Builder() {}
100
101    /**
102     * Sets the safe range of characters for the escaper. Characters in this range that have no
103     * explicit replacement are considered 'safe' and remain unescaped in the output. If
104     * {@code safeMax < safeMin} then the safe range is empty.
105     *
106     * @param safeMin the lowest 'safe' character
107     * @param safeMax the highest 'safe' character
108     * @return the builder instance
109     */
110    @CanIgnoreReturnValue
111    public Builder setSafeRange(char safeMin, char safeMax) {
112      this.safeMin = safeMin;
113      this.safeMax = safeMax;
114      return this;
115    }
116
117    /**
118     * Sets the replacement string for any characters outside the 'safe' range that have no explicit
119     * replacement. If {@code unsafeReplacement} is {@code null} then no replacement will occur, if
120     * it is {@code ""} then the unsafe characters are removed from the output.
121     *
122     * @param unsafeReplacement the string to replace unsafe characters
123     * @return the builder instance
124     */
125    @CanIgnoreReturnValue
126    public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
127      this.unsafeReplacement = unsafeReplacement;
128      return this;
129    }
130
131    /**
132     * Adds a replacement string for the given input character. The specified character will be
133     * replaced by the given string whenever it occurs in the input, irrespective of whether it lies
134     * inside or outside the 'safe' range.
135     *
136     * @param c the character to be replaced
137     * @param replacement the string to replace the given character
138     * @return the builder instance
139     * @throws NullPointerException if {@code replacement} is null
140     */
141    @CanIgnoreReturnValue
142    public Builder addEscape(char c, String replacement) {
143      checkNotNull(replacement);
144      // This can replace an existing character (the builder is re-usable).
145      replacementMap.put(c, replacement);
146      return this;
147    }
148
149    /**
150     * Returns a new escaper based on the current state of the builder.
151     */
152    public Escaper build() {
153      return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
154        private final char[] replacementChars =
155            unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;
156
157        @Override
158        protected char[] escapeUnsafe(char c) {
159          return replacementChars;
160        }
161      };
162    }
163  }
164
165  /**
166   * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance. If the escaper is
167   * already a UnicodeEscaper then it is simply returned, otherwise it is wrapped in a
168   * UnicodeEscaper.
169   *
170   * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires extra behavior with
171   * respect to the well-formedness of Unicode character sequences and will throw
172   * {@link IllegalArgumentException} when given bad input.
173   *
174   * @param escaper the instance to be wrapped
175   * @return a UnicodeEscaper with the same behavior as the given instance
176   * @throws NullPointerException if escaper is null
177   * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a CharEscaper
178   */
179  static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
180    checkNotNull(escaper);
181    if (escaper instanceof UnicodeEscaper) {
182      return (UnicodeEscaper) escaper;
183    } else if (escaper instanceof CharEscaper) {
184      return wrap((CharEscaper) escaper);
185    }
186    // In practice this shouldn't happen because it would be very odd not to
187    // extend either CharEscaper or UnicodeEscaper for non trivial cases.
188    throw new IllegalArgumentException(
189        "Cannot create a UnicodeEscaper from: " + escaper.getClass().getName());
190  }
191
192  /**
193   * Returns a string that would replace the given character in the specified escaper, or
194   * {@code null} if no replacement should be made. This method is intended for use in tests through
195   * the {@code EscaperAsserts} class; production users of {@link CharEscaper} should limit
196   * themselves to its public interface.
197   *
198   * @param c the character to escape if necessary
199   * @return the replacement string, or {@code null} if no escaping was needed
200   */
201  public static String computeReplacement(CharEscaper escaper, char c) {
202    return stringOrNull(escaper.escape(c));
203  }
204
205  /**
206   * Returns a string that would replace the given character in the specified escaper, or
207   * {@code null} if no replacement should be made. This method is intended for use in tests through
208   * the {@code EscaperAsserts} class; production users of {@link UnicodeEscaper} should limit
209   * themselves to its public interface.
210   *
211   * @param cp the Unicode code point to escape if necessary
212   * @return the replacement string, or {@code null} if no escaping was needed
213   */
214  public static String computeReplacement(UnicodeEscaper escaper, int cp) {
215    return stringOrNull(escaper.escape(cp));
216  }
217
218  private static String stringOrNull(char[] in) {
219    return (in == null) ? null : new String(in);
220  }
221
222  /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
223  private static UnicodeEscaper wrap(final CharEscaper escaper) {
224    return new UnicodeEscaper() {
225      @Override
226      protected char[] escape(int cp) {
227        // If a code point maps to a single character, just escape that.
228        if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
229          return escaper.escape((char) cp);
230        }
231        // Convert the code point to a surrogate pair and escape them both.
232        // Note: This code path is horribly slow and typically allocates 4 new
233        // char[] each time it is invoked. However this avoids any
234        // synchronization issues and makes the escaper thread safe.
235        char[] surrogateChars = new char[2];
236        Character.toChars(cp, surrogateChars, 0);
237        char[] hiChars = escaper.escape(surrogateChars[0]);
238        char[] loChars = escaper.escape(surrogateChars[1]);
239
240        // If either hiChars or lowChars are non-null, the CharEscaper is trying
241        // to escape the characters of a surrogate pair separately. This is
242        // uncommon and applies only to escapers that assume UCS-2 rather than
243        // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
244        if (hiChars == null && loChars == null) {
245          // We expect this to be the common code path for most escapers.
246          return null;
247        }
248        // Combine the characters and/or escaped sequences into a single array.
249        int hiCount = hiChars != null ? hiChars.length : 1;
250        int loCount = loChars != null ? loChars.length : 1;
251        char[] output = new char[hiCount + loCount];
252        if (hiChars != null) {
253          // TODO: Is this faster than System.arraycopy() for small arrays?
254          for (int n = 0; n < hiChars.length; ++n) {
255            output[n] = hiChars[n];
256          }
257        } else {
258          output[0] = surrogateChars[0];
259        }
260        if (loChars != null) {
261          for (int n = 0; n < loChars.length; ++n) {
262            output[hiCount + n] = loChars[n];
263          }
264        } else {
265          output[hiCount] = surrogateChars[1];
266        }
267        return output;
268      }
269    };
270  }
271}