001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.escape;
018
019import static com.google.common.base.Preconditions.checkNotNull;
020
021import com.google.common.annotations.Beta;
022import com.google.common.annotations.GwtCompatible;
023
024import java.util.HashMap;
025import java.util.Map;
026
027import javax.annotation.Nullable;
028
029/**
030 * Static utility methods pertaining to {@link Escaper} instances.
031 *
032 * @author Sven Mawson
033 * @author David Beaumont
034 * @since 15.0
035 */
036@Beta
037@GwtCompatible
038public final class Escapers {
039  private Escapers() {}
040
041  /**
042   * Returns an {@link Escaper} that does no escaping, passing all character
043   * data through unchanged.
044   */
045  public static Escaper nullEscaper() {
046    return NULL_ESCAPER;
047  }
048
049  // An Escaper that efficiently performs no escaping.
050  // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
051  private static final Escaper NULL_ESCAPER = new CharEscaper() {
052    @Override public String escape(String string) {
053      return checkNotNull(string);
054    }
055
056    @Override protected char[] escape(char c) {
057      // TODO: Fix tests not to call this directly and make it throw an error.
058      return null;
059    }
060  };
061
062  /**
063   * Returns a builder for creating simple, fast escapers. A builder instance
064   * can be reused and each escaper that is created will be a snapshot of the
065   * current builder state. Builders are not thread safe.
066   *
067   * <p>The initial state of the builder is such that:
068   * <ul>
069   * <li>There are no replacement mappings<li>
070   * <li>{@code safeMin == Character.MIN_VALUE}</li>
071   * <li>{@code safeMax == Character.MAX_VALUE}</li>
072   * <li>{@code unsafeReplacement == null}</li>
073   * </ul>
074   * <p>For performance reasons escapers created by this builder are not
075   * Unicode aware and will not validate the well-formedness of their input.
076   */
077  public static Builder builder() {
078    return new Builder();
079  }
080
081  /**
082   * A builder for simple, fast escapers.
083   *
084   * <p>Typically an escaper needs to deal with the escaping of high valued
085   * characters or code points. In these cases it is necessary to extend either
086   * {@link ArrayBasedCharEscaper} or {@link ArrayBasedUnicodeEscaper} to
087   * provide the desired behavior. However this builder is suitable for creating
088   * escapers that replace a relative small set of characters.
089   *
090   * @author David Beaumont
091   * @since 15.0
092   */
093  @Beta
094  public static final class Builder {
095    private final Map<Character, String> replacementMap =
096        new HashMap<Character, String>();
097    private char safeMin = Character.MIN_VALUE;
098    private char safeMax = Character.MAX_VALUE;
099    private String unsafeReplacement = null;
100
101    // The constructor is exposed via the builder() method above.
102    private Builder() {}
103
104    /**
105     * Sets the safe range of characters for the escaper. Characters in this
106     * range that have no explicit replacement are considered 'safe' and remain
107     * unescaped in the output. If {@code safeMax < safeMin} then the safe range
108     * is empty.
109     *
110     * @param safeMin the lowest 'safe' character
111     * @param safeMax the highest 'safe' character
112     * @return the builder instance
113     */
114    public Builder setSafeRange(char safeMin, char safeMax) {
115      this.safeMin = safeMin;
116      this.safeMax = safeMax;
117      return this;
118    }
119
120    /**
121     * Sets the replacement string for any characters outside the 'safe' range
122     * that have no explicit replacement. If {@code unsafeReplacement} is
123     * {@code null} then no replacement will occur, if it is {@code ""} then
124     * the unsafe characters are removed from the output.
125     *
126     * @param unsafeReplacement the string to replace unsafe chracters
127     * @return the builder instance
128     */
129    public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
130      this.unsafeReplacement = unsafeReplacement;
131      return this;
132    }
133
134    /**
135     * Adds a replacement string for the given input character. The specified
136     * character will be replaced by the given string whenever it occurs in the
137     * input, irrespective of whether it lies inside or outside the 'safe'
138     * range.
139     *
140     * @param c the character to be replaced
141     * @param replacement the string to replace the given character
142     * @return the builder instance
143     * @throws NullPointerException if {@code replacement} is null
144     */
145    public Builder addEscape(char c, String replacement) {
146      checkNotNull(replacement);
147      // This can replace an existing character (the builder is re-usable).
148      replacementMap.put(c, replacement);
149      return this;
150    }
151
152    /**
153     * Returns a new escaper based on the current state of the builder.
154     */
155    public Escaper build() {
156      return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
157        private final char[] replacementChars =
158            unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;
159        @Override protected char[] escapeUnsafe(char c) {
160          return replacementChars;
161        }
162      };
163    }
164  }
165
166  /**
167   * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance.
168   * If the escaper is already a UnicodeEscaper then it is simply returned,
169   * otherwise it is wrapped in a UnicodeEscaper.
170   *
171   * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires
172   * extra behavior with respect to the well-formedness of Unicode character
173   * sequences and will throw {@link IllegalArgumentException} when given bad
174   * input.
175   *
176   * @param escaper the instance to be wrapped
177   * @return a UnicodeEscaper with the same behavior as the given instance
178   * @throws NullPointerException if escaper is null
179   * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a
180   *         CharEscaper
181   */
182  static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
183    checkNotNull(escaper);
184    if (escaper instanceof UnicodeEscaper) {
185      return (UnicodeEscaper) escaper;
186    } else if (escaper instanceof CharEscaper) {
187      return wrap((CharEscaper) escaper);
188    }
189    // In practice this shouldn't happen because it would be very odd not to
190    // extend either CharEscaper or UnicodeEscaper for non trivial cases.
191    throw new IllegalArgumentException("Cannot create a UnicodeEscaper from: " +
192        escaper.getClass().getName());
193  }
194
195  /**
196   * Returns a string that would replace the given character in the specified
197   * escaper, or {@code null} if no replacement should be made. This method is
198   * intended for use in tests through the {@code EscaperAsserts} class;
199   * production users of {@link CharEscaper} should limit themselves to its
200   * public interface.
201   *
202   * @param c the character to escape if necessary
203   * @return the replacement string, or {@code null} if no escaping was needed
204   */
205  public static String computeReplacement(CharEscaper escaper, char c) {
206    return stringOrNull(escaper.escape(c));
207  }
208
209  /**
210   * Returns a string that would replace the given character in the specified
211   * escaper, or {@code null} if no replacement should be made. This method is
212   * intended for use in tests through the {@code EscaperAsserts} class;
213   * production users of {@link UnicodeEscaper} should limit themselves to its
214   * public interface.
215   *
216   * @param cp the Unicode code point to escape if necessary
217   * @return the replacement string, or {@code null} if no escaping was needed
218   */
219  public static String computeReplacement(UnicodeEscaper escaper, int cp) {
220    return stringOrNull(escaper.escape(cp));
221  }
222
223  private static String stringOrNull(char[] in) {
224    return (in == null) ? null : new String(in);
225  }
226
227  /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
228  private static UnicodeEscaper wrap(final CharEscaper escaper) {
229    return new UnicodeEscaper() {
230      @Override protected char[] escape(int cp) {
231        // If a code point maps to a single character, just escape that.
232        if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
233          return escaper.escape((char) cp);
234        }
235        // Convert the code point to a surrogate pair and escape them both.
236        // Note: This code path is horribly slow and typically allocates 4 new
237        // char[] each time it is invoked. However this avoids any
238        // synchronization issues and makes the escaper thread safe.
239        char[] surrogateChars = new char[2];
240        Character.toChars(cp, surrogateChars, 0);
241        char[] hiChars = escaper.escape(surrogateChars[0]);
242        char[] loChars = escaper.escape(surrogateChars[1]);
243
244        // If either hiChars or lowChars are non-null, the CharEscaper is trying
245        // to escape the characters of a surrogate pair separately. This is
246        // uncommon and applies only to escapers that assume UCS-2 rather than
247        // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
248        if (hiChars == null && loChars == null) {
249          // We expect this to be the common code path for most escapers.
250          return null;
251        }
252        // Combine the characters and/or escaped sequences into a single array.
253        int hiCount = hiChars != null ? hiChars.length : 1;
254        int loCount = loChars != null ? loChars.length : 1;
255        char[] output = new char[hiCount + loCount];
256        if (hiChars != null) {
257          // TODO: Is this faster than System.arraycopy() for small arrays?
258          for (int n = 0; n < hiChars.length; ++n) {
259            output[n] = hiChars[n];
260          }
261        } else {
262          output[0] = surrogateChars[0];
263        }
264        if (loChars != null) {
265          for (int n = 0; n < loChars.length; ++n) {
266            output[hiCount + n] = loChars[n];
267          }
268        } else {
269          output[hiCount] = surrogateChars[1];
270        }
271        return output;
272      }
273    };
274  }
275}