001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.net;
018
019import static com.google.common.base.Preconditions.checkNotNull;
020
021import com.google.common.annotations.Beta;
022import com.google.common.annotations.GwtCompatible;
023import com.google.common.escape.UnicodeEscaper;
024
025/**
026 * A {@code UnicodeEscaper} that escapes some set of Java characters using a
027 * UTF-8 based percent encoding scheme. The set of safe characters (those which
028 * remain unescaped) can be specified on construction.
029 *
030 * <p>This class is primarily used for creating URI escapers in {@link
031 * UrlEscapers} but can be used directly if required. While URI escapers impose
032 * specific semantics on which characters are considered 'safe', this class has
033 * a minimal set of restrictions.
034 *
035 * <p>When escaping a String, the following rules apply:
036 * <ul>
037 * <li>All specified safe characters remain unchanged.
038 * <li>If {@code plusForSpace} was specified, the space character " " is
039 *     converted into a plus sign {@code "+"}.
040 * <li>All other characters are converted into one or more bytes using UTF-8
041 *     encoding and each byte is then represented by the 3-character string
042 *     "%XX", where "XX" is the two-digit, uppercase, hexadecimal representation
043 *     of the byte value.
044 * </ul>
045 *
046 * <p>For performance reasons the only currently supported character encoding of
047 * this class is UTF-8.
048 *
049 * <p><b>Note:</b> This escaper produces uppercase hexadecimal sequences. From
050 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
051 * <i>"URI producers and normalizers should use uppercase hexadecimal digits
052 * for all percent-encodings."</i>
053 *
054 * @author David Beaumont
055 * @since 15.0
056 */
057@Beta
058@GwtCompatible
059public final class PercentEscaper extends UnicodeEscaper {
060
061  // In some escapers spaces are escaped to '+'
062  private static final char[] PLUS_SIGN = { '+' };
063
064  // Percent escapers output upper case hex digits (uri escapers require this).
065  private static final char[] UPPER_HEX_DIGITS =
066      "0123456789ABCDEF".toCharArray();
067
068  /**
069   * If true we should convert space to the {@code +} character.
070   */
071  private final boolean plusForSpace;
072
073  /**
074   * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
075   * true then {@code c} should remain unmodified in the output. If
076   * {@code c > safeOctets.length} then it should be escaped.
077   */
078  private final boolean[] safeOctets;
079
080  /**
081   * Constructs a percent escaper with the specified safe characters and
082   * optional handling of the space character.
083   *
084   * <p>Not that it is allowed, but not necessarily desirable to specify {@code %}
085   * as a safe character. This has the effect of creating an escaper which has no
086   * well defined inverse but it can be useful when escaping additional characters.
087   *
088   * @param safeChars a non null string specifying additional safe characters
089   *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
090   *        should not be specified here)
091   * @param plusForSpace true if ASCII space should be escaped to {@code +}
092   *        rather than {@code %20}
093   * @throws IllegalArgumentException if any of the parameters were invalid
094   */
095  public PercentEscaper(String safeChars, boolean plusForSpace) {
096    // TODO(user): Switch to static factory methods for creation now that class is final.
097    // TODO(user): Support escapers where alphanumeric chars are not safe.
098    checkNotNull(safeChars);  // eager for GWT.
099    // Avoid any misunderstandings about the behavior of this escaper
100    if (safeChars.matches(".*[0-9A-Za-z].*")) {
101      throw new IllegalArgumentException(
102          "Alphanumeric characters are always 'safe' and should not be " +
103          "explicitly specified");
104    }
105    safeChars += "abcdefghijklmnopqrstuvwxyz" +
106                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
107                 "0123456789";
108    // Avoid ambiguous parameters. Safe characters are never modified so if
109    // space is a safe character then setting plusForSpace is meaningless.
110    if (plusForSpace && safeChars.contains(" ")) {
111      throw new IllegalArgumentException(
112          "plusForSpace cannot be specified when space is a 'safe' character");
113    }
114    this.plusForSpace = plusForSpace;
115    this.safeOctets = createSafeOctets(safeChars);
116  }
117
118  /**
119   * Creates a boolean array with entries corresponding to the character values
120   * specified in safeChars set to true. The array is as small as is required to
121   * hold the given character information.
122   */
123  private static boolean[] createSafeOctets(String safeChars) {
124    int maxChar = -1;
125    char[] safeCharArray = safeChars.toCharArray();
126    for (char c : safeCharArray) {
127      maxChar = Math.max(c, maxChar);
128    }
129    boolean[] octets = new boolean[maxChar + 1];
130    for (char c : safeCharArray) {
131      octets[c] = true;
132    }
133    return octets;
134  }
135
136  /*
137   * Overridden for performance. For unescaped strings this improved the
138   * performance of the uri escaper from ~760ns to ~400ns as measured by
139   * {@link CharEscapersBenchmark}.
140   */
141  @Override
142  protected int nextEscapeIndex(CharSequence csq, int index, int end) {
143    checkNotNull(csq);
144    for (; index < end; index++) {
145      char c = csq.charAt(index);
146      if (c >= safeOctets.length || !safeOctets[c]) {
147        break;
148      }
149    }
150    return index;
151  }
152
153  /*
154   * Overridden for performance. For unescaped strings this improved the
155   * performance of the uri escaper from ~400ns to ~170ns as measured by
156   * {@link CharEscapersBenchmark}.
157   */
158  @Override
159  public String escape(String s) {
160    checkNotNull(s);
161    int slen = s.length();
162    for (int index = 0; index < slen; index++) {
163      char c = s.charAt(index);
164      if (c >= safeOctets.length || !safeOctets[c]) {
165        return escapeSlow(s, index);
166      }
167    }
168    return s;
169  }
170
171  /**
172   * Escapes the given Unicode code point in UTF-8.
173   */
174  @Override
175  protected char[] escape(int cp) {
176    // We should never get negative values here but if we do it will throw an
177    // IndexOutOfBoundsException, so at least it will get spotted.
178    if (cp < safeOctets.length && safeOctets[cp]) {
179      return null;
180    } else if (cp == ' ' && plusForSpace) {
181      return PLUS_SIGN;
182    } else if (cp <= 0x7F) {
183      // Single byte UTF-8 characters
184      // Start with "%--" and fill in the blanks
185      char[] dest = new char[3];
186      dest[0] = '%';
187      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
188      dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
189      return dest;
190    } else if (cp <= 0x7ff) {
191      // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
192      // Start with "%--%--" and fill in the blanks
193      char[] dest = new char[6];
194      dest[0] = '%';
195      dest[3] = '%';
196      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
197      cp >>>= 4;
198      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
199      cp >>>= 2;
200      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
201      cp >>>= 4;
202      dest[1] = UPPER_HEX_DIGITS[0xC | cp];
203      return dest;
204    } else if (cp <= 0xffff) {
205      // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
206      // Start with "%E-%--%--" and fill in the blanks
207      char[] dest = new char[9];
208      dest[0] = '%';
209      dest[1] = 'E';
210      dest[3] = '%';
211      dest[6] = '%';
212      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
213      cp >>>= 4;
214      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
215      cp >>>= 2;
216      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
217      cp >>>= 4;
218      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
219      cp >>>= 2;
220      dest[2] = UPPER_HEX_DIGITS[cp];
221      return dest;
222    } else if (cp <= 0x10ffff) {
223      char[] dest = new char[12];
224      // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
225      // Start with "%F-%--%--%--" and fill in the blanks
226      dest[0] = '%';
227      dest[1] = 'F';
228      dest[3] = '%';
229      dest[6] = '%';
230      dest[9] = '%';
231      dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
232      cp >>>= 4;
233      dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
234      cp >>>= 2;
235      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
236      cp >>>= 4;
237      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
238      cp >>>= 2;
239      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
240      cp >>>= 4;
241      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
242      cp >>>= 2;
243      dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
244      return dest;
245    } else {
246      // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
247      throw new IllegalArgumentException(
248          "Invalid unicode character value " + cp);
249    }
250  }
251}