001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.net;
016
017import static com.google.common.base.Preconditions.checkNotNull;
018
019import com.google.common.annotations.GwtCompatible;
020import com.google.common.escape.UnicodeEscaper;
021import javax.annotation.CheckForNull;
022
023/**
024 * A {@code UnicodeEscaper} that escapes some set of Java characters using a UTF-8 based percent
025 * encoding scheme. The set of safe characters (those which remain unescaped) can be specified on
026 * construction.
027 *
028 * <p>This class is primarily used for creating URI escapers in {@link UrlEscapers} but can be used
029 * directly if required. While URI escapers impose specific semantics on which characters are
030 * considered 'safe', this class has a minimal set of restrictions.
031 *
032 * <p>When escaping a String, the following rules apply:
033 *
034 * <ul>
035 *   <li>All specified safe characters remain unchanged.
036 *   <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus
037 *       sign {@code "+"}.
038 *   <li>All other characters are converted into one or more bytes using UTF-8 encoding and each
039 *       byte is then represented by the 3-character string "%XX", where "XX" is the two-digit,
040 *       uppercase, hexadecimal representation of the byte value.
041 * </ul>
042 *
043 * <p>For performance reasons the only currently supported character encoding of this class is
044 * UTF-8.
045 *
046 * <p><b>Note:</b> This escaper produces <a
047 * href="https://url.spec.whatwg.org/#percent-encode">uppercase</a> hexadecimal sequences.
048 *
049 * @author David Beaumont
050 * @since 15.0
051 */
052@GwtCompatible
053@ElementTypesAreNonnullByDefault
054public final class PercentEscaper extends UnicodeEscaper {
055
056  // In some escapers spaces are escaped to '+'
057  private static final char[] PLUS_SIGN = {'+'};
058
059  // Percent escapers output upper case hex digits (uri escapers require this).
060  private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();
061
062  /** If true we should convert space to the {@code +} character. */
063  private final boolean plusForSpace;
064
065  /**
066   * An array of flags where for any {@code char c} if {@code safeOctets[c]} is true then {@code c}
067   * should remain unmodified in the output. If {@code c >= safeOctets.length} then it should be
068   * escaped.
069   */
070  private final boolean[] safeOctets;
071
072  /**
073   * Constructs a percent escaper with the specified safe characters and optional handling of the
074   * space character.
075   *
076   * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} as a safe
077   * character. This has the effect of creating an escaper which has no well-defined inverse but it
078   * can be useful when escaping additional characters.
079   *
080   * @param safeChars a non-null string specifying additional safe characters for this escaper (the
081   *     ranges 0..9, a..z and A..Z are always safe and should not be specified here)
082   * @param plusForSpace true if ASCII space should be escaped to {@code +} rather than {@code %20}
083   * @throws IllegalArgumentException if any of the parameters were invalid
084   */
085  public PercentEscaper(String safeChars, boolean plusForSpace) {
086    // TODO(dbeaumont): Switch to static factory methods for creation now that class is final.
087    // TODO(dbeaumont): Support escapers where alphanumeric chars are not safe.
088    checkNotNull(safeChars); // eager for GWT.
089    // Avoid any misunderstandings about the behavior of this escaper
090    if (safeChars.matches(".*[0-9A-Za-z].*")) {
091      throw new IllegalArgumentException(
092          "Alphanumeric characters are always 'safe' and should not be explicitly specified");
093    }
094    safeChars += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
095    // Avoid ambiguous parameters. Safe characters are never modified so if
096    // space is a safe character then setting plusForSpace is meaningless.
097    if (plusForSpace && safeChars.contains(" ")) {
098      throw new IllegalArgumentException(
099          "plusForSpace cannot be specified when space is a 'safe' character");
100    }
101    this.plusForSpace = plusForSpace;
102    this.safeOctets = createSafeOctets(safeChars);
103  }
104
105  /**
106   * Creates a boolean array with entries corresponding to the character values specified in
107   * safeChars set to true. The array is as small as is required to hold the given character
108   * information.
109   */
110  private static boolean[] createSafeOctets(String safeChars) {
111    int maxChar = -1;
112    char[] safeCharArray = safeChars.toCharArray();
113    for (char c : safeCharArray) {
114      maxChar = Math.max(c, maxChar);
115    }
116    boolean[] octets = new boolean[maxChar + 1];
117    for (char c : safeCharArray) {
118      octets[c] = true;
119    }
120    return octets;
121  }
122
123  /*
124   * Overridden for performance. For unescaped strings this improved the performance of the uri
125   * escaper from ~760ns to ~400ns as measured by {@link CharEscapersBenchmark}.
126   */
127  @Override
128  protected int nextEscapeIndex(CharSequence csq, int index, int end) {
129    checkNotNull(csq);
130    for (; index < end; index++) {
131      char c = csq.charAt(index);
132      if (c >= safeOctets.length || !safeOctets[c]) {
133        break;
134      }
135    }
136    return index;
137  }
138
139  /*
140   * Overridden for performance. For unescaped strings this improved the performance of the uri
141   * escaper from ~400ns to ~170ns as measured by {@link CharEscapersBenchmark}.
142   */
143  @Override
144  public String escape(String s) {
145    checkNotNull(s);
146    int slen = s.length();
147    for (int index = 0; index < slen; index++) {
148      char c = s.charAt(index);
149      if (c >= safeOctets.length || !safeOctets[c]) {
150        return escapeSlow(s, index);
151      }
152    }
153    return s;
154  }
155
156  /** Escapes the given Unicode code point in UTF-8. */
157  @Override
158  @CheckForNull
159  protected char[] escape(int cp) {
160    // We should never get negative values here but if we do it will throw an
161    // IndexOutOfBoundsException, so at least it will get spotted.
162    if (cp < safeOctets.length && safeOctets[cp]) {
163      return null;
164    } else if (cp == ' ' && plusForSpace) {
165      return PLUS_SIGN;
166    } else if (cp <= 0x7F) {
167      // Single byte UTF-8 characters
168      // Start with "%--" and fill in the blanks
169      char[] dest = new char[3];
170      dest[0] = '%';
171      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
172      dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
173      return dest;
174    } else if (cp <= 0x7ff) {
175      // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
176      // Start with "%--%--" and fill in the blanks
177      char[] dest = new char[6];
178      dest[0] = '%';
179      dest[3] = '%';
180      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
181      cp >>>= 4;
182      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
183      cp >>>= 2;
184      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
185      cp >>>= 4;
186      dest[1] = UPPER_HEX_DIGITS[0xC | cp];
187      return dest;
188    } else if (cp <= 0xffff) {
189      // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
190      // Start with "%E-%--%--" and fill in the blanks
191      char[] dest = new char[9];
192      dest[0] = '%';
193      dest[1] = 'E';
194      dest[3] = '%';
195      dest[6] = '%';
196      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
197      cp >>>= 4;
198      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
199      cp >>>= 2;
200      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
201      cp >>>= 4;
202      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
203      cp >>>= 2;
204      dest[2] = UPPER_HEX_DIGITS[cp];
205      return dest;
206    } else if (cp <= 0x10ffff) {
207      char[] dest = new char[12];
208      // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
209      // Start with "%F-%--%--%--" and fill in the blanks
210      dest[0] = '%';
211      dest[1] = 'F';
212      dest[3] = '%';
213      dest[6] = '%';
214      dest[9] = '%';
215      dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
216      cp >>>= 4;
217      dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
218      cp >>>= 2;
219      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
220      cp >>>= 4;
221      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
222      cp >>>= 2;
223      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
224      cp >>>= 4;
225      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
226      cp >>>= 2;
227      dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
228      return dest;
229    } else {
230      // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
231      throw new IllegalArgumentException("Invalid unicode character value " + cp);
232    }
233  }
234}