001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.net;
016
017import static com.google.common.base.Preconditions.checkNotNull;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.GwtCompatible;
021import com.google.common.escape.UnicodeEscaper;
022import javax.annotation.CheckForNull;
023
024/**
025 * A {@code UnicodeEscaper} that escapes some set of Java characters using a UTF-8 based percent
026 * encoding scheme. The set of safe characters (those which remain unescaped) can be specified on
027 * construction.
028 *
029 * <p>This class is primarily used for creating URI escapers in {@link UrlEscapers} but can be used
030 * directly if required. While URI escapers impose specific semantics on which characters are
031 * considered 'safe', this class has a minimal set of restrictions.
032 *
033 * <p>When escaping a String, the following rules apply:
034 *
035 * <ul>
036 *   <li>All specified safe characters remain unchanged.
037 *   <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus
038 *       sign {@code "+"}.
039 *   <li>All other characters are converted into one or more bytes using UTF-8 encoding and each
040 *       byte is then represented by the 3-character string "%XX", where "XX" is the two-digit,
041 *       uppercase, hexadecimal representation of the byte value.
042 * </ul>
043 *
044 * <p>For performance reasons the only currently supported character encoding of this class is
045 * UTF-8.
046 *
047 * <p><b>Note:</b> This escaper produces <a
048 * href="https://url.spec.whatwg.org/#percent-encode">uppercase</a> hexadecimal sequences.
049 *
050 * @author David Beaumont
051 * @since 15.0
052 */
053@Beta
054@GwtCompatible
055@ElementTypesAreNonnullByDefault
056public final class PercentEscaper extends UnicodeEscaper {
057
058  // In some escapers spaces are escaped to '+'
059  private static final char[] PLUS_SIGN = {'+'};
060
061  // Percent escapers output upper case hex digits (uri escapers require this).
062  private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();
063
064  /** If true we should convert space to the {@code +} character. */
065  private final boolean plusForSpace;
066
067  /**
068   * An array of flags where for any {@code char c} if {@code safeOctets[c]} is true then {@code c}
069   * should remain unmodified in the output. If {@code c >= safeOctets.length} then it should be
070   * escaped.
071   */
072  private final boolean[] safeOctets;
073
074  /**
075   * Constructs a percent escaper with the specified safe characters and optional handling of the
076   * space character.
077   *
078   * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} as a safe
079   * character. This has the effect of creating an escaper which has no well defined inverse but it
080   * can be useful when escaping additional characters.
081   *
082   * @param safeChars a non null string specifying additional safe characters for this escaper (the
083   *     ranges 0..9, a..z and A..Z are always safe and should not be specified here)
084   * @param plusForSpace true if ASCII space should be escaped to {@code +} rather than {@code %20}
085   * @throws IllegalArgumentException if any of the parameters were invalid
086   */
087  public PercentEscaper(String safeChars, boolean plusForSpace) {
088    // TODO(dbeaumont): Switch to static factory methods for creation now that class is final.
089    // TODO(dbeaumont): Support escapers where alphanumeric chars are not safe.
090    checkNotNull(safeChars); // eager for GWT.
091    // Avoid any misunderstandings about the behavior of this escaper
092    if (safeChars.matches(".*[0-9A-Za-z].*")) {
093      throw new IllegalArgumentException(
094          "Alphanumeric characters are always 'safe' and should not be explicitly specified");
095    }
096    safeChars += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
097    // Avoid ambiguous parameters. Safe characters are never modified so if
098    // space is a safe character then setting plusForSpace is meaningless.
099    if (plusForSpace && safeChars.contains(" ")) {
100      throw new IllegalArgumentException(
101          "plusForSpace cannot be specified when space is a 'safe' character");
102    }
103    this.plusForSpace = plusForSpace;
104    this.safeOctets = createSafeOctets(safeChars);
105  }
106
107  /**
108   * Creates a boolean array with entries corresponding to the character values specified in
109   * safeChars set to true. The array is as small as is required to hold the given character
110   * information.
111   */
112  private static boolean[] createSafeOctets(String safeChars) {
113    int maxChar = -1;
114    char[] safeCharArray = safeChars.toCharArray();
115    for (char c : safeCharArray) {
116      maxChar = Math.max(c, maxChar);
117    }
118    boolean[] octets = new boolean[maxChar + 1];
119    for (char c : safeCharArray) {
120      octets[c] = true;
121    }
122    return octets;
123  }
124
125  /*
126   * Overridden for performance. For unescaped strings this improved the performance of the uri
127   * escaper from ~760ns to ~400ns as measured by {@link CharEscapersBenchmark}.
128   */
129  @Override
130  protected int nextEscapeIndex(CharSequence csq, int index, int end) {
131    checkNotNull(csq);
132    for (; index < end; index++) {
133      char c = csq.charAt(index);
134      if (c >= safeOctets.length || !safeOctets[c]) {
135        break;
136      }
137    }
138    return index;
139  }
140
141  /*
142   * Overridden for performance. For unescaped strings this improved the performance of the uri
143   * escaper from ~400ns to ~170ns as measured by {@link CharEscapersBenchmark}.
144   */
145  @Override
146  public String escape(String s) {
147    checkNotNull(s);
148    int slen = s.length();
149    for (int index = 0; index < slen; index++) {
150      char c = s.charAt(index);
151      if (c >= safeOctets.length || !safeOctets[c]) {
152        return escapeSlow(s, index);
153      }
154    }
155    return s;
156  }
157
158  /** Escapes the given Unicode code point in UTF-8. */
159  @Override
160  @CheckForNull
161  protected char[] escape(int cp) {
162    // We should never get negative values here but if we do it will throw an
163    // IndexOutOfBoundsException, so at least it will get spotted.
164    if (cp < safeOctets.length && safeOctets[cp]) {
165      return null;
166    } else if (cp == ' ' && plusForSpace) {
167      return PLUS_SIGN;
168    } else if (cp <= 0x7F) {
169      // Single byte UTF-8 characters
170      // Start with "%--" and fill in the blanks
171      char[] dest = new char[3];
172      dest[0] = '%';
173      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
174      dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
175      return dest;
176    } else if (cp <= 0x7ff) {
177      // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
178      // Start with "%--%--" and fill in the blanks
179      char[] dest = new char[6];
180      dest[0] = '%';
181      dest[3] = '%';
182      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
183      cp >>>= 4;
184      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
185      cp >>>= 2;
186      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
187      cp >>>= 4;
188      dest[1] = UPPER_HEX_DIGITS[0xC | cp];
189      return dest;
190    } else if (cp <= 0xffff) {
191      // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
192      // Start with "%E-%--%--" and fill in the blanks
193      char[] dest = new char[9];
194      dest[0] = '%';
195      dest[1] = 'E';
196      dest[3] = '%';
197      dest[6] = '%';
198      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
199      cp >>>= 4;
200      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
201      cp >>>= 2;
202      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
203      cp >>>= 4;
204      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
205      cp >>>= 2;
206      dest[2] = UPPER_HEX_DIGITS[cp];
207      return dest;
208    } else if (cp <= 0x10ffff) {
209      char[] dest = new char[12];
210      // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
211      // Start with "%F-%--%--%--" and fill in the blanks
212      dest[0] = '%';
213      dest[1] = 'F';
214      dest[3] = '%';
215      dest[6] = '%';
216      dest[9] = '%';
217      dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
218      cp >>>= 4;
219      dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
220      cp >>>= 2;
221      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
222      cp >>>= 4;
223      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
224      cp >>>= 2;
225      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
226      cp >>>= 4;
227      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
228      cp >>>= 2;
229      dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
230      return dest;
231    } else {
232      // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
233      throw new IllegalArgumentException("Invalid unicode character value " + cp);
234    }
235  }
236}