001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.net; 016 017import static com.google.common.base.Preconditions.checkNotNull; 018import static java.lang.Math.max; 019 020import com.google.common.annotations.GwtCompatible; 021import com.google.common.escape.UnicodeEscaper; 022import javax.annotation.CheckForNull; 023 024/** 025 * A {@code UnicodeEscaper} that escapes some set of Java characters using a UTF-8 based percent 026 * encoding scheme. The set of safe characters (those which remain unescaped) can be specified on 027 * construction. 028 * 029 * <p>This class is primarily used for creating URI escapers in {@link UrlEscapers} but can be used 030 * directly if required. While URI escapers impose specific semantics on which characters are 031 * considered 'safe', this class has a minimal set of restrictions. 032 * 033 * <p>When escaping a String, the following rules apply: 034 * 035 * <ul> 036 * <li>All specified safe characters remain unchanged. 037 * <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus 038 * sign {@code "+"}. 039 * <li>All other characters are converted into one or more bytes using UTF-8 encoding and each 040 * byte is then represented by the 3-character string "%XX", where "XX" is the two-digit, 041 * uppercase, hexadecimal representation of the byte value. 042 * </ul> 043 * 044 * <p>For performance reasons the only currently supported character encoding of this class is 045 * UTF-8. 046 * 047 * <p><b>Note:</b> This escaper produces <a 048 * href="https://url.spec.whatwg.org/#percent-encode">uppercase</a> hexadecimal sequences. 049 * 050 * @author David Beaumont 051 * @since 15.0 052 */ 053@GwtCompatible 054@ElementTypesAreNonnullByDefault 055public final class PercentEscaper extends UnicodeEscaper { 056 057 // In some escapers spaces are escaped to '+' 058 private static final char[] PLUS_SIGN = {'+'}; 059 060 // Percent escapers output upper case hex digits (uri escapers require this). 061 private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray(); 062 063 /** If true we should convert space to the {@code +} character. */ 064 private final boolean plusForSpace; 065 066 /** 067 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is true then {@code c} 068 * should remain unmodified in the output. If {@code c >= safeOctets.length} then it should be 069 * escaped. 070 */ 071 private final boolean[] safeOctets; 072 073 /** 074 * Constructs a percent escaper with the specified safe characters and optional handling of the 075 * space character. 076 * 077 * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} as a safe 078 * character. This has the effect of creating an escaper which has no well-defined inverse but it 079 * can be useful when escaping additional characters. 080 * 081 * @param safeChars a non-null string specifying additional safe characters for this escaper (the 082 * ranges 0..9, a..z and A..Z are always safe and should not be specified here) 083 * @param plusForSpace true if ASCII space should be escaped to {@code +} rather than {@code %20} 084 * @throws IllegalArgumentException if any of the parameters were invalid 085 */ 086 public PercentEscaper(String safeChars, boolean plusForSpace) { 087 // TODO(dbeaumont): Switch to static factory methods for creation now that class is final. 088 // TODO(dbeaumont): Support escapers where alphanumeric chars are not safe. 089 checkNotNull(safeChars); // eager for GWT. 090 // Avoid any misunderstandings about the behavior of this escaper 091 if (safeChars.matches(".*[0-9A-Za-z].*")) { 092 throw new IllegalArgumentException( 093 "Alphanumeric characters are always 'safe' and should not be explicitly specified"); 094 } 095 safeChars += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 096 // Avoid ambiguous parameters. Safe characters are never modified so if 097 // space is a safe character then setting plusForSpace is meaningless. 098 if (plusForSpace && safeChars.contains(" ")) { 099 throw new IllegalArgumentException( 100 "plusForSpace cannot be specified when space is a 'safe' character"); 101 } 102 this.plusForSpace = plusForSpace; 103 this.safeOctets = createSafeOctets(safeChars); 104 } 105 106 /** 107 * Creates a boolean array with entries corresponding to the character values specified in 108 * safeChars set to true. The array is as small as is required to hold the given character 109 * information. 110 */ 111 private static boolean[] createSafeOctets(String safeChars) { 112 int maxChar = -1; 113 char[] safeCharArray = safeChars.toCharArray(); 114 for (char c : safeCharArray) { 115 maxChar = max(c, maxChar); 116 } 117 boolean[] octets = new boolean[maxChar + 1]; 118 for (char c : safeCharArray) { 119 octets[c] = true; 120 } 121 return octets; 122 } 123 124 /* 125 * Overridden for performance. For unescaped strings this improved the performance of the uri 126 * escaper from ~760ns to ~400ns as measured by {@link CharEscapersBenchmark}. 127 */ 128 @Override 129 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 130 checkNotNull(csq); 131 for (; index < end; index++) { 132 char c = csq.charAt(index); 133 if (c >= safeOctets.length || !safeOctets[c]) { 134 break; 135 } 136 } 137 return index; 138 } 139 140 /* 141 * Overridden for performance. For unescaped strings this improved the performance of the uri 142 * escaper from ~400ns to ~170ns as measured by {@link CharEscapersBenchmark}. 143 */ 144 @Override 145 public String escape(String s) { 146 checkNotNull(s); 147 int slen = s.length(); 148 for (int index = 0; index < slen; index++) { 149 char c = s.charAt(index); 150 if (c >= safeOctets.length || !safeOctets[c]) { 151 return escapeSlow(s, index); 152 } 153 } 154 return s; 155 } 156 157 /** Escapes the given Unicode code point in UTF-8. */ 158 @Override 159 @CheckForNull 160 protected char[] escape(int cp) { 161 // We should never get negative values here but if we do it will throw an 162 // IndexOutOfBoundsException, so at least it will get spotted. 163 if (cp < safeOctets.length && safeOctets[cp]) { 164 return null; 165 } else if (cp == ' ' && plusForSpace) { 166 return PLUS_SIGN; 167 } else if (cp <= 0x7F) { 168 // Single byte UTF-8 characters 169 // Start with "%--" and fill in the blanks 170 char[] dest = new char[3]; 171 dest[0] = '%'; 172 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 173 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 174 return dest; 175 } else if (cp <= 0x7ff) { 176 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 177 // Start with "%--%--" and fill in the blanks 178 char[] dest = new char[6]; 179 dest[0] = '%'; 180 dest[3] = '%'; 181 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 182 cp >>>= 4; 183 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 184 cp >>>= 2; 185 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 186 cp >>>= 4; 187 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 188 return dest; 189 } else if (cp <= 0xffff) { 190 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 191 // Start with "%E-%--%--" and fill in the blanks 192 char[] dest = new char[9]; 193 dest[0] = '%'; 194 dest[1] = 'E'; 195 dest[3] = '%'; 196 dest[6] = '%'; 197 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 198 cp >>>= 4; 199 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 200 cp >>>= 2; 201 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 202 cp >>>= 4; 203 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 204 cp >>>= 2; 205 dest[2] = UPPER_HEX_DIGITS[cp]; 206 return dest; 207 } else if (cp <= 0x10ffff) { 208 char[] dest = new char[12]; 209 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 210 // Start with "%F-%--%--%--" and fill in the blanks 211 dest[0] = '%'; 212 dest[1] = 'F'; 213 dest[3] = '%'; 214 dest[6] = '%'; 215 dest[9] = '%'; 216 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 217 cp >>>= 4; 218 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 219 cp >>>= 2; 220 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 221 cp >>>= 4; 222 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 223 cp >>>= 2; 224 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 225 cp >>>= 4; 226 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 227 cp >>>= 2; 228 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 229 return dest; 230 } else { 231 // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 232 throw new IllegalArgumentException("Invalid unicode character value " + cp); 233 } 234 } 235}