001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.net; 018 019import static com.google.common.base.Preconditions.checkNotNull; 020 021import com.google.common.annotations.Beta; 022import com.google.common.annotations.GwtCompatible; 023import com.google.common.escape.UnicodeEscaper; 024 025/** 026 * A {@code UnicodeEscaper} that escapes some set of Java characters using a 027 * UTF-8 based percent encoding scheme. The set of safe characters (those which 028 * remain unescaped) can be specified on construction. 029 * 030 * <p>This class is primarily used for creating URI escapers in {@link 031 * UrlEscapers} but can be used directly if required. While URI escapers impose 032 * specific semantics on which characters are considered 'safe', this class has 033 * a minimal set of restrictions. 034 * 035 * <p>When escaping a String, the following rules apply: 036 * <ul> 037 * <li>All specified safe characters remain unchanged. 038 * <li>If {@code plusForSpace} was specified, the space character " " is 039 * converted into a plus sign {@code "+"}. 040 * <li>All other characters are converted into one or more bytes using UTF-8 041 * encoding and each byte is then represented by the 3-character string 042 * "%XX", where "XX" is the two-digit, uppercase, hexadecimal representation 043 * of the byte value. 044 * </ul> 045 * 046 * <p>For performance reasons the only currently supported character encoding of 047 * this class is UTF-8. 048 * 049 * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From 050 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br> 051 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 052 * for all percent-encodings."</i> 053 * 054 * @author David Beaumont 055 * @since 15.0 056 */ 057@Beta 058@GwtCompatible 059public final class PercentEscaper extends UnicodeEscaper { 060 061 // In some escapers spaces are escaped to '+' 062 private static final char[] PLUS_SIGN = { '+' }; 063 064 // Percent escapers output upper case hex digits (uri escapers require this). 065 private static final char[] UPPER_HEX_DIGITS = 066 "0123456789ABCDEF".toCharArray(); 067 068 /** 069 * If true we should convert space to the {@code +} character. 070 */ 071 private final boolean plusForSpace; 072 073 /** 074 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is 075 * true then {@code c} should remain unmodified in the output. If 076 * {@code c > safeOctets.length} then it should be escaped. 077 */ 078 private final boolean[] safeOctets; 079 080 /** 081 * Constructs a percent escaper with the specified safe characters and 082 * optional handling of the space character. 083 * 084 * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} 085 * as a safe character. This has the effect of creating an escaper which has no 086 * well defined inverse but it can be useful when escaping additional characters. 087 * 088 * @param safeChars a non null string specifying additional safe characters 089 * for this escaper (the ranges 0..9, a..z and A..Z are always safe and 090 * should not be specified here) 091 * @param plusForSpace true if ASCII space should be escaped to {@code +} 092 * rather than {@code %20} 093 * @throws IllegalArgumentException if any of the parameters were invalid 094 */ 095 public PercentEscaper(String safeChars, boolean plusForSpace) { 096 // TODO(user): Switch to static factory methods for creation now that class is final. 097 // TODO(user): Support escapers where alphanumeric chars are not safe. 098 checkNotNull(safeChars); // eager for GWT. 099 // Avoid any misunderstandings about the behavior of this escaper 100 if (safeChars.matches(".*[0-9A-Za-z].*")) { 101 throw new IllegalArgumentException( 102 "Alphanumeric characters are always 'safe' and should not be " + 103 "explicitly specified"); 104 } 105 safeChars += "abcdefghijklmnopqrstuvwxyz" + 106 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + 107 "0123456789"; 108 // Avoid ambiguous parameters. Safe characters are never modified so if 109 // space is a safe character then setting plusForSpace is meaningless. 110 if (plusForSpace && safeChars.contains(" ")) { 111 throw new IllegalArgumentException( 112 "plusForSpace cannot be specified when space is a 'safe' character"); 113 } 114 this.plusForSpace = plusForSpace; 115 this.safeOctets = createSafeOctets(safeChars); 116 } 117 118 /** 119 * Creates a boolean array with entries corresponding to the character values 120 * specified in safeChars set to true. The array is as small as is required to 121 * hold the given character information. 122 */ 123 private static boolean[] createSafeOctets(String safeChars) { 124 int maxChar = -1; 125 char[] safeCharArray = safeChars.toCharArray(); 126 for (char c : safeCharArray) { 127 maxChar = Math.max(c, maxChar); 128 } 129 boolean[] octets = new boolean[maxChar + 1]; 130 for (char c : safeCharArray) { 131 octets[c] = true; 132 } 133 return octets; 134 } 135 136 /* 137 * Overridden for performance. For unescaped strings this improved the 138 * performance of the uri escaper from ~760ns to ~400ns as measured by 139 * {@link CharEscapersBenchmark}. 140 */ 141 @Override 142 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 143 checkNotNull(csq); 144 for (; index < end; index++) { 145 char c = csq.charAt(index); 146 if (c >= safeOctets.length || !safeOctets[c]) { 147 break; 148 } 149 } 150 return index; 151 } 152 153 /* 154 * Overridden for performance. For unescaped strings this improved the 155 * performance of the uri escaper from ~400ns to ~170ns as measured by 156 * {@link CharEscapersBenchmark}. 157 */ 158 @Override 159 public String escape(String s) { 160 checkNotNull(s); 161 int slen = s.length(); 162 for (int index = 0; index < slen; index++) { 163 char c = s.charAt(index); 164 if (c >= safeOctets.length || !safeOctets[c]) { 165 return escapeSlow(s, index); 166 } 167 } 168 return s; 169 } 170 171 /** 172 * Escapes the given Unicode code point in UTF-8. 173 */ 174 @Override 175 protected char[] escape(int cp) { 176 // We should never get negative values here but if we do it will throw an 177 // IndexOutOfBoundsException, so at least it will get spotted. 178 if (cp < safeOctets.length && safeOctets[cp]) { 179 return null; 180 } else if (cp == ' ' && plusForSpace) { 181 return PLUS_SIGN; 182 } else if (cp <= 0x7F) { 183 // Single byte UTF-8 characters 184 // Start with "%--" and fill in the blanks 185 char[] dest = new char[3]; 186 dest[0] = '%'; 187 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 188 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 189 return dest; 190 } else if (cp <= 0x7ff) { 191 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 192 // Start with "%--%--" and fill in the blanks 193 char[] dest = new char[6]; 194 dest[0] = '%'; 195 dest[3] = '%'; 196 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 197 cp >>>= 4; 198 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 199 cp >>>= 2; 200 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 201 cp >>>= 4; 202 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 203 return dest; 204 } else if (cp <= 0xffff) { 205 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 206 // Start with "%E-%--%--" and fill in the blanks 207 char[] dest = new char[9]; 208 dest[0] = '%'; 209 dest[1] = 'E'; 210 dest[3] = '%'; 211 dest[6] = '%'; 212 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 213 cp >>>= 4; 214 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 215 cp >>>= 2; 216 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 217 cp >>>= 4; 218 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 219 cp >>>= 2; 220 dest[2] = UPPER_HEX_DIGITS[cp]; 221 return dest; 222 } else if (cp <= 0x10ffff) { 223 char[] dest = new char[12]; 224 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 225 // Start with "%F-%--%--%--" and fill in the blanks 226 dest[0] = '%'; 227 dest[1] = 'F'; 228 dest[3] = '%'; 229 dest[6] = '%'; 230 dest[9] = '%'; 231 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 232 cp >>>= 4; 233 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 234 cp >>>= 2; 235 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 236 cp >>>= 4; 237 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 238 cp >>>= 2; 239 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 240 cp >>>= 4; 241 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 242 cp >>>= 2; 243 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 244 return dest; 245 } else { 246 // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 247 throw new IllegalArgumentException( 248 "Invalid unicode character value " + cp); 249 } 250 } 251}