001/* 002 * Copyright (C) 2013 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkPositionIndexes; 018import static java.lang.Character.MAX_SURROGATE; 019import static java.lang.Character.MIN_SURROGATE; 020 021import com.google.common.annotations.Beta; 022import com.google.common.annotations.GwtCompatible; 023 024import javax.annotation.CheckReturnValue; 025 026/** 027 * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8} 028 * character encoding. UTF-8 is defined in section D92 of 029 * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core 030 * Specification, Chapter 3</a>. 031 * 032 * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8 033 * introduced in Unicode 3.1. One implication of this is that it rejects 034 * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte 035 * sequences, even though the JDK decoder may accept them. 036 * 037 * @author Martin Buchholz 038 * @author Clément Roux 039 * @since 16.0 040 */ 041@Beta 042@GwtCompatible 043public final class Utf8 { 044 /** 045 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 046 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 047 * both time and space. 048 * 049 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 050 * surrogates) 051 */ 052 @CheckReturnValue 053 public static int encodedLength(CharSequence sequence) { 054 // Warning to maintainers: this implementation is highly optimized. 055 int utf16Length = sequence.length(); 056 int utf8Length = utf16Length; 057 int i = 0; 058 059 // This loop optimizes for pure ASCII. 060 while (i < utf16Length && sequence.charAt(i) < 0x80) { 061 i++; 062 } 063 064 // This loop optimizes for chars less than 0x800. 065 for (; i < utf16Length; i++) { 066 char c = sequence.charAt(i); 067 if (c < 0x800) { 068 utf8Length += ((0x7f - c) >>> 31); // branch free! 069 } else { 070 utf8Length += encodedLengthGeneral(sequence, i); 071 break; 072 } 073 } 074 075 if (utf8Length < utf16Length) { 076 // Necessary and sufficient condition for overflow because of maximum 3x expansion 077 throw new IllegalArgumentException( 078 "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); 079 } 080 return utf8Length; 081 } 082 083 private static int encodedLengthGeneral(CharSequence sequence, int start) { 084 int utf16Length = sequence.length(); 085 int utf8Length = 0; 086 for (int i = start; i < utf16Length; i++) { 087 char c = sequence.charAt(i); 088 if (c < 0x800) { 089 utf8Length += (0x7f - c) >>> 31; // branch free! 090 } else { 091 utf8Length += 2; 092 // jdk7+: if (Character.isSurrogate(c)) { 093 if (MIN_SURROGATE <= c && c <= MAX_SURROGATE) { 094 // Check that we have a well-formed surrogate pair. 095 if (Character.codePointAt(sequence, i) == c) { 096 throw new IllegalArgumentException(unpairedSurrogateMsg(i)); 097 } 098 i++; 099 } 100 } 101 } 102 return utf8Length; 103 } 104 105 /** 106 * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to 107 * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be 108 * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte 109 * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered 110 * well-formed. 111 * 112 * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new 113 * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space. 114 */ 115 @CheckReturnValue 116 public static boolean isWellFormed(byte[] bytes) { 117 return isWellFormed(bytes, 0, bytes.length); 118 } 119 120 /** 121 * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by 122 * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code 123 * isWellFormed(bytes)} is true. 124 * 125 * @param bytes the input buffer 126 * @param off the offset in the buffer of the first byte to read 127 * @param len the number of bytes to read from the buffer 128 */ 129 @CheckReturnValue 130 public static boolean isWellFormed(byte[] bytes, int off, int len) { 131 int end = off + len; 132 checkPositionIndexes(off, end, bytes.length); 133 // Look for the first non-ASCII character. 134 for (int i = off; i < end; i++) { 135 if (bytes[i] < 0) { 136 return isWellFormedSlowPath(bytes, i, end); 137 } 138 } 139 return true; 140 } 141 142 private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) { 143 int index = off; 144 while (true) { 145 int byte1; 146 147 // Optimize for interior runs of ASCII bytes. 148 do { 149 if (index >= end) { 150 return true; 151 } 152 } while ((byte1 = bytes[index++]) >= 0); 153 154 if (byte1 < (byte) 0xE0) { 155 // Two-byte form. 156 if (index == end) { 157 return false; 158 } 159 // Simultaneously check for illegal trailing-byte in leading position 160 // and overlong 2-byte form. 161 if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { 162 return false; 163 } 164 } else if (byte1 < (byte) 0xF0) { 165 // Three-byte form. 166 if (index + 1 >= end) { 167 return false; 168 } 169 int byte2 = bytes[index++]; 170 if (byte2 > (byte) 0xBF 171 // Overlong? 5 most significant bits must not all be zero. 172 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 173 // Check for illegal surrogate codepoints. 174 || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2) 175 // Third byte trailing-byte test. 176 || bytes[index++] > (byte) 0xBF) { 177 return false; 178 } 179 } else { 180 // Four-byte form. 181 if (index + 2 >= end) { 182 return false; 183 } 184 int byte2 = bytes[index++]; 185 if (byte2 > (byte) 0xBF 186 // Check that 1 <= plane <= 16. Tricky optimized form of: 187 // if (byte1 > (byte) 0xF4 188 // || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 189 // || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 191 // Third byte trailing-byte test 192 || bytes[index++] > (byte) 0xBF 193 // Fourth byte trailing-byte test 194 || bytes[index++] > (byte) 0xBF) { 195 return false; 196 } 197 } 198 } 199 } 200 201 private static String unpairedSurrogateMsg(int i) { 202 return "Unpaired surrogate at index " + i; 203 } 204 205 private Utf8() {} 206}