001/*
002 * Copyright (C) 2013 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.base;
016
017import static com.google.common.base.Preconditions.checkPositionIndexes;
018import static java.lang.Character.MAX_SURROGATE;
019import static java.lang.Character.MIN_SURROGATE;
020
021import com.google.common.annotations.Beta;
022import com.google.common.annotations.GwtCompatible;
023
024import javax.annotation.CheckReturnValue;
025
026/**
027 * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
028 * character encoding. UTF-8 is defined in section D92 of
029 * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
030 * Specification, Chapter 3</a>.
031 *
032 * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
033 * introduced in Unicode 3.1. One implication of this is that it rejects
034 * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
035 * sequences, even though the JDK decoder may accept them.
036 *
037 * @author Martin Buchholz
038 * @author Clément Roux
039 * @since 16.0
040 */
041@Beta
042@GwtCompatible
043public final class Utf8 {
044  /**
045   * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
046   * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
047   * both time and space.
048   *
049   * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
050   *     surrogates)
051   */
052  @CheckReturnValue
053  public static int encodedLength(CharSequence sequence) {
054    // Warning to maintainers: this implementation is highly optimized.
055    int utf16Length = sequence.length();
056    int utf8Length = utf16Length;
057    int i = 0;
058
059    // This loop optimizes for pure ASCII.
060    while (i < utf16Length && sequence.charAt(i) < 0x80) {
061      i++;
062    }
063
064    // This loop optimizes for chars less than 0x800.
065    for (; i < utf16Length; i++) {
066      char c = sequence.charAt(i);
067      if (c < 0x800) {
068        utf8Length += ((0x7f - c) >>> 31); // branch free!
069      } else {
070        utf8Length += encodedLengthGeneral(sequence, i);
071        break;
072      }
073    }
074
075    if (utf8Length < utf16Length) {
076      // Necessary and sufficient condition for overflow because of maximum 3x expansion
077      throw new IllegalArgumentException(
078          "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
079    }
080    return utf8Length;
081  }
082
083  private static int encodedLengthGeneral(CharSequence sequence, int start) {
084    int utf16Length = sequence.length();
085    int utf8Length = 0;
086    for (int i = start; i < utf16Length; i++) {
087      char c = sequence.charAt(i);
088      if (c < 0x800) {
089        utf8Length += (0x7f - c) >>> 31; // branch free!
090      } else {
091        utf8Length += 2;
092        // jdk7+: if (Character.isSurrogate(c)) {
093        if (MIN_SURROGATE <= c && c <= MAX_SURROGATE) {
094          // Check that we have a well-formed surrogate pair.
095          if (Character.codePointAt(sequence, i) == c) {
096            throw new IllegalArgumentException(unpairedSurrogateMsg(i));
097          }
098          i++;
099        }
100      }
101    }
102    return utf8Length;
103  }
104
105  /**
106   * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
107   * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
108   * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
109   * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
110   * well-formed.
111   *
112   * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
113   * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
114   */
115  @CheckReturnValue
116  public static boolean isWellFormed(byte[] bytes) {
117    return isWellFormed(bytes, 0, bytes.length);
118  }
119
120  /**
121   * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
122   * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
123   * isWellFormed(bytes)} is true.
124   *
125   * @param bytes the input buffer
126   * @param off the offset in the buffer of the first byte to read
127   * @param len the number of bytes to read from the buffer
128   */
129  @CheckReturnValue
130  public static boolean isWellFormed(byte[] bytes, int off, int len) {
131    int end = off + len;
132    checkPositionIndexes(off, end, bytes.length);
133    // Look for the first non-ASCII character.
134    for (int i = off; i < end; i++) {
135      if (bytes[i] < 0) {
136        return isWellFormedSlowPath(bytes, i, end);
137      }
138    }
139    return true;
140  }
141
142  private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
143    int index = off;
144    while (true) {
145      int byte1;
146
147      // Optimize for interior runs of ASCII bytes.
148      do {
149        if (index >= end) {
150          return true;
151        }
152      } while ((byte1 = bytes[index++]) >= 0);
153
154      if (byte1 < (byte) 0xE0) {
155        // Two-byte form.
156        if (index == end) {
157          return false;
158        }
159        // Simultaneously check for illegal trailing-byte in leading position
160        // and overlong 2-byte form.
161        if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
162          return false;
163        }
164      } else if (byte1 < (byte) 0xF0) {
165        // Three-byte form.
166        if (index + 1 >= end) {
167          return false;
168        }
169        int byte2 = bytes[index++];
170        if (byte2 > (byte) 0xBF
171            // Overlong? 5 most significant bits must not all be zero.
172            || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
173            // Check for illegal surrogate codepoints.
174            || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
175            // Third byte trailing-byte test.
176            || bytes[index++] > (byte) 0xBF) {
177          return false;
178        }
179      } else {
180        // Four-byte form.
181        if (index + 2 >= end) {
182          return false;
183        }
184        int byte2 = bytes[index++];
185        if (byte2 > (byte) 0xBF
186            // Check that 1 <= plane <= 16. Tricky optimized form of:
187            // if (byte1 > (byte) 0xF4
188            //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
189            //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
190            || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
191            // Third byte trailing-byte test
192            || bytes[index++] > (byte) 0xBF
193            // Fourth byte trailing-byte test
194            || bytes[index++] > (byte) 0xBF) {
195          return false;
196        }
197      }
198    }
199  }
200
201  private static String unpairedSurrogateMsg(int i) {
202    return "Unpaired surrogate at index " + i;
203  }
204
205  private Utf8() {}
206}