001/*
002 * Copyright (C) 2013 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.base;
016
017import static com.google.common.base.Preconditions.checkPositionIndexes;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.GwtCompatible;
021
022/**
023 * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
024 * character encoding. UTF-8 is defined in section D92 of
025 * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
026 * Specification, Chapter 3</a>.
027 *
028 * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
029 * introduced in Unicode 3.1. One implication of this is that it rejects
030 * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
031 * sequences, even though the JDK decoder may accept them.
032 *
033 * @author Martin Buchholz
034 * @author Clément Roux
035 * @since 16.0
036 */
037@Beta
038@GwtCompatible
039public final class Utf8 {
040  /**
041   * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
042   * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
043   * both time and space.
044   *
045   * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
046   *     surrogates)
047   */
048  public static int encodedLength(CharSequence sequence) {
049    // Warning to maintainers: this implementation is highly optimized.
050    int utf16Length = sequence.length();
051    int utf8Length = utf16Length;
052    int i = 0;
053
054    // This loop optimizes for pure ASCII.
055    while (i < utf16Length && sequence.charAt(i) < 0x80) {
056      i++;
057    }
058
059    // This loop optimizes for chars less than 0x800.
060    for (; i < utf16Length; i++) {
061      char c = sequence.charAt(i);
062      if (c < 0x800) {
063        utf8Length += ((0x7f - c) >>> 31);  // branch free!
064      } else {
065        utf8Length += encodedLengthGeneral(sequence, i);
066        break;
067      }
068    }
069
070    if (utf8Length < utf16Length) {
071      // Necessary and sufficient condition for overflow because of maximum 3x expansion
072      throw new IllegalArgumentException("UTF-8 length does not fit in int: "
073                                         + (utf8Length + (1L << 32)));
074    }
075    return utf8Length;
076  }
077
078  private static int encodedLengthGeneral(CharSequence sequence, int start) {
079    int utf16Length = sequence.length();
080    int utf8Length = 0;
081    for (int i = start; i < utf16Length; i++) {
082      char c = sequence.charAt(i);
083      if (c < 0x800) {
084        utf8Length += (0x7f - c) >>> 31; // branch free!
085      } else {
086        utf8Length += 2;
087        // jdk7+: if (Character.isSurrogate(c)) {
088        if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
089          // Check that we have a well-formed surrogate pair.
090          int cp = Character.codePointAt(sequence, i);
091          if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
092            throw new IllegalArgumentException("Unpaired surrogate at index " + i);
093          }
094          i++;
095        }
096      }
097    }
098    return utf8Length;
099  }
100
101  /**
102   * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
103   * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
104   * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
105   * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
106   * well-formed.
107   *
108   * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
109   * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
110   */
111  public static boolean isWellFormed(byte[] bytes) {
112    return isWellFormed(bytes, 0, bytes.length);
113  }
114
115  /**
116   * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
117   * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
118   * isWellFormed(bytes)} is true.
119   *
120   * @param bytes the input buffer
121   * @param off the offset in the buffer of the first byte to read
122   * @param len the number of bytes to read from the buffer
123   */
124  public static boolean isWellFormed(byte[] bytes, int off, int len) {
125    int end = off + len;
126    checkPositionIndexes(off, end, bytes.length);
127    // Look for the first non-ASCII character.
128    for (int i = off; i < end; i++) {
129      if (bytes[i] < 0) {
130        return isWellFormedSlowPath(bytes, i, end);
131      }
132    }
133    return true;
134  }
135
136  private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
137    int index = off;
138    while (true) {
139      int byte1;
140
141      // Optimize for interior runs of ASCII bytes.
142      do {
143        if (index >= end) {
144          return true;
145        }
146      } while ((byte1 = bytes[index++]) >= 0);
147
148      if (byte1 < (byte) 0xE0) {
149        // Two-byte form.
150        if (index == end) {
151          return false;
152        }
153        // Simultaneously check for illegal trailing-byte in leading position
154        // and overlong 2-byte form.
155        if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
156          return false;
157        }
158      } else if (byte1 < (byte) 0xF0) {
159        // Three-byte form.
160        if (index + 1 >= end) {
161          return false;
162        }
163        int byte2 = bytes[index++];
164        if (byte2 > (byte) 0xBF
165            // Overlong? 5 most significant bits must not all be zero.
166            || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
167            // Check for illegal surrogate codepoints.
168            || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
169            // Third byte trailing-byte test.
170            || bytes[index++] > (byte) 0xBF) {
171          return false;
172        }
173      } else {
174        // Four-byte form.
175        if (index + 2 >= end) {
176          return false;
177        }
178        int byte2 = bytes[index++];
179        if (byte2 > (byte) 0xBF
180            // Check that 1 <= plane <= 16. Tricky optimized form of:
181            // if (byte1 > (byte) 0xF4
182            //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
183            //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
184            || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
185            // Third byte trailing-byte test
186            || bytes[index++] > (byte) 0xBF
187            // Fourth byte trailing-byte test
188            || bytes[index++] > (byte) 0xBF) {
189          return false;
190        }
191      }
192    }
193  }
194
195  private Utf8() {}
196}