001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.base;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkPositionIndex;
020
021import com.google.common.annotations.GwtCompatible;
022import com.google.common.annotations.GwtIncompatible;
023import com.google.common.annotations.VisibleForTesting;
024import java.util.Arrays;
025import java.util.BitSet;
026
027/**
028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
029 * for any {@link Object}. Also offers basic text processing methods based on this function.
030 * Implementations are strongly encouraged to be side-effect-free and immutable.
031 *
032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}".
034 *
035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is,
036 * <a href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>.
037 * It does not understand
038 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode
039 * code points</a> in the range {@code 0x10000} to {@code 0x10FFFF}
040 * which includes the majority of assigned characters, including important CJK characters and emoji.
041 *
042 * <p>Supplementary characters are
043 * <a href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">
044 * encoded into a {@code String} using surrogate pairs</a>,
045 * and a {@code CharMatcher} treats these just as two separate characters.
046 * {@link #countIn} counts each supplementary character as 2 {@code char}s.
047 *
048 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for
049 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building).
050 * For basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner.
051 *
052 * <p>Example usages:
053 *
054 * <pre>
055 *   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
056 *   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
057 *
058 * <p>See the Guava User Guide article on <a
059 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher}
060 * </a>.
061 *
062 * @author Kevin Bourrillion
063 * @since 1.0
064 */
065@GwtCompatible(emulated = true)
066public abstract class CharMatcher implements Predicate<Character> {
067  /*
068   *           N777777777NO
069   *         N7777777777777N
070   *        M777777777777777N
071   *        $N877777777D77777M
072   *       N M77777777ONND777M
073   *       MN777777777NN  D777
074   *     N7ZN777777777NN ~M7778
075   *    N777777777777MMNN88777N
076   *    N777777777777MNZZZ7777O
077   *    DZN7777O77777777777777
078   *     N7OONND7777777D77777N
079   *      8$M++++?N???$77777$
080   *       M7++++N+M77777777N
081   *        N77O777777777777$                              M
082   *          DNNM$$$$777777N                              D
083   *         N$N:=N$777N7777M                             NZ
084   *        77Z::::N777777777                          ODZZZ
085   *       77N::::::N77777777M                         NNZZZ$
086   *     $777:::::::77777777MN                        ZM8ZZZZZ
087   *     777M::::::Z7777777Z77                        N++ZZZZNN
088   *    7777M:::::M7777777$777M                       $++IZZZZM
089   *   M777$:::::N777777$M7777M                       +++++ZZZDN
090   *     NN$::::::7777$$M777777N                      N+++ZZZZNZ
091   *       N::::::N:7$O:77777777                      N++++ZZZZN
092   *       M::::::::::::N77777777+                   +?+++++ZZZM
093   *       8::::::::::::D77777777M                    O+++++ZZ
094   *        ::::::::::::M777777777N                      O+?D
095   *        M:::::::::::M77777777778                     77=
096   *        D=::::::::::N7777777777N                    777
097   *       INN===::::::=77777777777N                  I777N
098   *      ?777N========N7777777777787M               N7777
099   *      77777$D======N77777777777N777N?         N777777
100   *     I77777$$$N7===M$$77777777$77777777$MMZ77777777N
101   *      $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
102   *       M$$$$$$$$M    M$$$$$$$$N=N$$$$7777777$$$ND
103   *      O77Z$$$$$$$     M$$$$$$$$MNI==$DNNNNM=~N
104   *   7 :N MNN$$$$M$      $$$777$8      8D8I
105   *     NMM.:7O           777777778
106   *                       7777777MN
107   *                       M NO .7:
108   *                       M   :   M
109   *                            8
110   */
111
112  // Constant matcher factory methods
113
  /**
   * Matches any character.
   *
   * @return the matcher stored in {@code Any.INSTANCE}
   * @since 19.0 (since 1.0 as constant {@code ANY})
   */
  public static CharMatcher any() {
    return Any.INSTANCE;
  }
122
  /**
   * Matches no characters.
   *
   * @return the matcher stored in {@code None.INSTANCE}
   * @since 19.0 (since 1.0 as constant {@code NONE})
   */
  public static CharMatcher none() {
    return None.INSTANCE;
  }
131
  /**
   * Determines whether a character is whitespace according to the latest Unicode standard, as
   * illustrated
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
   * "whitespace"</a>.)
   *
   * <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
   * date.
   *
   * @return the matcher stored in {@code Whitespace.INSTANCE}
   * @since 19.0 (since 1.0 as constant {@code WHITESPACE})
   */
  public static CharMatcher whitespace() {
    return Whitespace.INSTANCE;
  }
150
  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
   * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
   * discussion of that term.
   *
   * @return the matcher stored in {@code BreakingWhitespace.INSTANCE}
   * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
   */
  public static CharMatcher breakingWhitespace() {
    return BreakingWhitespace.INSTANCE;
  }
161
  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   *
   * @return the matcher stored in {@code Ascii.INSTANCE}
   * @since 19.0 (since 1.0 as constant {@code ASCII})
   */
  public static CharMatcher ascii() {
    return Ascii.INSTANCE;
  }
170
  /**
   * Determines whether a character is a BMP digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If
   * you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @return the matcher stored in {@code Digit.INSTANCE}
   * @deprecated Many digits are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code DIGIT})
   */
  @Deprecated
  public static CharMatcher digit() {
    return Digit.INSTANCE;
  }
183
  /**
   * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
   * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
   * '9')}.
   *
   * @return the matcher stored in {@code JavaDigit.INSTANCE}
   * @deprecated Many digits are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
   */
  @Deprecated
  public static CharMatcher javaDigit() {
    return JavaDigit.INSTANCE;
  }
196
  /**
   * Determines whether a character is a BMP letter according to
   * {@linkplain Character#isLetter(char) Java's definition}.
   * If you only care to match letters of the Latin alphabet, you can use
   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @return the matcher stored in {@code JavaLetter.INSTANCE}
   * @deprecated Most letters are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
   */
  @Deprecated
  public static CharMatcher javaLetter() {
    return JavaLetter.INSTANCE;
  }
210
  /**
   * Determines whether a character is a BMP letter or digit according to
   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
   *
   * @return the matcher stored in {@code JavaLetterOrDigit.INSTANCE}
   * @deprecated Most letters and digits are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT})
   */
  @Deprecated
  public static CharMatcher javaLetterOrDigit() {
    return JavaLetterOrDigit.INSTANCE;
  }
222
  /**
   * Determines whether a BMP character is upper case according to
   * {@linkplain Character#isUpperCase(char) Java's definition}.
   *
   * @return the matcher stored in {@code JavaUpperCase.INSTANCE}
   * @deprecated Some uppercase characters are supplementary characters;
   *     see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
   */
  @Deprecated
  public static CharMatcher javaUpperCase() {
    return JavaUpperCase.INSTANCE;
  }
235
  /**
   * Determines whether a BMP character is lower case according to
   * {@linkplain Character#isLowerCase(char) Java's definition}.
   *
   * @return the matcher stored in {@code JavaLowerCase.INSTANCE}
   * @deprecated Some lowercase characters are supplementary characters;
   *     see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
   */
  @Deprecated
  public static CharMatcher javaLowerCase() {
    return JavaLowerCase.INSTANCE;
  }
248
  /**
   * Determines whether a character is an ISO control character as specified by
   * {@link Character#isISOControl(char)}.
   *
   * <p>All ISO control codes are on the BMP and thus supported by this API.
   *
   * @return the matcher stored in {@code JavaIsoControl.INSTANCE}
   * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
   */
  public static CharMatcher javaIsoControl() {
    return JavaIsoControl.INSTANCE;
  }
260
  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   *
   * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
   *
   * @return the matcher stored in {@code Invisible.INSTANCE}
   * @deprecated Most invisible characters are supplementary characters;
   *     see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code INVISIBLE})
   */
  @Deprecated
  public static CharMatcher invisible() {
    return Invisible.INSTANCE;
  }
276
  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
   * date.
   *
   * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
   *
   * @return the matcher stored in {@code SingleWidth.INSTANCE}
   * @deprecated Many such characters are supplementary characters; see the class documentation.
   * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
   */
  @Deprecated
  public static CharMatcher singleWidth() {
    return SingleWidth.INSTANCE;
  }
294
295  // Legacy constants
296
  /**
   * Determines whether a character is whitespace according to the latest Unicode
   * standard, as illustrated
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
   * "whitespace"</a>.)
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant
   * to keep it up to date.
   *
   * @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher WHITESPACE = whitespace();

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace
   * which can be interpreted as a break between words for formatting purposes). See
   * {@link #whitespace()} for a discussion of that term.
   *
   * @since 2.0
   * @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled
   *     to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than
   * 128.
   *
   * @deprecated Use {@link #ascii()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ASCII = ascii();

  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">
   * Unicode</a>. If you only care to match ASCII digits, you can use
   * {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #digit()} instead. This
   *     constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher DIGIT = digit();

  /**
   * Determines whether a character is a digit according to
   * {@linkplain Character#isDigit(char) Java's definition}. If you only care to match
   * ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaDigit()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_DIGIT = javaDigit();

  /**
   * Determines whether a character is a letter according to
   * {@linkplain Character#isLetter(char) Java's definition}. If you only care to
   * match letters of the Latin alphabet, you can use
   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @deprecated Most letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetter()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER = javaLetter();

  /**
   * Determines whether a character is a letter or digit according to
   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
   *
   * @deprecated Most letters and digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetterOrDigit()}
   *     instead. This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit();

  /**
   * Determines whether a character is upper case according to
   * {@linkplain Character#isUpperCase(char) Java's definition}.
   *
   * @deprecated Some uppercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaUpperCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase();

  /**
   * Determines whether a character is lower case according to
   * {@linkplain Character#isLowerCase(char) Java's definition}.
   *
   * @deprecated Some lowercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLowerCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase();

  /**
   * Determines whether a character is an ISO control character as specified by
   * {@link Character#isISOControl(char)}.
   *
   * @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to
   *     be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl();

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is
   * any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
   * SURROGATE, and PRIVATE_USE according to ICU4J.
   *
   * @deprecated Most invisible characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #invisible()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher INVISIBLE = invisible();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt,
   * this matcher errs on the side of returning {@code false} (that is, it tends to
   * assume a character is double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to
   * keep it up to date.
   *
   * @deprecated Many such characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #singleWidth()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher SINGLE_WIDTH = singleWidth();

  /**
   * Matches any character.
   *
   * @deprecated Use {@link #any()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ANY = any();

  /**
   * Matches no characters.
   *
   * @deprecated Use {@link #none()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher NONE = none();
475
476  // Static factories
477
  /**
   * Returns a {@code char} matcher that matches only one specified BMP character.
   *
   * @param match the single character the returned matcher should accept
   * @return a new {@code Is} matcher built from {@code match}
   */
  public static CharMatcher is(final char match) {
    return new Is(match);
  }
484
  /**
   * Returns a {@code char} matcher that matches any character except the BMP character specified.
   *
   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
   *
   * @param match the single character the returned matcher should reject
   * @return a new {@code IsNot} matcher built from {@code match}
   */
  public static CharMatcher isNot(final char match) {
    return new IsNot(match);
  }
493
494  /**
495   * Returns a {@code char} matcher that matches any BMP character present in the given character
496   * sequence. Returns a bogus matcher if the sequence contains supplementary characters.
497   */
498  public static CharMatcher anyOf(final CharSequence sequence) {
499    switch (sequence.length()) {
500      case 0:
501        return none();
502      case 1:
503        return is(sequence.charAt(0));
504      case 2:
505        return isEither(sequence.charAt(0), sequence.charAt(1));
506      default:
507        // TODO(lowasser): is it potentially worth just going ahead and building a precomputed
508        // matcher?
509        return new AnyOf(sequence);
510    }
511  }
512
  /**
   * Returns a {@code char} matcher that matches any BMP character not present in the given
   * character sequence. Returns a bogus matcher if the sequence contains supplementary characters.
   *
   * <p>Implemented as the negation of {@link #anyOf(CharSequence)} for the same sequence.
   *
   * @param sequence the characters the returned matcher should reject
   */
  public static CharMatcher noneOf(CharSequence sequence) {
    return anyOf(sequence).negate();
  }
520
  /**
   * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints
   * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
   * CharMatcher.inRange('a', 'z')}.
   *
   * @param startInclusive the lowest character in the range
   * @param endInclusive the highest character in the range
   * @return a new {@code InRange} matcher built from the two endpoints
   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
   */
  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
    return new InRange(startInclusive, endInclusive);
  }
531
532  /**
533   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
534   * which operates on primitive {@code char} instances instead.
535   */
536  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
537    return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate);
538  }
539
540  // Constructors
541
  /**
   * Constructor for use by subclasses. When subclassing, you may want to override
   * {@code toString()} to provide a useful description.
   *
   * <p>The constructor itself does nothing; it is {@code protected} so that it is reachable from
   * subclasses.
   */
  protected CharMatcher() {}
547
548  // Abstract methods
549
  /**
   * Determines a true or false value for the given character.
   *
   * <p>This is the only abstract method of the class in view here; the text-processing methods
   * below are written in terms of it.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is a matching character
   */
  public abstract boolean matches(char c);
552
553  // Non-static factories
554
  /**
   * Returns a matcher that matches any character not matched by this matcher.
   *
   * @return a new {@code Negated} matcher wrapping this one
   */
  // @Override under Java 8 but not under Java 7
  public CharMatcher negate() {
    return new Negated(this);
  }
562
  /**
   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
   *
   * @param other the matcher to combine with this one
   * @return a new {@code And} matcher built from this matcher and {@code other}
   */
  public CharMatcher and(CharMatcher other) {
    return new And(this, other);
  }
569
  /**
   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
   *
   * @param other the matcher to combine with this one
   * @return a new {@code Or} matcher built from this matcher and {@code other}
   */
  public CharMatcher or(CharMatcher other) {
    return new Or(this, other);
  }
576
  /**
   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
   * worthwhile only if the precomputed matcher is queried many thousands of times.
   *
   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
   * worthwhile tradeoff in a browser.
   *
   * <p>The work is delegated to {@code Platform.precomputeCharMatcher(this)}, which is what allows
   * the behavior to differ per platform.
   */
  public CharMatcher precomputed() {
    return Platform.precomputeCharMatcher(this);
  }
589
  /**
   * Number of distinct {@code char} values, i.e. the size of the inclusive range from
   * {@link Character#MIN_VALUE} to {@link Character#MAX_VALUE}.
   */
  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
591
592  /**
593   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
594   * on {@link Platform} so that we can have different behavior in GWT.
595   *
596   * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the
597   * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables
598   * for matchers that only match a few characters, and so on. In the worst-case scenario, it
599   * constructs an eight-kilobyte bit array and queries that. In many situations this produces a
600   * matcher which is faster to query than the original.
601   */
602  @GwtIncompatible // SmallCharMatcher
603  CharMatcher precomputedInternal() {
604    final BitSet table = new BitSet();
605    setBits(table);
606    int totalCharacters = table.cardinality();
607    if (totalCharacters * 2 <= DISTINCT_CHARS) {
608      return precomputedPositive(totalCharacters, table, toString());
609    } else {
610      // TODO(lowasser): is it worth it to worry about the last character of large matchers?
611      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
612      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
613      String suffix = ".negate()";
614      final String description = toString();
615      String negatedDescription =
616          description.endsWith(suffix)
617              ? description.substring(0, description.length() - suffix.length())
618              : description + suffix;
619      return new NegatedFastMatcher(
620          precomputedPositive(negatedCharacters, table, negatedDescription)) {
621        @Override
622        public String toString() {
623          return description;
624        }
625      };
626    }
627  }
628
629  /**
630   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
631   */
632  @GwtIncompatible // SmallCharMatcher
633  private static CharMatcher precomputedPositive(
634      int totalCharacters, BitSet table, String description) {
635    switch (totalCharacters) {
636      case 0:
637        return none();
638      case 1:
639        return is((char) table.nextSetBit(0));
640      case 2:
641        char c1 = (char) table.nextSetBit(0);
642        char c2 = (char) table.nextSetBit(c1 + 1);
643        return isEither(c1, c2);
644      default:
645        return isSmall(totalCharacters, table.length())
646            ? SmallCharMatcher.from(table, description)
647            : new BitSetMatcher(table, description);
648    }
649  }
650
651  @GwtIncompatible // SmallCharMatcher
652  private static boolean isSmall(int totalCharacters, int tableLength) {
653    return totalCharacters <= SmallCharMatcher.MAX_SIZE
654        && tableLength > (totalCharacters * 4 * Character.SIZE);
655    // err on the side of BitSetMatcher
656  }
657
658  /**
659   * Sets bits in {@code table} matched by this matcher.
660   */
661  @GwtIncompatible // used only from other GwtIncompatible code
662  void setBits(BitSet table) {
663    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
664      if (matches((char) c)) {
665        table.set(c);
666      }
667    }
668  }
669
670  // Text processing routines
671
  /**
   * Returns {@code true} if a character sequence contains at least one matching BMP character.
   * Equivalent to {@code !matchesNoneOf(sequence)}.
   *
   * <p>The default implementation is exactly that negation: it delegates to
   * {@link #matchesNoneOf}, so the scan of the sequence happens there.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches at least one character in the sequence
   * @since 8.0
   */
  public boolean matchesAnyOf(CharSequence sequence) {
    return !matchesNoneOf(sequence);
  }
686
687  /**
688   * Returns {@code true} if a character sequence contains only matching BMP characters.
689   *
690   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
691   * character, until this returns {@code false} or the end is reached.
692   *
693   * @param sequence the character sequence to examine, possibly empty
694   * @return {@code true} if this matcher matches every character in the sequence, including when
695   *     the sequence is empty
696   */
697  public boolean matchesAllOf(CharSequence sequence) {
698    for (int i = sequence.length() - 1; i >= 0; i--) {
699      if (!matches(sequence.charAt(i))) {
700        return false;
701      }
702    }
703    return true;
704  }
705
  /**
   * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to
   * {@code !matchesAnyOf(sequence)}.
   *
   * <p>The default implementation delegates to {@link #indexIn(CharSequence)} and reports whether
   * it returned {@code -1}.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches no characters in the sequence, including when
   *     the sequence is empty
   */
  public boolean matchesNoneOf(CharSequence sequence) {
    return indexIn(sequence) == -1;
  }
720
  /**
   * Returns the index of the first matching BMP character in a character sequence,
   * or {@code -1} if no matching character is present.
   *
   * <p>The default implementation delegates to {@link #indexIn(CharSequence, int)} with a start
   * index of {@code 0}, which iterates over the sequence in forward order calling
   * {@link #matches} for each character.
   *
   * @param sequence the character sequence to examine from the beginning
   * @return an index, or {@code -1} if no character matches
   */
  public int indexIn(CharSequence sequence) {
    return indexIn(sequence, 0);
  }
734
735  /**
736   * Returns the index of the first matching BMP character in a character sequence, starting from a
737   * given position, or {@code -1} if no character matches after that position.
738   *
739   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
740   * start}, calling {@link #matches} for each character.
741   *
742   * @param sequence the character sequence to examine
743   * @param start the first index to examine; must be nonnegative and no greater than {@code
744   *        sequence.length()}
745   * @return the index of the first matching character, guaranteed to be no less than {@code start},
746   *     or {@code -1} if no character matches
747   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
748   *         sequence.length()}
749   */
750  public int indexIn(CharSequence sequence, int start) {
751    int length = sequence.length();
752    checkPositionIndex(start, length);
753    for (int i = start; i < length; i++) {
754      if (matches(sequence.charAt(i))) {
755        return i;
756      }
757    }
758    return -1;
759  }
760
761  /**
762   * Returns the index of the last matching BMP character in a character sequence,
763   * or {@code -1} if no matching character is present.
764   *
765   * <p>The default implementation iterates over the sequence in reverse order calling
766   * {@link #matches} for each character.
767   *
768   * @param sequence the character sequence to examine from the end
769   * @return an index, or {@code -1} if no character matches
770   */
771  public int lastIndexIn(CharSequence sequence) {
772    for (int i = sequence.length() - 1; i >= 0; i--) {
773      if (matches(sequence.charAt(i))) {
774        return i;
775      }
776    }
777    return -1;
778  }
779
780  /**
781   * Returns the number of matching {@code char}s found in a character sequence.
782   *
783   * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}().
784   */
785  public int countIn(CharSequence sequence) {
786    int count = 0;
787    for (int i = 0; i < sequence.length(); i++) {
788      if (matches(sequence.charAt(i))) {
789        count++;
790      }
791    }
792    return count;
793  }
794
795  /**
796   * Returns a string containing all non-matching characters of a character sequence, in order. For
797   * example: <pre>   {@code
798   *
799   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
800   *
801   * ... returns {@code "bzr"}.
802   */
803  public String removeFrom(CharSequence sequence) {
804    String string = sequence.toString();
805    int pos = indexIn(string);
806    if (pos == -1) {
807      return string;
808    }
809
810    char[] chars = string.toCharArray();
811    int spread = 1;
812
813    // This unusual loop comes from extensive benchmarking
814    OUT:
815    while (true) {
816      pos++;
817      while (true) {
818        if (pos == chars.length) {
819          break OUT;
820        }
821        if (matches(chars[pos])) {
822          break;
823        }
824        chars[pos - spread] = chars[pos];
825        pos++;
826      }
827      spread++;
828    }
829    return new String(chars, 0, pos - spread);
830  }
831
832  /**
833   * Returns a string containing all matching BMP characters of a character sequence, in order. For
834   * example: <pre>   {@code
835   *
836   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
837   *
838   * ... returns {@code "aaa"}.
839   */
840  public String retainFrom(CharSequence sequence) {
841    return negate().removeFrom(sequence);
842  }
843
844  /**
845   * Returns a string copy of the input character sequence, with each matching BMP character
846   * replaced by a given replacement character. For example: <pre>   {@code
847   *
848   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
849   *
850   * ... returns {@code "rodor"}.
851   *
852   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
853   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
854   * character.
855   *
856   * @param sequence the character sequence to replace matching characters in
857   * @param replacement the character to append to the result string in place of each matching
858   *     character in {@code sequence}
859   * @return the new string
860   */
861  public String replaceFrom(CharSequence sequence, char replacement) {
862    String string = sequence.toString();
863    int pos = indexIn(string);
864    if (pos == -1) {
865      return string;
866    }
867    char[] chars = string.toCharArray();
868    chars[pos] = replacement;
869    for (int i = pos + 1; i < chars.length; i++) {
870      if (matches(chars[i])) {
871        chars[i] = replacement;
872      }
873    }
874    return new String(chars);
875  }
876
877  /**
878   * Returns a string copy of the input character sequence, with each matching BMP character
879   * replaced by a given replacement sequence. For example: <pre>   {@code
880   *
881   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
882   *
883   * ... returns {@code "yoohoo"}.
884   *
885   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
886   * off calling {@link #replaceFrom(CharSequence, char)} directly.
887   *
888   * @param sequence the character sequence to replace matching characters in
889   * @param replacement the characters to append to the result string in place of each matching
890   *     character in {@code sequence}
891   * @return the new string
892   */
893  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
894    int replacementLen = replacement.length();
895    if (replacementLen == 0) {
896      return removeFrom(sequence);
897    }
898    if (replacementLen == 1) {
899      return replaceFrom(sequence, replacement.charAt(0));
900    }
901
902    String string = sequence.toString();
903    int pos = indexIn(string);
904    if (pos == -1) {
905      return string;
906    }
907
908    int len = string.length();
909    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
910
911    int oldpos = 0;
912    do {
913      buf.append(string, oldpos, pos);
914      buf.append(replacement);
915      oldpos = pos + 1;
916      pos = indexIn(string, oldpos);
917    } while (pos != -1);
918
919    buf.append(string, oldpos, len);
920    return buf.toString();
921  }
922
923  /**
924   * Returns a substring of the input character sequence that omits all matching BMP characters
925   * from the beginning and from the end of the string. For example: <pre>   {@code
926   *
927   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
928   *
929   * ... returns {@code "cat"}.
930   *
931   * <p>Note that: <pre>   {@code
932   *
933   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
934   *
935   * ... is equivalent to {@link String#trim()}.
936   */
937  public String trimFrom(CharSequence sequence) {
938    int len = sequence.length();
939    int first;
940    int last;
941
942    for (first = 0; first < len; first++) {
943      if (!matches(sequence.charAt(first))) {
944        break;
945      }
946    }
947    for (last = len - 1; last > first; last--) {
948      if (!matches(sequence.charAt(last))) {
949        break;
950      }
951    }
952
953    return sequence.subSequence(first, last + 1).toString();
954  }
955
956  /**
957   * Returns a substring of the input character sequence that omits all matching BMP characters
958   * from the beginning of the string. For example: <pre> {@code
959   *
960   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
961   *
962   * ... returns {@code "catbab"}.
963   */
964  public String trimLeadingFrom(CharSequence sequence) {
965    int len = sequence.length();
966    for (int first = 0; first < len; first++) {
967      if (!matches(sequence.charAt(first))) {
968        return sequence.subSequence(first, len).toString();
969      }
970    }
971    return "";
972  }
973
974  /**
975   * Returns a substring of the input character sequence that omits all matching BMP characters
976   * from the end of the string. For example: <pre> {@code
977   *
978   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
979   *
980   * ... returns {@code "abacat"}.
981   */
982  public String trimTrailingFrom(CharSequence sequence) {
983    int len = sequence.length();
984    for (int last = len - 1; last >= 0; last--) {
985      if (!matches(sequence.charAt(last))) {
986        return sequence.subSequence(0, last + 1).toString();
987      }
988    }
989    return "";
990  }
991
992  /**
993   * Returns a string copy of the input character sequence, with each group of consecutive
994   * matching BMP characters replaced by a single replacement character. For example:
995   * <pre>   {@code
996   *
997   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
998   *
999   * ... returns {@code "b-p-r"}.
1000   *
1001   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1002   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1003   * character.
1004   *
1005   * @param sequence the character sequence to replace matching groups of characters in
1006   * @param replacement the character to append to the result string in place of each group of
1007   *     matching characters in {@code sequence}
1008   * @return the new string
1009   */
1010  public String collapseFrom(CharSequence sequence, char replacement) {
1011    // This implementation avoids unnecessary allocation.
1012    int len = sequence.length();
1013    for (int i = 0; i < len; i++) {
1014      char c = sequence.charAt(i);
1015      if (matches(c)) {
1016        if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1017          // a no-op replacement
1018          i++;
1019        } else {
1020          StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement);
1021          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1022        }
1023      }
1024    }
1025    // no replacement needed
1026    return sequence.toString();
1027  }
1028
1029  /**
1030   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1031   * groups of matching BMP characters at the start or end of the sequence are removed without
1032   * replacement.
1033   */
1034  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1035    // This implementation avoids unnecessary allocation.
1036    int len = sequence.length();
1037    int first = 0;
1038    int last = len - 1;
1039
1040    while (first < len && matches(sequence.charAt(first))) {
1041      first++;
1042    }
1043
1044    while (last > first && matches(sequence.charAt(last))) {
1045      last--;
1046    }
1047
1048    return (first == 0 && last == len - 1)
1049        ? collapseFrom(sequence, replacement)
1050        : finishCollapseFrom(
1051            sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false);
1052  }
1053
1054  private String finishCollapseFrom(
1055      CharSequence sequence,
1056      int start,
1057      int end,
1058      char replacement,
1059      StringBuilder builder,
1060      boolean inMatchingGroup) {
1061    for (int i = start; i < end; i++) {
1062      char c = sequence.charAt(i);
1063      if (matches(c)) {
1064        if (!inMatchingGroup) {
1065          builder.append(replacement);
1066          inMatchingGroup = true;
1067        }
1068      } else {
1069        builder.append(c);
1070        inMatchingGroup = false;
1071      }
1072    }
1073    return builder.toString();
1074  }
1075
1076  /**
1077   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
1078   *     instead.
1079   */
1080  @Deprecated
1081  @Override
1082  public boolean apply(Character character) {
1083    return matches(character);
1084  }
1085
1086  /**
   * Returns a string representation of this {@code CharMatcher}, such as
   * {@code CharMatcher.or(CharMatcher.whitespace(), CharMatcher.javaDigit())}.
1089   */
1090  @Override
1091  public String toString() {
1092    return super.toString();
1093  }
1094
1095  /**
1096   * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB"
1097   * where "12AB" is the four hexadecimal digits representing the 16-bit code unit.
1098   */
1099  private static String showCharacter(char c) {
1100    String hex = "0123456789ABCDEF";
1101    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
1102    for (int i = 0; i < 4; i++) {
1103      tmp[5 - i] = hex.charAt(c & 0xF);
1104      c = (char) (c >> 4);
1105    }
1106    return String.copyValueOf(tmp);
1107  }
1108
1109  // Fast matchers
1110
1111  /** A matcher for which precomputation will not yield any significant benefit. */
1112  abstract static class FastMatcher extends CharMatcher {
1113
1114    @Override
1115    public final CharMatcher precomputed() {
1116      return this;
1117    }
1118
1119    @Override
1120    public CharMatcher negate() {
1121      return new NegatedFastMatcher(this);
1122    }
1123  }
1124
1125  /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */
1126  abstract static class NamedFastMatcher extends FastMatcher {
1127
1128    private final String description;
1129
1130    NamedFastMatcher(String description) {
1131      this.description = checkNotNull(description);
1132    }
1133
1134    @Override
1135    public final String toString() {
1136      return description;
1137    }
1138  }
1139
1140  /** Negation of a {@link FastMatcher}. */
1141  static class NegatedFastMatcher extends Negated {
1142
1143    NegatedFastMatcher(CharMatcher original) {
1144      super(original);
1145    }
1146
1147    @Override
1148    public final CharMatcher precomputed() {
1149      return this;
1150    }
1151  }
1152
1153  /** Fast matcher using a {@link BitSet} table of matching characters. */
1154  @GwtIncompatible // used only from other GwtIncompatible code
1155  private static final class BitSetMatcher extends NamedFastMatcher {
1156
1157    private final BitSet table;
1158
1159    private BitSetMatcher(BitSet table, String description) {
1160      super(description);
1161      if (table.length() + Long.SIZE < table.size()) {
1162        table = (BitSet) table.clone();
1163        // If only we could actually call BitSet.trimToSize() ourselves...
1164      }
1165      this.table = table;
1166    }
1167
1168    @Override
1169    public boolean matches(char c) {
1170      return table.get(c);
1171    }
1172
1173    @Override
1174    void setBits(BitSet bitSet) {
1175      bitSet.or(table);
1176    }
1177  }
1178
1179  // Static constant implementation classes
1180
1181  /** Implementation of {@link #any()}. */
1182  private static final class Any extends NamedFastMatcher {
1183
1184    static final Any INSTANCE = new Any();
1185
1186    private Any() {
1187      super("CharMatcher.any()");
1188    }
1189
1190    @Override
1191    public boolean matches(char c) {
1192      return true;
1193    }
1194
1195    @Override
1196    public int indexIn(CharSequence sequence) {
1197      return (sequence.length() == 0) ? -1 : 0;
1198    }
1199
1200    @Override
1201    public int indexIn(CharSequence sequence, int start) {
1202      int length = sequence.length();
1203      checkPositionIndex(start, length);
1204      return (start == length) ? -1 : start;
1205    }
1206
1207    @Override
1208    public int lastIndexIn(CharSequence sequence) {
1209      return sequence.length() - 1;
1210    }
1211
1212    @Override
1213    public boolean matchesAllOf(CharSequence sequence) {
1214      checkNotNull(sequence);
1215      return true;
1216    }
1217
1218    @Override
1219    public boolean matchesNoneOf(CharSequence sequence) {
1220      return sequence.length() == 0;
1221    }
1222
1223    @Override
1224    public String removeFrom(CharSequence sequence) {
1225      checkNotNull(sequence);
1226      return "";
1227    }
1228
1229    @Override
1230    public String replaceFrom(CharSequence sequence, char replacement) {
1231      char[] array = new char[sequence.length()];
1232      Arrays.fill(array, replacement);
1233      return new String(array);
1234    }
1235
1236    @Override
1237    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1238      StringBuilder result = new StringBuilder(sequence.length() * replacement.length());
1239      for (int i = 0; i < sequence.length(); i++) {
1240        result.append(replacement);
1241      }
1242      return result.toString();
1243    }
1244
1245    @Override
1246    public String collapseFrom(CharSequence sequence, char replacement) {
1247      return (sequence.length() == 0) ? "" : String.valueOf(replacement);
1248    }
1249
1250    @Override
1251    public String trimFrom(CharSequence sequence) {
1252      checkNotNull(sequence);
1253      return "";
1254    }
1255
1256    @Override
1257    public int countIn(CharSequence sequence) {
1258      return sequence.length();
1259    }
1260
1261    @Override
1262    public CharMatcher and(CharMatcher other) {
1263      return checkNotNull(other);
1264    }
1265
1266    @Override
1267    public CharMatcher or(CharMatcher other) {
1268      checkNotNull(other);
1269      return this;
1270    }
1271
1272    @Override
1273    public CharMatcher negate() {
1274      return none();
1275    }
1276  }
1277
1278  /** Implementation of {@link #none()}. */
1279  private static final class None extends NamedFastMatcher {
1280
1281    static final None INSTANCE = new None();
1282
1283    private None() {
1284      super("CharMatcher.none()");
1285    }
1286
1287    @Override
1288    public boolean matches(char c) {
1289      return false;
1290    }
1291
1292    @Override
1293    public int indexIn(CharSequence sequence) {
1294      checkNotNull(sequence);
1295      return -1;
1296    }
1297
1298    @Override
1299    public int indexIn(CharSequence sequence, int start) {
1300      int length = sequence.length();
1301      checkPositionIndex(start, length);
1302      return -1;
1303    }
1304
1305    @Override
1306    public int lastIndexIn(CharSequence sequence) {
1307      checkNotNull(sequence);
1308      return -1;
1309    }
1310
1311    @Override
1312    public boolean matchesAllOf(CharSequence sequence) {
1313      return sequence.length() == 0;
1314    }
1315
1316    @Override
1317    public boolean matchesNoneOf(CharSequence sequence) {
1318      checkNotNull(sequence);
1319      return true;
1320    }
1321
1322    @Override
1323    public String removeFrom(CharSequence sequence) {
1324      return sequence.toString();
1325    }
1326
1327    @Override
1328    public String replaceFrom(CharSequence sequence, char replacement) {
1329      return sequence.toString();
1330    }
1331
1332    @Override
1333    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1334      checkNotNull(replacement);
1335      return sequence.toString();
1336    }
1337
1338    @Override
1339    public String collapseFrom(CharSequence sequence, char replacement) {
1340      return sequence.toString();
1341    }
1342
1343    @Override
1344    public String trimFrom(CharSequence sequence) {
1345      return sequence.toString();
1346    }
1347
1348    @Override
1349    public String trimLeadingFrom(CharSequence sequence) {
1350      return sequence.toString();
1351    }
1352
1353    @Override
1354    public String trimTrailingFrom(CharSequence sequence) {
1355      return sequence.toString();
1356    }
1357
1358    @Override
1359    public int countIn(CharSequence sequence) {
1360      checkNotNull(sequence);
1361      return 0;
1362    }
1363
1364    @Override
1365    public CharMatcher and(CharMatcher other) {
1366      checkNotNull(other);
1367      return this;
1368    }
1369
1370    @Override
1371    public CharMatcher or(CharMatcher other) {
1372      return checkNotNull(other);
1373    }
1374
1375    @Override
1376    public CharMatcher negate() {
1377      return any();
1378    }
1379  }
1380
1381  /** Implementation of {@link #whitespace()}. */
1382  @VisibleForTesting
1383  static final class Whitespace extends NamedFastMatcher {
1384
1385    static final String TABLE =
1386        "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
1387            + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
1388            + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
1389            + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
1390    static final int MULTIPLIER = 1682554634;
1391    static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1);
1392
1393    static final Whitespace INSTANCE = new Whitespace();
1394
1395    Whitespace() {
1396      super("CharMatcher.whitespace()");
1397    }
1398
1399    @Override
1400    public boolean matches(char c) {
1401      return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c;
1402    }
1403
1404    @GwtIncompatible // used only from other GwtIncompatible code
1405    @Override
1406    void setBits(BitSet table) {
1407      for (int i = 0; i < TABLE.length(); i++) {
1408        table.set(TABLE.charAt(i));
1409      }
1410    }
1411  }
1412
1413  /** Implementation of {@link #breakingWhitespace()}. */
1414  private static final class BreakingWhitespace extends CharMatcher {
1415
1416    static final CharMatcher INSTANCE = new BreakingWhitespace();
1417
1418    @Override
1419    public boolean matches(char c) {
1420      switch (c) {
1421        case '\t':
1422        case '\n':
1423        case '\013':
1424        case '\f':
1425        case '\r':
1426        case ' ':
1427        case '\u0085':
1428        case '\u1680':
1429        case '\u2028':
1430        case '\u2029':
1431        case '\u205f':
1432        case '\u3000':
1433          return true;
1434        case '\u2007':
1435          return false;
1436        default:
1437          return c >= '\u2000' && c <= '\u200a';
1438      }
1439    }
1440
1441    @Override
1442    public String toString() {
1443      return "CharMatcher.breakingWhitespace()";
1444    }
1445  }
1446
1447  /** Implementation of {@link #ascii()}. */
1448  private static final class Ascii extends NamedFastMatcher {
1449
1450    static final Ascii INSTANCE = new Ascii();
1451
1452    Ascii() {
1453      super("CharMatcher.ascii()");
1454    }
1455
1456    @Override
1457    public boolean matches(char c) {
1458      return c <= '\u007f';
1459    }
1460  }
1461
1462  /** Implementation that matches characters that fall within multiple ranges. */
1463  private static class RangesMatcher extends CharMatcher {
1464
1465    private final String description;
1466    private final char[] rangeStarts;
1467    private final char[] rangeEnds;
1468
1469    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
1470      this.description = description;
1471      this.rangeStarts = rangeStarts;
1472      this.rangeEnds = rangeEnds;
1473      checkArgument(rangeStarts.length == rangeEnds.length);
1474      for (int i = 0; i < rangeStarts.length; i++) {
1475        checkArgument(rangeStarts[i] <= rangeEnds[i]);
1476        if (i + 1 < rangeStarts.length) {
1477          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
1478        }
1479      }
1480    }
1481
1482    @Override
1483    public boolean matches(char c) {
1484      int index = Arrays.binarySearch(rangeStarts, c);
1485      if (index >= 0) {
1486        return true;
1487      } else {
1488        index = ~index - 1;
1489        return index >= 0 && c <= rangeEnds[index];
1490      }
1491    }
1492
1493    @Override
1494    public String toString() {
1495      return description;
1496    }
1497  }
1498
1499  /** Implementation of {@link #digit()}. */
1500  private static final class Digit extends RangesMatcher {
1501
1502    // Must be in ascending order.
1503    private static final String ZEROES =
1504        "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66"
1505            + "\u0be6\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810"
1506            + "\u1946\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
1507
1508    private static char[] zeroes() {
1509      return ZEROES.toCharArray();
1510    }
1511
1512    private static char[] nines() {
1513      char[] nines = new char[ZEROES.length()];
1514      for (int i = 0; i < ZEROES.length(); i++) {
1515        nines[i] = (char) (ZEROES.charAt(i) + 9);
1516      }
1517      return nines;
1518    }
1519
1520    static final Digit INSTANCE = new Digit();
1521
1522    private Digit() {
1523      super("CharMatcher.digit()", zeroes(), nines());
1524    }
1525  }
1526
1527  /** Implementation of {@link #javaDigit()}. */
1528  private static final class JavaDigit extends CharMatcher {
1529
1530    static final JavaDigit INSTANCE = new JavaDigit();
1531
1532    @Override
1533    public boolean matches(char c) {
1534      return Character.isDigit(c);
1535    }
1536
1537    @Override
1538    public String toString() {
1539      return "CharMatcher.javaDigit()";
1540    }
1541  }
1542
1543  /** Implementation of {@link #javaLetter()}. */
1544  private static final class JavaLetter extends CharMatcher {
1545
1546    static final JavaLetter INSTANCE = new JavaLetter();
1547
1548    @Override
1549    public boolean matches(char c) {
1550      return Character.isLetter(c);
1551    }
1552
1553    @Override
1554    public String toString() {
1555      return "CharMatcher.javaLetter()";
1556    }
1557  }
1558
1559  /** Implementation of {@link #javaLetterOrDigit()}. */
1560  private static final class JavaLetterOrDigit extends CharMatcher {
1561
1562    static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit();
1563
1564    @Override
1565    public boolean matches(char c) {
1566      return Character.isLetterOrDigit(c);
1567    }
1568
1569    @Override
1570    public String toString() {
1571      return "CharMatcher.javaLetterOrDigit()";
1572    }
1573  }
1574
1575  /** Implementation of {@link #javaUpperCase()}. */
1576  private static final class JavaUpperCase extends CharMatcher {
1577
1578    static final JavaUpperCase INSTANCE = new JavaUpperCase();
1579
1580    @Override
1581    public boolean matches(char c) {
1582      return Character.isUpperCase(c);
1583    }
1584
1585    @Override
1586    public String toString() {
1587      return "CharMatcher.javaUpperCase()";
1588    }
1589  }
1590
1591  /** Implementation of {@link #javaLowerCase()}. */
1592  private static final class JavaLowerCase extends CharMatcher {
1593
1594    static final JavaLowerCase INSTANCE = new JavaLowerCase();
1595
1596    @Override
1597    public boolean matches(char c) {
1598      return Character.isLowerCase(c);
1599    }
1600
1601    @Override
1602    public String toString() {
1603      return "CharMatcher.javaLowerCase()";
1604    }
1605  }
1606
1607  /** Implementation of {@link #javaIsoControl()}. */
1608  private static final class JavaIsoControl extends NamedFastMatcher {
1609
1610    static final JavaIsoControl INSTANCE = new JavaIsoControl();
1611
1612    private JavaIsoControl() {
1613      super("CharMatcher.javaIsoControl()");
1614    }
1615
1616    @Override
1617    public boolean matches(char c) {
1618      return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f');
1619    }
1620  }
1621
1622  /** Implementation of {@link #invisible()}. */
1623  private static final class Invisible extends RangesMatcher {
1624
1625    private static final String RANGE_STARTS =
1626        "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u2066\u2067"
1627            + "\u2068\u2069\u206a\u3000\ud800\ufeff\ufff9\ufffa";
1628    private static final String RANGE_ENDS =
1629        "\u0020\u00a0\u00ad\u0604\u061c\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u2066\u2067"
1630            + "\u2068\u2069\u206f\u3000\uf8ff\ufeff\ufff9\ufffb";
1631
1632    static final Invisible INSTANCE = new Invisible();
1633
1634    private Invisible() {
1635      super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray());
1636    }
1637  }
1638
1639  /** Implementation of {@link #singleWidth()}. */
1640  private static final class SingleWidth extends RangesMatcher {
1641
1642    static final SingleWidth INSTANCE = new SingleWidth();
1643
1644    private SingleWidth() {
1645      super(
1646          "CharMatcher.singleWidth()",
1647          "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
1648          "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
1649    }
1650  }
1651
1652  // Non-static factory implementation classes
1653
1654  /** Implementation of {@link #negate()}. */
1655  private static class Negated extends CharMatcher {
1656
1657    final CharMatcher original;
1658
1659    Negated(CharMatcher original) {
1660      this.original = checkNotNull(original);
1661    }
1662
1663    @Override
1664    public boolean matches(char c) {
1665      return !original.matches(c);
1666    }
1667
1668    @Override
1669    public boolean matchesAllOf(CharSequence sequence) {
1670      return original.matchesNoneOf(sequence);
1671    }
1672
1673    @Override
1674    public boolean matchesNoneOf(CharSequence sequence) {
1675      return original.matchesAllOf(sequence);
1676    }
1677
1678    @Override
1679    public int countIn(CharSequence sequence) {
1680      return sequence.length() - original.countIn(sequence);
1681    }
1682
1683    @GwtIncompatible // used only from other GwtIncompatible code
1684    @Override
1685    void setBits(BitSet table) {
1686      BitSet tmp = new BitSet();
1687      original.setBits(tmp);
1688      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
1689      table.or(tmp);
1690    }
1691
1692    @Override
1693    public CharMatcher negate() {
1694      return original;
1695    }
1696
1697    @Override
1698    public String toString() {
1699      return original + ".negate()";
1700    }
1701  }
1702
1703  /** Implementation of {@link #and(CharMatcher)}. */
1704  private static final class And extends CharMatcher {
1705
1706    final CharMatcher first;
1707    final CharMatcher second;
1708
1709    And(CharMatcher a, CharMatcher b) {
1710      first = checkNotNull(a);
1711      second = checkNotNull(b);
1712    }
1713
1714    @Override
1715    public boolean matches(char c) {
1716      return first.matches(c) && second.matches(c);
1717    }
1718
1719    @GwtIncompatible // used only from other GwtIncompatible code
1720    @Override
1721    void setBits(BitSet table) {
1722      BitSet tmp1 = new BitSet();
1723      first.setBits(tmp1);
1724      BitSet tmp2 = new BitSet();
1725      second.setBits(tmp2);
1726      tmp1.and(tmp2);
1727      table.or(tmp1);
1728    }
1729
1730    @Override
1731    public String toString() {
1732      return "CharMatcher.and(" + first + ", " + second + ")";
1733    }
1734  }
1735
1736  /** Implementation of {@link #or(CharMatcher)}. */
1737  private static final class Or extends CharMatcher {
1738
1739    final CharMatcher first;
1740    final CharMatcher second;
1741
1742    Or(CharMatcher a, CharMatcher b) {
1743      first = checkNotNull(a);
1744      second = checkNotNull(b);
1745    }
1746
1747    @GwtIncompatible // used only from other GwtIncompatible code
1748    @Override
1749    void setBits(BitSet table) {
1750      first.setBits(table);
1751      second.setBits(table);
1752    }
1753
1754    @Override
1755    public boolean matches(char c) {
1756      return first.matches(c) || second.matches(c);
1757    }
1758
1759    @Override
1760    public String toString() {
1761      return "CharMatcher.or(" + first + ", " + second + ")";
1762    }
1763  }
1764
1765  // Static factory implementations
1766
1767  /** Implementation of {@link #is(char)}. */
1768  private static final class Is extends FastMatcher {
1769
1770    private final char match;
1771
1772    Is(char match) {
1773      this.match = match;
1774    }
1775
1776    @Override
1777    public boolean matches(char c) {
1778      return c == match;
1779    }
1780
1781    @Override
1782    public String replaceFrom(CharSequence sequence, char replacement) {
1783      return sequence.toString().replace(match, replacement);
1784    }
1785
1786    @Override
1787    public CharMatcher and(CharMatcher other) {
1788      return other.matches(match) ? this : none();
1789    }
1790
1791    @Override
1792    public CharMatcher or(CharMatcher other) {
1793      return other.matches(match) ? other : super.or(other);
1794    }
1795
1796    @Override
1797    public CharMatcher negate() {
1798      return isNot(match);
1799    }
1800
1801    @GwtIncompatible // used only from other GwtIncompatible code
1802    @Override
1803    void setBits(BitSet table) {
1804      table.set(match);
1805    }
1806
1807    @Override
1808    public String toString() {
1809      return "CharMatcher.is('" + showCharacter(match) + "')";
1810    }
1811  }
1812
1813  /** Implementation of {@link #isNot(char)}. */
1814  private static final class IsNot extends FastMatcher {
1815
1816    private final char match;
1817
1818    IsNot(char match) {
1819      this.match = match;
1820    }
1821
1822    @Override
1823    public boolean matches(char c) {
1824      return c != match;
1825    }
1826
1827    @Override
1828    public CharMatcher and(CharMatcher other) {
1829      return other.matches(match) ? super.and(other) : other;
1830    }
1831
1832    @Override
1833    public CharMatcher or(CharMatcher other) {
1834      return other.matches(match) ? any() : this;
1835    }
1836
1837    @GwtIncompatible // used only from other GwtIncompatible code
1838    @Override
1839    void setBits(BitSet table) {
1840      table.set(0, match);
1841      table.set(match + 1, Character.MAX_VALUE + 1);
1842    }
1843
1844    @Override
1845    public CharMatcher negate() {
1846      return is(match);
1847    }
1848
1849    @Override
1850    public String toString() {
1851      return "CharMatcher.isNot('" + showCharacter(match) + "')";
1852    }
1853  }
1854
1855  private static CharMatcher.IsEither isEither(char c1, char c2) {
1856    return new CharMatcher.IsEither(c1, c2);
1857  }
1858
1859  /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */
1860  private static final class IsEither extends FastMatcher {
1861
1862    private final char match1;
1863    private final char match2;
1864
1865    IsEither(char match1, char match2) {
1866      this.match1 = match1;
1867      this.match2 = match2;
1868    }
1869
1870    @Override
1871    public boolean matches(char c) {
1872      return c == match1 || c == match2;
1873    }
1874
1875    @GwtIncompatible // used only from other GwtIncompatible code
1876    @Override
1877    void setBits(BitSet table) {
1878      table.set(match1);
1879      table.set(match2);
1880    }
1881
1882    @Override
1883    public String toString() {
1884      return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")";
1885    }
1886  }
1887
1888  /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */
1889  private static final class AnyOf extends CharMatcher {
1890
1891    private final char[] chars;
1892
1893    public AnyOf(CharSequence chars) {
1894      this.chars = chars.toString().toCharArray();
1895      Arrays.sort(this.chars);
1896    }
1897
1898    @Override
1899    public boolean matches(char c) {
1900      return Arrays.binarySearch(chars, c) >= 0;
1901    }
1902
1903    @Override
1904    @GwtIncompatible // used only from other GwtIncompatible code
1905    void setBits(BitSet table) {
1906      for (char c : chars) {
1907        table.set(c);
1908      }
1909    }
1910
1911    @Override
1912    public String toString() {
1913      StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
1914      for (char c : chars) {
1915        description.append(showCharacter(c));
1916      }
1917      description.append("\")");
1918      return description.toString();
1919    }
1920  }
1921
1922  /** Implementation of {@link #inRange(char, char)}. */
1923  private static final class InRange extends FastMatcher {
1924
1925    private final char startInclusive;
1926    private final char endInclusive;
1927
1928    InRange(char startInclusive, char endInclusive) {
1929      checkArgument(endInclusive >= startInclusive);
1930      this.startInclusive = startInclusive;
1931      this.endInclusive = endInclusive;
1932    }
1933
1934    @Override
1935    public boolean matches(char c) {
1936      return startInclusive <= c && c <= endInclusive;
1937    }
1938
1939    @GwtIncompatible // used only from other GwtIncompatible code
1940    @Override
1941    void setBits(BitSet table) {
1942      table.set(startInclusive, endInclusive + 1);
1943    }
1944
1945    @Override
1946    public String toString() {
1947      return "CharMatcher.inRange('"
1948          + showCharacter(startInclusive)
1949          + "', '"
1950          + showCharacter(endInclusive)
1951          + "')";
1952    }
1953  }
1954
1955  /** Implementation of {@link #forPredicate(Predicate)}. */
1956  private static final class ForPredicate extends CharMatcher {
1957
1958    private final Predicate<? super Character> predicate;
1959
1960    ForPredicate(Predicate<? super Character> predicate) {
1961      this.predicate = checkNotNull(predicate);
1962    }
1963
1964    @Override
1965    public boolean matches(char c) {
1966      return predicate.apply(c);
1967    }
1968
1969    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
1970    @Override
1971    public boolean apply(Character character) {
1972      return predicate.apply(checkNotNull(character));
1973    }
1974
1975    @Override
1976    public String toString() {
1977      return "CharMatcher.forPredicate(" + predicate + ")";
1978    }
1979  }
1980}