001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.Arrays;
027import java.util.BitSet;
028
029import javax.annotation.CheckReturnValue;
030
031/**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * <p>See the Guava User Guide article on <a href=
049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050 * {@code CharMatcher}</a>.
051 *
052 * @author Kevin Bourrillion
053 * @since 1.0
054 */
055@Beta // Possibly change from chars to code points; decide constants vs. methods
056@GwtCompatible(emulated = true)
057public abstract class CharMatcher implements Predicate<Character> {
058  // Constants
059  /**
060   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
061   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
062   * discussion of that term.
063   *
064   * @since 2.0
065   */
066  public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() {
067    @Override
068    public boolean matches(char c) {
069      switch (c) {
070        case '\t':
071        case '\n':
072        case '\013':
073        case '\f':
074        case '\r':
075        case ' ':
076        case '\u0085':
077        case '\u1680':
078        case '\u2028':
079        case '\u2029':
080        case '\u205f':
081        case '\u3000':
082          return true;
083        case '\u2007':
084          return false;
085        default:
086          return c >= '\u2000' && c <= '\u200a';
087      }
088    }
089
090    @Override
091    public String toString() {
092      return "CharMatcher.BREAKING_WHITESPACE";
093    }
094  };
095
096  /**
097   * Determines whether a character is ASCII, meaning that its code point is less than 128.
098   */
099  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
100
101  private static class RangesMatcher extends CharMatcher {
102    private final char[] rangeStarts;
103    private final char[] rangeEnds;
104
105    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
106      super(description);
107      this.rangeStarts = rangeStarts;
108      this.rangeEnds = rangeEnds;
109      checkArgument(rangeStarts.length == rangeEnds.length);
110      for (int i = 0; i < rangeStarts.length; i++) {
111        checkArgument(rangeStarts[i] <= rangeEnds[i]);
112        if (i + 1 < rangeStarts.length) {
113          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
114        }
115      }
116    }
117
118    @Override
119    public boolean matches(char c) {
120      int index = Arrays.binarySearch(rangeStarts, c);
121      if (index >= 0) {
122        return true;
123      } else {
124        index = ~index - 1;
125        return index >= 0 && c <= rangeEnds[index];
126      }
127    }
128  }
129
130  // Must be in ascending order.
131  private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6"
132      + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0"
133      + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
134
135  private static final String NINES;
136  static {
137    StringBuilder builder = new StringBuilder(ZEROES.length());
138    for (int i = 0; i < ZEROES.length(); i++) {
139      builder.append((char) (ZEROES.charAt(i) + 9));
140    }
141    NINES = builder.toString();
142  }
143
144  /**
145   * Determines whether a character is a digit according to
146   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
147   * If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
148   */
149  public static final CharMatcher DIGIT = new RangesMatcher(
150      "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray());
151
152  /**
153   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
154   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
155   */
156  public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
157    @Override public boolean matches(char c) {
158      return Character.isDigit(c);
159    }
160  };
161
162  /**
163   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
164   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
165   * inRange('a', 'z').or(inRange('A', 'Z'))}.
166   */
167  public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
168    @Override public boolean matches(char c) {
169      return Character.isLetter(c);
170    }
171  };
172
173  /**
174   * Determines whether a character is a letter or digit according to {@link
175   * Character#isLetterOrDigit(char) Java's definition}.
176   */
177  public static final CharMatcher JAVA_LETTER_OR_DIGIT =
178      new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
179    @Override public boolean matches(char c) {
180      return Character.isLetterOrDigit(c);
181    }
182  };
183
184  /**
185   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
186   * Java's definition}.
187   */
188  public static final CharMatcher JAVA_UPPER_CASE =
189      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
190    @Override public boolean matches(char c) {
191      return Character.isUpperCase(c);
192    }
193  };
194
195  /**
196   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
197   * Java's definition}.
198   */
199  public static final CharMatcher JAVA_LOWER_CASE =
200      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
201    @Override public boolean matches(char c) {
202      return Character.isLowerCase(c);
203    }
204  };
205
206  /**
207   * Determines whether a character is an ISO control character as specified by {@link
208   * Character#isISOControl(char)}.
209   */
210  public static final CharMatcher JAVA_ISO_CONTROL =
211      inRange('\u0000', '\u001f')
212      .or(inRange('\u007f', '\u009f'))
213      .withToString("CharMatcher.JAVA_ISO_CONTROL");
214
215  /**
216   * Determines whether a character is invisible; that is, if its Unicode category is any of
217   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
218   * PRIVATE_USE according to ICU4J.
219   */
220  public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", (
221      "\u0000\u007f\u00ad\u0600\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u206a\u3000\ud800\ufeff"
222      + "\ufff9\ufffa").toCharArray(), (
223      "\u0020\u00a0\u00ad\u0604\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u206f\u3000\uf8ff\ufeff"
224      + "\ufff9\ufffb").toCharArray());
225
226  private static String showCharacter(char c) {
227    String hex = "0123456789ABCDEF";
228    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
229    for (int i = 0; i < 4; i++) {
230      tmp[5 - i] = hex.charAt(c & 0xF);
231      c >>= 4;
232    }
233    return String.copyValueOf(tmp);
234
235  }
236
237  /**
238   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
239   * errs on the side of returning {@code false} (that is, it tends to assume a character is
240   * double-width).
241   *
242   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
243   * date.
244   */
245  public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH",
246      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
247      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
248
249  /** Matches any character. */
250  public static final CharMatcher ANY =
251      new FastMatcher("CharMatcher.ANY") {
252        @Override public boolean matches(char c) {
253          return true;
254        }
255
256        @Override public int indexIn(CharSequence sequence) {
257          return (sequence.length() == 0) ? -1 : 0;
258        }
259
260        @Override public int indexIn(CharSequence sequence, int start) {
261          int length = sequence.length();
262          Preconditions.checkPositionIndex(start, length);
263          return (start == length) ? -1 : start;
264        }
265
266        @Override public int lastIndexIn(CharSequence sequence) {
267          return sequence.length() - 1;
268        }
269
270        @Override public boolean matchesAllOf(CharSequence sequence) {
271          checkNotNull(sequence);
272          return true;
273        }
274
275        @Override public boolean matchesNoneOf(CharSequence sequence) {
276          return sequence.length() == 0;
277        }
278
279        @Override public String removeFrom(CharSequence sequence) {
280          checkNotNull(sequence);
281          return "";
282        }
283
284        @Override public String replaceFrom(CharSequence sequence, char replacement) {
285          char[] array = new char[sequence.length()];
286          Arrays.fill(array, replacement);
287          return new String(array);
288        }
289
290        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
291          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
292          for (int i = 0; i < sequence.length(); i++) {
293            retval.append(replacement);
294          }
295          return retval.toString();
296        }
297
298        @Override public String collapseFrom(CharSequence sequence, char replacement) {
299          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
300        }
301
302        @Override public String trimFrom(CharSequence sequence) {
303          checkNotNull(sequence);
304          return "";
305        }
306
307        @Override public int countIn(CharSequence sequence) {
308          return sequence.length();
309        }
310
311        @Override public CharMatcher and(CharMatcher other) {
312          return checkNotNull(other);
313        }
314
315        @Override public CharMatcher or(CharMatcher other) {
316          checkNotNull(other);
317          return this;
318        }
319
320        @Override public CharMatcher negate() {
321          return NONE;
322        }
323      };
324
325  /** Matches no characters. */
326  public static final CharMatcher NONE =
327      new FastMatcher("CharMatcher.NONE") {
328        @Override public boolean matches(char c) {
329          return false;
330        }
331
332        @Override public int indexIn(CharSequence sequence) {
333          checkNotNull(sequence);
334          return -1;
335        }
336
337        @Override public int indexIn(CharSequence sequence, int start) {
338          int length = sequence.length();
339          Preconditions.checkPositionIndex(start, length);
340          return -1;
341        }
342
343        @Override public int lastIndexIn(CharSequence sequence) {
344          checkNotNull(sequence);
345          return -1;
346        }
347
348        @Override public boolean matchesAllOf(CharSequence sequence) {
349          return sequence.length() == 0;
350        }
351
352        @Override public boolean matchesNoneOf(CharSequence sequence) {
353          checkNotNull(sequence);
354          return true;
355        }
356
357        @Override public String removeFrom(CharSequence sequence) {
358          return sequence.toString();
359        }
360
361        @Override public String replaceFrom(CharSequence sequence, char replacement) {
362          return sequence.toString();
363        }
364
365        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
366          checkNotNull(replacement);
367          return sequence.toString();
368        }
369
370        @Override public String collapseFrom(CharSequence sequence, char replacement) {
371          return sequence.toString();
372        }
373
374        @Override public String trimFrom(CharSequence sequence) {
375          return sequence.toString();
376        }
377
378        @Override
379        public String trimLeadingFrom(CharSequence sequence) {
380          return sequence.toString();
381        }
382
383        @Override
384        public String trimTrailingFrom(CharSequence sequence) {
385          return sequence.toString();
386        }
387
388        @Override public int countIn(CharSequence sequence) {
389          checkNotNull(sequence);
390          return 0;
391        }
392
393        @Override public CharMatcher and(CharMatcher other) {
394          checkNotNull(other);
395          return this;
396        }
397
398        @Override public CharMatcher or(CharMatcher other) {
399          return checkNotNull(other);
400        }
401
402        @Override public CharMatcher negate() {
403          return ANY;
404        }
405      };
406
407  // Static factories
408
409  /**
410   * Returns a {@code char} matcher that matches only one specified character.
411   */
412  public static CharMatcher is(final char match) {
413    String description = "CharMatcher.is('" + showCharacter(match) + "')";
414    return new FastMatcher(description) {
415      @Override public boolean matches(char c) {
416        return c == match;
417      }
418
419      @Override public String replaceFrom(CharSequence sequence, char replacement) {
420        return sequence.toString().replace(match, replacement);
421      }
422
423      @Override public CharMatcher and(CharMatcher other) {
424        return other.matches(match) ? this : NONE;
425      }
426
427      @Override public CharMatcher or(CharMatcher other) {
428        return other.matches(match) ? other : super.or(other);
429      }
430
431      @Override public CharMatcher negate() {
432        return isNot(match);
433      }
434
435      @GwtIncompatible("java.util.BitSet")
436      @Override
437      void setBits(BitSet table) {
438        table.set(match);
439      }
440    };
441  }
442
443  /**
444   * Returns a {@code char} matcher that matches any character except the one specified.
445   *
446   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
447   */
448  public static CharMatcher isNot(final char match) {
449    String description = "CharMatcher.isNot('" + showCharacter(match) + "')";
450    return new FastMatcher(description) {
451      @Override public boolean matches(char c) {
452        return c != match;
453      }
454
455      @Override public CharMatcher and(CharMatcher other) {
456        return other.matches(match) ? super.and(other) : other;
457      }
458
459      @Override public CharMatcher or(CharMatcher other) {
460        return other.matches(match) ? ANY : this;
461      }
462
463      @GwtIncompatible("java.util.BitSet")
464      @Override
465      void setBits(BitSet table) {
466        table.set(0, match);
467        table.set(match + 1, Character.MAX_VALUE + 1);
468      }
469
470      @Override public CharMatcher negate() {
471        return is(match);
472      }
473    };
474  }
475
476  /**
477   * Returns a {@code char} matcher that matches any character present in the given character
478   * sequence.
479   */
480  public static CharMatcher anyOf(final CharSequence sequence) {
481    switch (sequence.length()) {
482      case 0:
483        return NONE;
484      case 1:
485        return is(sequence.charAt(0));
486      case 2:
487        return isEither(sequence.charAt(0), sequence.charAt(1));
488      default:
489        // continue below to handle the general case
490    }
491    // TODO(user): is it potentially worth just going ahead and building a precomputed matcher?
492    final char[] chars = sequence.toString().toCharArray();
493    Arrays.sort(chars);
494    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
495    for (char c : chars) {
496      description.append(showCharacter(c));
497    }
498    description.append("\")");
499    return new CharMatcher(description.toString()) {
500      @Override public boolean matches(char c) {
501        return Arrays.binarySearch(chars, c) >= 0;
502      }
503
504      @Override
505      @GwtIncompatible("java.util.BitSet")
506      void setBits(BitSet table) {
507        for (char c : chars) {
508          table.set(c);
509        }
510      }
511    };
512  }
513
514  private static CharMatcher isEither(
515      final char match1,
516      final char match2) {
517    String description = "CharMatcher.anyOf(\"" +
518        showCharacter(match1) + showCharacter(match2) + "\")";
519    return new FastMatcher(description) {
520      @Override public boolean matches(char c) {
521        return c == match1 || c == match2;
522      }
523
524      @GwtIncompatible("java.util.BitSet")
525      @Override void setBits(BitSet table) {
526        table.set(match1);
527        table.set(match2);
528      }
529    };
530  }
531
532  /**
533   * Returns a {@code char} matcher that matches any character not present in the given character
534   * sequence.
535   */
536  public static CharMatcher noneOf(CharSequence sequence) {
537    return anyOf(sequence).negate();
538  }
539
540  /**
541   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
542   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
543   * CharMatcher.inRange('a', 'z')}.
544   *
545   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
546   */
547  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
548    checkArgument(endInclusive >= startInclusive);
549    String description = "CharMatcher.inRange('" +
550        showCharacter(startInclusive) + "', '" +
551        showCharacter(endInclusive) + "')";
552    return inRange(startInclusive, endInclusive, description);
553  }
554
555  static CharMatcher inRange(final char startInclusive, final char endInclusive,
556      String description) {
557    return new FastMatcher(description) {
558      @Override public boolean matches(char c) {
559        return startInclusive <= c && c <= endInclusive;
560      }
561
562      @GwtIncompatible("java.util.BitSet")
563      @Override void setBits(BitSet table) {
564        table.set(startInclusive, endInclusive + 1);
565      }
566    };
567  }
568
569  /**
570   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
571   * which operates on primitive {@code char} instances instead.
572   */
573  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
574    checkNotNull(predicate);
575    if (predicate instanceof CharMatcher) {
576      return (CharMatcher) predicate;
577    }
578    String description = "CharMatcher.forPredicate(" + predicate + ")";
579    return new CharMatcher(description) {
580      @Override public boolean matches(char c) {
581        return predicate.apply(c);
582      }
583
584      @Override public boolean apply(Character character) {
585        return predicate.apply(checkNotNull(character));
586      }
587    };
588  }
589
590  // State
591  final String description;
592
593  // Constructors
594
595  /**
596   * Sets the {@code toString()} from the given description.
597   */
598  CharMatcher(String description) {
599    this.description = description;
600  }
601
602  /**
603   * Constructor for use by subclasses. When subclassing, you may want to override
604   * {@code toString()} to provide a useful description.
605   */
606  protected CharMatcher() {
607    description = super.toString();
608  }
609
610  // Abstract methods
611
612  /** Determines a true or false value for the given character. */
613  public abstract boolean matches(char c);
614
615  // Non-static factories
616
617  /**
618   * Returns a matcher that matches any character not matched by this matcher.
619   */
620  public CharMatcher negate() {
621    return new NegatedMatcher(this);
622  }
623
624  private static class NegatedMatcher extends CharMatcher {
625    final CharMatcher original;
626
627    NegatedMatcher(String toString, CharMatcher original) {
628      super(toString);
629      this.original = original;
630    }
631
632    NegatedMatcher(CharMatcher original) {
633      this(original + ".negate()", original);
634    }
635
636    @Override public boolean matches(char c) {
637      return !original.matches(c);
638    }
639
640    @Override public boolean matchesAllOf(CharSequence sequence) {
641      return original.matchesNoneOf(sequence);
642    }
643
644    @Override public boolean matchesNoneOf(CharSequence sequence) {
645      return original.matchesAllOf(sequence);
646    }
647
648    @Override public int countIn(CharSequence sequence) {
649      return sequence.length() - original.countIn(sequence);
650    }
651
652    @GwtIncompatible("java.util.BitSet")
653    @Override
654    void setBits(BitSet table) {
655      BitSet tmp = new BitSet();
656      original.setBits(tmp);
657      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
658      table.or(tmp);
659    }
660
661    @Override public CharMatcher negate() {
662      return original;
663    }
664
665    @Override
666    CharMatcher withToString(String description) {
667      return new NegatedMatcher(description, original);
668    }
669  }
670
671  /**
672   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
673   */
674  public CharMatcher and(CharMatcher other) {
675    return new And(this, checkNotNull(other));
676  }
677
678  private static class And extends CharMatcher {
679    final CharMatcher first;
680    final CharMatcher second;
681
682    And(CharMatcher a, CharMatcher b) {
683      this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
684    }
685
686    And(CharMatcher a, CharMatcher b, String description) {
687      super(description);
688      first = checkNotNull(a);
689      second = checkNotNull(b);
690    }
691
692    @Override
693    public boolean matches(char c) {
694      return first.matches(c) && second.matches(c);
695    }
696
697    @GwtIncompatible("java.util.BitSet")
698    @Override
699    void setBits(BitSet table) {
700      BitSet tmp1 = new BitSet();
701      first.setBits(tmp1);
702      BitSet tmp2 = new BitSet();
703      second.setBits(tmp2);
704      tmp1.and(tmp2);
705      table.or(tmp1);
706    }
707
708    @Override
709    CharMatcher withToString(String description) {
710      return new And(first, second, description);
711    }
712  }
713
714  /**
715   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
716   */
717  public CharMatcher or(CharMatcher other) {
718    return new Or(this, checkNotNull(other));
719  }
720
721  private static class Or extends CharMatcher {
722    final CharMatcher first;
723    final CharMatcher second;
724
725    Or(CharMatcher a, CharMatcher b, String description) {
726      super(description);
727      first = checkNotNull(a);
728      second = checkNotNull(b);
729    }
730
731    Or(CharMatcher a, CharMatcher b) {
732      this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
733    }
734
735    @GwtIncompatible("java.util.BitSet")
736    @Override
737    void setBits(BitSet table) {
738      first.setBits(table);
739      second.setBits(table);
740    }
741
742    @Override
743    public boolean matches(char c) {
744      return first.matches(c) || second.matches(c);
745    }
746
747    @Override
748    CharMatcher withToString(String description) {
749      return new Or(first, second, description);
750    }
751  }
752
753  /**
754   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
755   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
756   * worthwhile only if the precomputed matcher is queried many thousands of times.
757   *
758   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
759   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
760   * worthwhile tradeoff in a browser.
761   */
762  public CharMatcher precomputed() {
763    return Platform.precomputeCharMatcher(this);
764  }
765
766  /**
767   * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
768   * but with their {@code toString} method overridden with the new description.
769   *
770   * <p>This is unsupported by default.
771   */
772  CharMatcher withToString(String description) {
773    throw new UnsupportedOperationException();
774  }
775
776  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
777
778  /**
779   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a
780   * method on {@link Platform} so that we can have different behavior in GWT.
781   *
782   * <p>This implementation tries to be smart in a number of ways.  It recognizes cases where
783   * the negation is cheaper to precompute than the matcher itself; it tries to build small
784   * hash tables for matchers that only match a few characters, and so on.  In the worst-case
785   * scenario, it constructs an eight-kilobyte bit array and queries that.
786   * In many situations this produces a matcher which is faster to query than the original.
787   */
788  @GwtIncompatible("java.util.BitSet")
789  CharMatcher precomputedInternal() {
790    final BitSet table = new BitSet();
791    setBits(table);
792    int totalCharacters = table.cardinality();
793    if (totalCharacters * 2 <= DISTINCT_CHARS) {
794      return precomputedPositive(totalCharacters, table, description);
795    } else {
796      // TODO(user): is it worth it to worry about the last character of large matchers?
797      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
798      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
799      String suffix = ".negate()";
800      String negatedDescription = description.endsWith(suffix)
801          ? description.substring(0, description.length() - suffix.length())
802          : description + suffix;
803      return new NegatedFastMatcher(toString(),
804          precomputedPositive(negatedCharacters, table, negatedDescription));
805    }
806  }
807
808  /**
809   * A matcher for which precomputation will not yield any significant benefit.
810   */
811  abstract static class FastMatcher extends CharMatcher {
812    FastMatcher() {
813      super();
814    }
815
816    FastMatcher(String description) {
817      super(description);
818    }
819
820    @Override
821    public final CharMatcher precomputed() {
822      return this;
823    }
824
825    @Override
826    public CharMatcher negate() {
827      return new NegatedFastMatcher(this);
828    }
829  }
830
831  static final class NegatedFastMatcher extends NegatedMatcher {
832    NegatedFastMatcher(CharMatcher original) {
833      super(original);
834    }
835
836    NegatedFastMatcher(String toString, CharMatcher original) {
837      super(toString, original);
838    }
839
840    @Override
841    public final CharMatcher precomputed() {
842      return this;
843    }
844
845    @Override
846    CharMatcher withToString(String description) {
847      return new NegatedFastMatcher(description, original);
848    }
849  }
850
851  /**
852   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
853   */
854  @GwtIncompatible("java.util.BitSet")
855  private static CharMatcher precomputedPositive(
856      int totalCharacters,
857      BitSet table,
858      String description) {
859    switch (totalCharacters) {
860      case 0:
861        return NONE;
862      case 1:
863        return is((char) table.nextSetBit(0));
864      case 2:
865        char c1 = (char) table.nextSetBit(0);
866        char c2 = (char) table.nextSetBit(c1 + 1);
867        return isEither(c1, c2);
868      default:
869        return isSmall(totalCharacters, table.length())
870            ? SmallCharMatcher.from(table, description)
871            : new BitSetMatcher(table, description);
872    }
873  }
874
875  @GwtIncompatible("SmallCharMatcher")
876  private static boolean isSmall(int totalCharacters, int tableLength) {
877    return totalCharacters <= SmallCharMatcher.MAX_SIZE
878        && tableLength > (totalCharacters * 4 * Character.SIZE);
879        // err on the side of BitSetMatcher
880  }
881
882  @GwtIncompatible("java.util.BitSet")
883  private static class BitSetMatcher extends FastMatcher {
884    private final BitSet table;
885
886    private BitSetMatcher(BitSet table, String description) {
887      super(description);
888      if (table.length() + Long.SIZE < table.size()) {
889        table = (BitSet) table.clone();
890        // If only we could actually call BitSet.trimToSize() ourselves...
891      }
892      this.table = table;
893    }
894
895    @Override public boolean matches(char c) {
896      return table.get(c);
897    }
898
899    @Override
900    void setBits(BitSet bitSet) {
901      bitSet.or(table);
902    }
903  }
904
905  /**
906   * Sets bits in {@code table} matched by this matcher.
907   */
908  @GwtIncompatible("java.util.BitSet")
909  void setBits(BitSet table) {
910    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
911      if (matches((char) c)) {
912        table.set(c);
913      }
914    }
915  }
916
917  // Text processing routines
918
919  /**
920   * Returns {@code true} if a character sequence contains at least one matching character.
921   * Equivalent to {@code !matchesNoneOf(sequence)}.
922   *
923   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
924   * character, until this returns {@code true} or the end is reached.
925   *
926   * @param sequence the character sequence to examine, possibly empty
927   * @return {@code true} if this matcher matches at least one character in the sequence
928   * @since 8.0
929   */
930  public boolean matchesAnyOf(CharSequence sequence) {
931    return !matchesNoneOf(sequence);
932  }
933
934  /**
935   * Returns {@code true} if a character sequence contains only matching characters.
936   *
937   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
938   * character, until this returns {@code false} or the end is reached.
939   *
940   * @param sequence the character sequence to examine, possibly empty
941   * @return {@code true} if this matcher matches every character in the sequence, including when
942   *         the sequence is empty
943   */
944  public boolean matchesAllOf(CharSequence sequence) {
945    for (int i = sequence.length() - 1; i >= 0; i--) {
946      if (!matches(sequence.charAt(i))) {
947        return false;
948      }
949    }
950    return true;
951  }
952
953  /**
954   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
955   * {@code !matchesAnyOf(sequence)}.
956   *
957   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
958   * character, until this returns {@code false} or the end is reached.
959   *
960   * @param sequence the character sequence to examine, possibly empty
961   * @return {@code true} if this matcher matches every character in the sequence, including when
962   *         the sequence is empty
963   */
964  public boolean matchesNoneOf(CharSequence sequence) {
965    return indexIn(sequence) == -1;
966  }
967
968  /**
969   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
970   * matching character is present.
971   *
972   * <p>The default implementation iterates over the sequence in forward order calling {@link
973   * #matches} for each character.
974   *
975   * @param sequence the character sequence to examine from the beginning
976   * @return an index, or {@code -1} if no character matches
977   */
978  public int indexIn(CharSequence sequence) {
979    int length = sequence.length();
980    for (int i = 0; i < length; i++) {
981      if (matches(sequence.charAt(i))) {
982        return i;
983      }
984    }
985    return -1;
986  }
987
988  /**
989   * Returns the index of the first matching character in a character sequence, starting from a
990   * given position, or {@code -1} if no character matches after that position.
991   *
992   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
993   * start}, calling {@link #matches} for each character.
994   *
995   * @param sequence the character sequence to examine
996   * @param start the first index to examine; must be nonnegative and no greater than {@code
997   *        sequence.length()}
998   * @return the index of the first matching character, guaranteed to be no less than {@code start},
999   *         or {@code -1} if no character matches
1000   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
1001   *         sequence.length()}
1002   */
1003  public int indexIn(CharSequence sequence, int start) {
1004    int length = sequence.length();
1005    Preconditions.checkPositionIndex(start, length);
1006    for (int i = start; i < length; i++) {
1007      if (matches(sequence.charAt(i))) {
1008        return i;
1009      }
1010    }
1011    return -1;
1012  }
1013
1014  /**
1015   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1016   * matching character is present.
1017   *
1018   * <p>The default implementation iterates over the sequence in reverse order calling {@link
1019   * #matches} for each character.
1020   *
1021   * @param sequence the character sequence to examine from the end
1022   * @return an index, or {@code -1} if no character matches
1023   */
1024  public int lastIndexIn(CharSequence sequence) {
1025    for (int i = sequence.length() - 1; i >= 0; i--) {
1026      if (matches(sequence.charAt(i))) {
1027        return i;
1028      }
1029    }
1030    return -1;
1031  }
1032
1033  /**
1034   * Returns the number of matching characters found in a character sequence.
1035   */
1036  public int countIn(CharSequence sequence) {
1037    int count = 0;
1038    for (int i = 0; i < sequence.length(); i++) {
1039      if (matches(sequence.charAt(i))) {
1040        count++;
1041      }
1042    }
1043    return count;
1044  }
1045
1046  /**
1047   * Returns a string containing all non-matching characters of a character sequence, in order. For
1048   * example: <pre>   {@code
1049   *
1050   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
1051   *
1052   * ... returns {@code "bzr"}.
1053   */
1054  @CheckReturnValue
1055  public String removeFrom(CharSequence sequence) {
1056    String string = sequence.toString();
1057    int pos = indexIn(string);
1058    if (pos == -1) {
1059      return string;
1060    }
1061
1062    char[] chars = string.toCharArray();
1063    int spread = 1;
1064
1065    // This unusual loop comes from extensive benchmarking
1066    OUT: while (true) {
1067      pos++;
1068      while (true) {
1069        if (pos == chars.length) {
1070          break OUT;
1071        }
1072        if (matches(chars[pos])) {
1073          break;
1074        }
1075        chars[pos - spread] = chars[pos];
1076        pos++;
1077      }
1078      spread++;
1079    }
1080    return new String(chars, 0, pos - spread);
1081  }
1082
1083  /**
1084   * Returns a string containing all matching characters of a character sequence, in order. For
1085   * example: <pre>   {@code
1086   *
1087   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1088   *
1089   * ... returns {@code "aaa"}.
1090   */
1091  @CheckReturnValue
1092  public String retainFrom(CharSequence sequence) {
1093    return negate().removeFrom(sequence);
1094  }
1095
1096  /**
1097   * Returns a string copy of the input character sequence, with each character that matches this
1098   * matcher replaced by a given replacement character. For example: <pre>   {@code
1099   *
1100   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1101   *
1102   * ... returns {@code "rodor"}.
1103   *
1104   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1105   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1106   * character.
1107   *
1108   * @param sequence the character sequence to replace matching characters in
1109   * @param replacement the character to append to the result string in place of each matching
1110   *        character in {@code sequence}
1111   * @return the new string
1112   */
1113  @CheckReturnValue
1114  public String replaceFrom(CharSequence sequence, char replacement) {
1115    String string = sequence.toString();
1116    int pos = indexIn(string);
1117    if (pos == -1) {
1118      return string;
1119    }
1120    char[] chars = string.toCharArray();
1121    chars[pos] = replacement;
1122    for (int i = pos + 1; i < chars.length; i++) {
1123      if (matches(chars[i])) {
1124        chars[i] = replacement;
1125      }
1126    }
1127    return new String(chars);
1128  }
1129
1130  /**
1131   * Returns a string copy of the input character sequence, with each character that matches this
1132   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1133   *
1134   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1135   *
1136   * ... returns {@code "yoohoo"}.
1137   *
1138   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1139   * off calling {@link #replaceFrom(CharSequence, char)} directly.
1140   *
1141   * @param sequence the character sequence to replace matching characters in
1142   * @param replacement the characters to append to the result string in place of each matching
1143   *        character in {@code sequence}
1144   * @return the new string
1145   */
1146  @CheckReturnValue
1147  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1148    int replacementLen = replacement.length();
1149    if (replacementLen == 0) {
1150      return removeFrom(sequence);
1151    }
1152    if (replacementLen == 1) {
1153      return replaceFrom(sequence, replacement.charAt(0));
1154    }
1155
1156    String string = sequence.toString();
1157    int pos = indexIn(string);
1158    if (pos == -1) {
1159      return string;
1160    }
1161
1162    int len = string.length();
1163    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1164
1165    int oldpos = 0;
1166    do {
1167      buf.append(string, oldpos, pos);
1168      buf.append(replacement);
1169      oldpos = pos + 1;
1170      pos = indexIn(string, oldpos);
1171    } while (pos != -1);
1172
1173    buf.append(string, oldpos, len);
1174    return buf.toString();
1175  }
1176
1177  /**
1178   * Returns a substring of the input character sequence that omits all characters this matcher
1179   * matches from the beginning and from the end of the string. For example: <pre>   {@code
1180   *
1181   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1182   *
1183   * ... returns {@code "cat"}.
1184   *
1185   * <p>Note that: <pre>   {@code
1186   *
1187   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1188   *
1189   * ... is equivalent to {@link String#trim()}.
1190   */
1191  @CheckReturnValue
1192  public String trimFrom(CharSequence sequence) {
1193    int len = sequence.length();
1194    int first;
1195    int last;
1196
1197    for (first = 0; first < len; first++) {
1198      if (!matches(sequence.charAt(first))) {
1199        break;
1200      }
1201    }
1202    for (last = len - 1; last > first; last--) {
1203      if (!matches(sequence.charAt(last))) {
1204        break;
1205      }
1206    }
1207
1208    return sequence.subSequence(first, last + 1).toString();
1209  }
1210
1211  /**
1212   * Returns a substring of the input character sequence that omits all characters this matcher
1213   * matches from the beginning of the string. For example: <pre> {@code
1214   *
1215   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1216   *
1217   * ... returns {@code "catbab"}.
1218   */
1219  @CheckReturnValue
1220  public String trimLeadingFrom(CharSequence sequence) {
1221    int len = sequence.length();
1222    for (int first = 0; first < len; first++) {
1223      if (!matches(sequence.charAt(first))) {
1224        return sequence.subSequence(first, len).toString();
1225      }
1226    }
1227    return "";
1228  }
1229
1230  /**
1231   * Returns a substring of the input character sequence that omits all characters this matcher
1232   * matches from the end of the string. For example: <pre> {@code
1233   *
1234   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1235   *
1236   * ... returns {@code "abacat"}.
1237   */
1238  @CheckReturnValue
1239  public String trimTrailingFrom(CharSequence sequence) {
1240    int len = sequence.length();
1241    for (int last = len - 1; last >= 0; last--) {
1242      if (!matches(sequence.charAt(last))) {
1243        return sequence.subSequence(0, last + 1).toString();
1244      }
1245    }
1246    return "";
1247  }
1248
1249  /**
1250   * Returns a string copy of the input character sequence, with each group of consecutive
1251   * characters that match this matcher replaced by a single replacement character. For example:
1252   * <pre>   {@code
1253   *
1254   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1255   *
1256   * ... returns {@code "b-p-r"}.
1257   *
1258   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1259   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1260   * character.
1261   *
1262   * @param sequence the character sequence to replace matching groups of characters in
1263   * @param replacement the character to append to the result string in place of each group of
1264   *        matching characters in {@code sequence}
1265   * @return the new string
1266   */
1267  @CheckReturnValue
1268  public String collapseFrom(CharSequence sequence, char replacement) {
1269    // This implementation avoids unnecessary allocation.
1270    int len = sequence.length();
1271    for (int i = 0; i < len; i++) {
1272      char c = sequence.charAt(i);
1273      if (matches(c)) {
1274        if (c == replacement
1275            && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1276          // a no-op replacement
1277          i++;
1278        } else {
1279          StringBuilder builder = new StringBuilder(len)
1280              .append(sequence.subSequence(0, i))
1281              .append(replacement);
1282          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1283        }
1284      }
1285    }
1286    // no replacement needed
1287    return sequence.toString();
1288  }
1289
1290  /**
1291   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1292   * groups of matching characters at the start or end of the sequence are removed without
1293   * replacement.
1294   */
1295  @CheckReturnValue
1296  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1297    // This implementation avoids unnecessary allocation.
1298    int len = sequence.length();
1299    int first;
1300    int last;
1301
1302    for (first = 0; first < len && matches(sequence.charAt(first)); first++) {}
1303    for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {}
1304
1305    return (first == 0 && last == len - 1)
1306        ? collapseFrom(sequence, replacement)
1307        : finishCollapseFrom(
1308              sequence, first, last + 1, replacement,
1309              new StringBuilder(last + 1 - first),
1310              false);
1311  }
1312
1313  private String finishCollapseFrom(
1314      CharSequence sequence, int start, int end, char replacement,
1315      StringBuilder builder, boolean inMatchingGroup) {
1316    for (int i = start; i < end; i++) {
1317      char c = sequence.charAt(i);
1318      if (matches(c)) {
1319        if (!inMatchingGroup) {
1320          builder.append(replacement);
1321          inMatchingGroup = true;
1322        }
1323      } else {
1324        builder.append(c);
1325        inMatchingGroup = false;
1326      }
1327    }
1328    return builder.toString();
1329  }
1330
1331  /**
1332   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
1333   *     instead.
1334   */
1335  @Deprecated
1336  @Override
1337  public boolean apply(Character character) {
1338    return matches(character);
1339  }
1340
1341  /**
1342   * Returns a string representation of this {@code CharMatcher}, such as
1343   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1344   */
1345  @Override
1346  public String toString() {
1347    return description;
1348  }
1349
1350  /**
1351   * Determines whether a character is whitespace according to the latest Unicode standard, as
1352   * illustrated
1353   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1354   * This is not the same definition used by other Java APIs. (See a
1355   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1356   * definitions of "whitespace"</a>.)
1357   *
1358   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1359   * to date.
1360   */
1361  public static final CharMatcher WHITESPACE = new FastMatcher("WHITESPACE") {
1362    private static final String TABLE = "\u0009\u3000\n\u0009\u0009\u0009\u202F\u0009"
1363        + "\u0009\u2001\u2006\u0009\u0009\u0009\u0009\u0009"
1364        + "\u180E\u0009\u2029\u0009\u0009\u0009\u2000\u2005"
1365        + "\u200A\u0009\u0009\u0009\r\u0009\u0009\u2028"
1366        + "\u1680\u0009\u00A0\u0009\u2004\u2009\u0009\u0009"
1367        + "\u0009\u000C\u205F\u0009\u0009\u0020\u0009\u0009"
1368        + "\u2003\u2008\u0009\u0009\u0009\u000B\u0085\u0009"
1369        + "\u0009\u0009\u0009\u0009\u0009\u2002\u2007\u0009";
1370
1371    @Override
1372    public boolean matches(char c) {
1373      return TABLE.charAt((-844444961 * c) >>> 26) == c;
1374    }
1375
1376    @GwtIncompatible("java.util.BitSet")
1377    @Override
1378    void setBits(BitSet table) {
1379      for (int i = 0; i < TABLE.length(); i++) {
1380        table.set(TABLE.charAt(i));
1381      }
1382    }
1383  };
1384}