001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.Arrays;
027import java.util.BitSet;
028
029import javax.annotation.CheckReturnValue;
030
031/**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * <p>See the Guava User Guide article on <a href=
049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050 * {@code CharMatcher}</a>.
051 *
052 * @author Kevin Bourrillion
053 * @since 1.0
054 */
055@Beta // Possibly change from chars to code points; decide constants vs. methods
056@GwtCompatible(emulated = true)
057public abstract class CharMatcher implements Predicate<Character> {
058
059  // Constants
060  /**
061   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
062   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
063   * discussion of that term.
064   *
065   * @since 2.0
066   */
067  public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() {
068    @Override
069    public boolean matches(char c) {
070      switch (c) {
071        case '\t':
072        case '\n':
073        case '\013':
074        case '\f':
075        case '\r':
076        case ' ':
077        case '\u0085':
078        case '\u1680':
079        case '\u2028':
080        case '\u2029':
081        case '\u205f':
082        case '\u3000':
083          return true;
084        case '\u2007':
085          return false;
086        default:
087          return c >= '\u2000' && c <= '\u200a';
088      }
089    }
090
091    @Override
092    public String toString() {
093      return "CharMatcher.BREAKING_WHITESPACE";
094    }
095  };
096
  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   *
   * <p>Built through the package-private three-argument {@code inRange} overload so that the
   * matcher carries the fixed description {@code "CharMatcher.ASCII"} instead of a generated
   * range description.
   */
  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
101
102  private static class RangesMatcher extends CharMatcher {
103    private final char[] rangeStarts;
104    private final char[] rangeEnds;
105
106    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
107      super(description);
108      this.rangeStarts = rangeStarts;
109      this.rangeEnds = rangeEnds;
110      checkArgument(rangeStarts.length == rangeEnds.length);
111      for (int i = 0; i < rangeStarts.length; i++) {
112        checkArgument(rangeStarts[i] <= rangeEnds[i]);
113        if (i + 1 < rangeStarts.length) {
114          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
115        }
116      }
117    }
118
119    @Override
120    public boolean matches(char c) {
121      int index = Arrays.binarySearch(rangeStarts, c);
122      if (index >= 0) {
123        return true;
124      } else {
125        index = ~index - 1;
126        return index >= 0 && c <= rangeEnds[index];
127      }
128    }
129  }
130
  // First character of each digit range matched by DIGIT; the matching last characters (NINES)
  // are derived by adding 9 to each entry.
  // Must be in ascending order: these become the range starts of a RangesMatcher, which validates
  // the ordering and binary-searches this array.
  private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6"
      + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0"
      + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
135
136  private static final String NINES;
137  static {
138    StringBuilder builder = new StringBuilder(ZEROES.length());
139    for (int i = 0; i < ZEROES.length(); i++) {
140      builder.append((char) (ZEROES.charAt(i) + 9));
141    }
142    NINES = builder.toString();
143  }
144
  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
   * If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * <p>Implemented as a set of ten-character ranges: each range starts at an entry of
   * {@code ZEROES} and ends at the corresponding entry of {@code NINES}.
   */
  public static final CharMatcher DIGIT = new RangesMatcher(
      "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray());
152
153  /**
154   * Determines whether a character is a digit according to {@linkplain Character#isDigit(char)
155   * Java's definition}. If you only care to match ASCII digits, you can use {@code
156   * inRange('0', '9')}.
157   */
158  public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
159    @Override public boolean matches(char c) {
160      return Character.isDigit(c);
161    }
162  };
163
164  /**
165   * Determines whether a character is a letter according to {@linkplain Character#isLetter(char)
166   * Java's definition}. If you only care to match letters of the Latin alphabet, you can use {@code
167   * inRange('a', 'z').or(inRange('A', 'Z'))}.
168   */
169  public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
170    @Override public boolean matches(char c) {
171      return Character.isLetter(c);
172    }
173  };
174
175  /**
176   * Determines whether a character is a letter or digit according to {@linkplain
177   * Character#isLetterOrDigit(char) Java's definition}.
178   */
179  public static final CharMatcher JAVA_LETTER_OR_DIGIT =
180      new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
181    @Override public boolean matches(char c) {
182      return Character.isLetterOrDigit(c);
183    }
184  };
185
186  /**
187   * Determines whether a character is upper case according to {@linkplain
188   * Character#isUpperCase(char) Java's definition}.
189   */
190  public static final CharMatcher JAVA_UPPER_CASE =
191      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
192    @Override public boolean matches(char c) {
193      return Character.isUpperCase(c);
194    }
195  };
196
197  /**
198   * Determines whether a character is lower case according to {@linkplain
199   * Character#isLowerCase(char) Java's definition}.
200   */
201  public static final CharMatcher JAVA_LOWER_CASE =
202      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
203    @Override public boolean matches(char c) {
204      return Character.isLowerCase(c);
205    }
206  };
207
  /**
   * Determines whether a character is an ISO control character as specified by {@link
   * Character#isISOControl(char)}.
   *
   * <p>Implemented as the union of the inclusive ranges U+0000..U+001F and U+007F..U+009F, then
   * relabeled through {@code withToString} so the matcher reports this constant's name rather
   * than the generated {@code or(...)} description.
   */
  public static final CharMatcher JAVA_ISO_CONTROL =
      inRange('\u0000', '\u001f')
      .or(inRange('\u007f', '\u009f'))
      .withToString("CharMatcher.JAVA_ISO_CONTROL");
216
  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   *
   * <p>The first string literal lists the inclusive start of each matched range and the second
   * lists the corresponding inclusive end, position by position. A range whose start and end are
   * the same character matches just that one character.
   */
  public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", (
      "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u2066\u2067\u2068"
      + "\u2069\u206a\u3000\ud800\ufeff\ufff9\ufffa").toCharArray(), (
      "\u0020\u00a0\u00ad\u0604\u061c\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u2066\u2067\u2068"
      + "\u2069\u206f\u3000\uf8ff\ufeff\ufff9\ufffb").toCharArray());
227
228  private static String showCharacter(char c) {
229    String hex = "0123456789ABCDEF";
230    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
231    for (int i = 0; i < 4; i++) {
232      tmp[5 - i] = hex.charAt(c & 0xF);
233      c >>= 4;
234    }
235    return String.copyValueOf(tmp);
236
237  }
238
  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
   * date.
   *
   * <p>The first string literal lists the inclusive start of each matched range and the second
   * lists the corresponding inclusive end, position by position.
   */
  public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH",
      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
250
251  /** Matches any character. */
252  public static final CharMatcher ANY =
253      new FastMatcher("CharMatcher.ANY") {
254        @Override public boolean matches(char c) {
255          return true;
256        }
257
258        @Override public int indexIn(CharSequence sequence) {
259          return (sequence.length() == 0) ? -1 : 0;
260        }
261
262        @Override public int indexIn(CharSequence sequence, int start) {
263          int length = sequence.length();
264          Preconditions.checkPositionIndex(start, length);
265          return (start == length) ? -1 : start;
266        }
267
268        @Override public int lastIndexIn(CharSequence sequence) {
269          return sequence.length() - 1;
270        }
271
272        @Override public boolean matchesAllOf(CharSequence sequence) {
273          checkNotNull(sequence);
274          return true;
275        }
276
277        @Override public boolean matchesNoneOf(CharSequence sequence) {
278          return sequence.length() == 0;
279        }
280
281        @Override public String removeFrom(CharSequence sequence) {
282          checkNotNull(sequence);
283          return "";
284        }
285
286        @Override public String replaceFrom(CharSequence sequence, char replacement) {
287          char[] array = new char[sequence.length()];
288          Arrays.fill(array, replacement);
289          return new String(array);
290        }
291
292        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
293          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
294          for (int i = 0; i < sequence.length(); i++) {
295            retval.append(replacement);
296          }
297          return retval.toString();
298        }
299
300        @Override public String collapseFrom(CharSequence sequence, char replacement) {
301          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
302        }
303
304        @Override public String trimFrom(CharSequence sequence) {
305          checkNotNull(sequence);
306          return "";
307        }
308
309        @Override public int countIn(CharSequence sequence) {
310          return sequence.length();
311        }
312
313        @Override public CharMatcher and(CharMatcher other) {
314          return checkNotNull(other);
315        }
316
317        @Override public CharMatcher or(CharMatcher other) {
318          checkNotNull(other);
319          return this;
320        }
321
322        @Override public CharMatcher negate() {
323          return NONE;
324        }
325      };
326
327  /** Matches no characters. */
328  public static final CharMatcher NONE =
329      new FastMatcher("CharMatcher.NONE") {
330        @Override public boolean matches(char c) {
331          return false;
332        }
333
334        @Override public int indexIn(CharSequence sequence) {
335          checkNotNull(sequence);
336          return -1;
337        }
338
339        @Override public int indexIn(CharSequence sequence, int start) {
340          int length = sequence.length();
341          Preconditions.checkPositionIndex(start, length);
342          return -1;
343        }
344
345        @Override public int lastIndexIn(CharSequence sequence) {
346          checkNotNull(sequence);
347          return -1;
348        }
349
350        @Override public boolean matchesAllOf(CharSequence sequence) {
351          return sequence.length() == 0;
352        }
353
354        @Override public boolean matchesNoneOf(CharSequence sequence) {
355          checkNotNull(sequence);
356          return true;
357        }
358
359        @Override public String removeFrom(CharSequence sequence) {
360          return sequence.toString();
361        }
362
363        @Override public String replaceFrom(CharSequence sequence, char replacement) {
364          return sequence.toString();
365        }
366
367        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
368          checkNotNull(replacement);
369          return sequence.toString();
370        }
371
372        @Override public String collapseFrom(CharSequence sequence, char replacement) {
373          return sequence.toString();
374        }
375
376        @Override public String trimFrom(CharSequence sequence) {
377          return sequence.toString();
378        }
379
380        @Override
381        public String trimLeadingFrom(CharSequence sequence) {
382          return sequence.toString();
383        }
384
385        @Override
386        public String trimTrailingFrom(CharSequence sequence) {
387          return sequence.toString();
388        }
389
390        @Override public int countIn(CharSequence sequence) {
391          checkNotNull(sequence);
392          return 0;
393        }
394
395        @Override public CharMatcher and(CharMatcher other) {
396          checkNotNull(other);
397          return this;
398        }
399
400        @Override public CharMatcher or(CharMatcher other) {
401          return checkNotNull(other);
402        }
403
404        @Override public CharMatcher negate() {
405          return ANY;
406        }
407      };
408
409  // Static factories
410
411  /**
412   * Returns a {@code char} matcher that matches only one specified character.
413   */
414  public static CharMatcher is(final char match) {
415    String description = "CharMatcher.is('" + showCharacter(match) + "')";
416    return new FastMatcher(description) {
417      @Override public boolean matches(char c) {
418        return c == match;
419      }
420
421      @Override public String replaceFrom(CharSequence sequence, char replacement) {
422        return sequence.toString().replace(match, replacement);
423      }
424
425      @Override public CharMatcher and(CharMatcher other) {
426        return other.matches(match) ? this : NONE;
427      }
428
429      @Override public CharMatcher or(CharMatcher other) {
430        return other.matches(match) ? other : super.or(other);
431      }
432
433      @Override public CharMatcher negate() {
434        return isNot(match);
435      }
436
437      @GwtIncompatible("java.util.BitSet")
438      @Override
439      void setBits(BitSet table) {
440        table.set(match);
441      }
442    };
443  }
444
445  /**
446   * Returns a {@code char} matcher that matches any character except the one specified.
447   *
448   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
449   */
450  public static CharMatcher isNot(final char match) {
451    String description = "CharMatcher.isNot('" + showCharacter(match) + "')";
452    return new FastMatcher(description) {
453      @Override public boolean matches(char c) {
454        return c != match;
455      }
456
457      @Override public CharMatcher and(CharMatcher other) {
458        return other.matches(match) ? super.and(other) : other;
459      }
460
461      @Override public CharMatcher or(CharMatcher other) {
462        return other.matches(match) ? ANY : this;
463      }
464
465      @GwtIncompatible("java.util.BitSet")
466      @Override
467      void setBits(BitSet table) {
468        table.set(0, match);
469        table.set(match + 1, Character.MAX_VALUE + 1);
470      }
471
472      @Override public CharMatcher negate() {
473        return is(match);
474      }
475    };
476  }
477
478  /**
479   * Returns a {@code char} matcher that matches any character present in the given character
480   * sequence.
481   */
482  public static CharMatcher anyOf(final CharSequence sequence) {
483    switch (sequence.length()) {
484      case 0:
485        return NONE;
486      case 1:
487        return is(sequence.charAt(0));
488      case 2:
489        return isEither(sequence.charAt(0), sequence.charAt(1));
490      default:
491        // continue below to handle the general case
492    }
493    // TODO(user): is it potentially worth just going ahead and building a precomputed matcher?
494    final char[] chars = sequence.toString().toCharArray();
495    Arrays.sort(chars);
496    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
497    for (char c : chars) {
498      description.append(showCharacter(c));
499    }
500    description.append("\")");
501    return new CharMatcher(description.toString()) {
502      @Override public boolean matches(char c) {
503        return Arrays.binarySearch(chars, c) >= 0;
504      }
505
506      @Override
507      @GwtIncompatible("java.util.BitSet")
508      void setBits(BitSet table) {
509        for (char c : chars) {
510          table.set(c);
511        }
512      }
513    };
514  }
515
516  private static CharMatcher isEither(
517      final char match1,
518      final char match2) {
519    String description = "CharMatcher.anyOf(\"" +
520        showCharacter(match1) + showCharacter(match2) + "\")";
521    return new FastMatcher(description) {
522      @Override public boolean matches(char c) {
523        return c == match1 || c == match2;
524      }
525
526      @GwtIncompatible("java.util.BitSet")
527      @Override void setBits(BitSet table) {
528        table.set(match1);
529        table.set(match2);
530      }
531    };
532  }
533
534  /**
535   * Returns a {@code char} matcher that matches any character not present in the given character
536   * sequence.
537   */
538  public static CharMatcher noneOf(CharSequence sequence) {
539    return anyOf(sequence).negate();
540  }
541
542  /**
543   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
544   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
545   * CharMatcher.inRange('a', 'z')}.
546   *
547   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
548   */
549  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
550    checkArgument(endInclusive >= startInclusive);
551    String description = "CharMatcher.inRange('" +
552        showCharacter(startInclusive) + "', '" +
553        showCharacter(endInclusive) + "')";
554    return inRange(startInclusive, endInclusive, description);
555  }
556
557  static CharMatcher inRange(final char startInclusive, final char endInclusive,
558      String description) {
559    return new FastMatcher(description) {
560      @Override public boolean matches(char c) {
561        return startInclusive <= c && c <= endInclusive;
562      }
563
564      @GwtIncompatible("java.util.BitSet")
565      @Override void setBits(BitSet table) {
566        table.set(startInclusive, endInclusive + 1);
567      }
568    };
569  }
570
571  /**
572   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
573   * which operates on primitive {@code char} instances instead.
574   */
575  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
576    checkNotNull(predicate);
577    if (predicate instanceof CharMatcher) {
578      return (CharMatcher) predicate;
579    }
580    String description = "CharMatcher.forPredicate(" + predicate + ")";
581    return new CharMatcher(description) {
582      @Override public boolean matches(char c) {
583        return predicate.apply(c);
584      }
585
586      @Override public boolean apply(Character character) {
587        return predicate.apply(checkNotNull(character));
588      }
589    };
590  }
591
  // State

  // Text describing this matcher. It is assigned exactly once by whichever constructor runs:
  // either the caller-supplied description or, for the no-argument constructor, the inherited
  // Object.toString() value.
  final String description;
594
595  // Constructors
596
  /**
   * Sets the {@code toString()} from the given description.
   *
   * @param description the text stored in the {@code description} field; it is not validated here
   */
  CharMatcher(String description) {
    this.description = description;
  }
603
  /**
   * Constructor for use by subclasses. When subclassing, you may want to override
   * {@code toString()} to provide a useful description.
   *
   * <p>The {@code description} field is initialized from the superclass {@code toString()}, so
   * overriding {@code toString()} in the subclass does not change the stored description.
   */
  protected CharMatcher() {
    // super.toString() bypasses any toString() override declared by this class or its subclasses.
    description = super.toString();
  }
611
612  // Abstract methods
613
  /**
   * Determines a true or false value for the given character.
   *
   * @param c the character to test
   * @return {@code true} if this matcher matches {@code c}
   */
  public abstract boolean matches(char c);
616
617  // Non-static factories
618
  /**
   * Returns a matcher that matches any character not matched by this matcher.
   *
   * <p>This default implementation wraps this matcher in a new {@code NegatedMatcher}.
   *
   * @return the negated matcher
   */
  public CharMatcher negate() {
    return new NegatedMatcher(this);
  }
625
626  private static class NegatedMatcher extends CharMatcher {
627    final CharMatcher original;
628
629    NegatedMatcher(String toString, CharMatcher original) {
630      super(toString);
631      this.original = original;
632    }
633
634    NegatedMatcher(CharMatcher original) {
635      this(original + ".negate()", original);
636    }
637
638    @Override public boolean matches(char c) {
639      return !original.matches(c);
640    }
641
642    @Override public boolean matchesAllOf(CharSequence sequence) {
643      return original.matchesNoneOf(sequence);
644    }
645
646    @Override public boolean matchesNoneOf(CharSequence sequence) {
647      return original.matchesAllOf(sequence);
648    }
649
650    @Override public int countIn(CharSequence sequence) {
651      return sequence.length() - original.countIn(sequence);
652    }
653
654    @GwtIncompatible("java.util.BitSet")
655    @Override
656    void setBits(BitSet table) {
657      BitSet tmp = new BitSet();
658      original.setBits(tmp);
659      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
660      table.or(tmp);
661    }
662
663    @Override public CharMatcher negate() {
664      return original;
665    }
666
667    @Override
668    CharMatcher withToString(String description) {
669      return new NegatedMatcher(description, original);
670    }
671  }
672
673  /**
674   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
675   */
676  public CharMatcher and(CharMatcher other) {
677    return new And(this, checkNotNull(other));
678  }
679
680  private static class And extends CharMatcher {
681    final CharMatcher first;
682    final CharMatcher second;
683
684    And(CharMatcher a, CharMatcher b) {
685      this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
686    }
687
688    And(CharMatcher a, CharMatcher b, String description) {
689      super(description);
690      first = checkNotNull(a);
691      second = checkNotNull(b);
692    }
693
694    @Override
695    public boolean matches(char c) {
696      return first.matches(c) && second.matches(c);
697    }
698
699    @GwtIncompatible("java.util.BitSet")
700    @Override
701    void setBits(BitSet table) {
702      BitSet tmp1 = new BitSet();
703      first.setBits(tmp1);
704      BitSet tmp2 = new BitSet();
705      second.setBits(tmp2);
706      tmp1.and(tmp2);
707      table.or(tmp1);
708    }
709
710    @Override
711    CharMatcher withToString(String description) {
712      return new And(first, second, description);
713    }
714  }
715
716  /**
717   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
718   */
719  public CharMatcher or(CharMatcher other) {
720    return new Or(this, checkNotNull(other));
721  }
722
723  private static class Or extends CharMatcher {
724    final CharMatcher first;
725    final CharMatcher second;
726
727    Or(CharMatcher a, CharMatcher b, String description) {
728      super(description);
729      first = checkNotNull(a);
730      second = checkNotNull(b);
731    }
732
733    Or(CharMatcher a, CharMatcher b) {
734      this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
735    }
736
737    @GwtIncompatible("java.util.BitSet")
738    @Override
739    void setBits(BitSet table) {
740      first.setBits(table);
741      second.setBits(table);
742    }
743
744    @Override
745    public boolean matches(char c) {
746      return first.matches(c) || second.matches(c);
747    }
748
749    @Override
750    CharMatcher withToString(String description) {
751      return new Or(first, second, description);
752    }
753  }
754
  /**
   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
   * worthwhile only if the precomputed matcher is queried many thousands of times.
   *
   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
   * worthwhile tradeoff in a browser.
   *
   * @return a matcher equivalent to this one
   */
  public CharMatcher precomputed() {
    // Routed through Platform so that the behavior can differ under GWT.
    return Platform.precomputeCharMatcher(this);
  }
767
  /**
   * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
   * but with their {@code toString} method overridden with the new description.
   *
   * <p>This is unsupported by default.
   *
   * @param description the description the returned matcher should report
   * @return a matcher equivalent to this one under the new description
   * @throws UnsupportedOperationException always, unless a subclass overrides this method
   */
  CharMatcher withToString(String description) {
    throw new UnsupportedOperationException();
  }
777
  // Number of distinct char values, i.e. the size of the full range
  // Character.MIN_VALUE..Character.MAX_VALUE inclusive.
  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
779
780  /**
781   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a
782   * method on {@link Platform} so that we can have different behavior in GWT.
783   *
784   * <p>This implementation tries to be smart in a number of ways.  It recognizes cases where
785   * the negation is cheaper to precompute than the matcher itself; it tries to build small
786   * hash tables for matchers that only match a few characters, and so on.  In the worst-case
787   * scenario, it constructs an eight-kilobyte bit array and queries that.
788   * In many situations this produces a matcher which is faster to query than the original.
789   */
790  @GwtIncompatible("java.util.BitSet")
791  CharMatcher precomputedInternal() {
792    final BitSet table = new BitSet();
793    setBits(table);
794    int totalCharacters = table.cardinality();
795    if (totalCharacters * 2 <= DISTINCT_CHARS) {
796      return precomputedPositive(totalCharacters, table, description);
797    } else {
798      // TODO(user): is it worth it to worry about the last character of large matchers?
799      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
800      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
801      String suffix = ".negate()";
802      String negatedDescription = description.endsWith(suffix)
803          ? description.substring(0, description.length() - suffix.length())
804          : description + suffix;
805      return new NegatedFastMatcher(toString(),
806          precomputedPositive(negatedCharacters, table, negatedDescription));
807    }
808  }
809
810  /**
811   * A matcher for which precomputation will not yield any significant benefit.
812   */
813  abstract static class FastMatcher extends CharMatcher {
814    FastMatcher() {
815      super();
816    }
817
818    FastMatcher(String description) {
819      super(description);
820    }
821
822    @Override
823    public final CharMatcher precomputed() {
824      return this;
825    }
826
827    @Override
828    public CharMatcher negate() {
829      return new NegatedFastMatcher(this);
830    }
831  }
832
833  static final class NegatedFastMatcher extends NegatedMatcher {
834    NegatedFastMatcher(CharMatcher original) {
835      super(original);
836    }
837
838    NegatedFastMatcher(String toString, CharMatcher original) {
839      super(toString, original);
840    }
841
842    @Override
843    public final CharMatcher precomputed() {
844      return this;
845    }
846
847    @Override
848    CharMatcher withToString(String description) {
849      return new NegatedFastMatcher(description, original);
850    }
851  }
852
853  /**
854   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
855   */
856  @GwtIncompatible("java.util.BitSet")
857  private static CharMatcher precomputedPositive(
858      int totalCharacters,
859      BitSet table,
860      String description) {
861    switch (totalCharacters) {
862      case 0:
863        return NONE;
864      case 1:
865        return is((char) table.nextSetBit(0));
866      case 2:
867        char c1 = (char) table.nextSetBit(0);
868        char c2 = (char) table.nextSetBit(c1 + 1);
869        return isEither(c1, c2);
870      default:
871        return isSmall(totalCharacters, table.length())
872            ? SmallCharMatcher.from(table, description)
873            : new BitSetMatcher(table, description);
874    }
875  }
876
877  @GwtIncompatible("SmallCharMatcher")
878  private static boolean isSmall(int totalCharacters, int tableLength) {
879    return totalCharacters <= SmallCharMatcher.MAX_SIZE
880        && tableLength > (totalCharacters * 4 * Character.SIZE);
881        // err on the side of BitSetMatcher
882  }
883
884  @GwtIncompatible("java.util.BitSet")
885  private static class BitSetMatcher extends FastMatcher {
886    private final BitSet table;
887
888    private BitSetMatcher(BitSet table, String description) {
889      super(description);
890      if (table.length() + Long.SIZE < table.size()) {
891        table = (BitSet) table.clone();
892        // If only we could actually call BitSet.trimToSize() ourselves...
893      }
894      this.table = table;
895    }
896
897    @Override public boolean matches(char c) {
898      return table.get(c);
899    }
900
901    @Override
902    void setBits(BitSet bitSet) {
903      bitSet.or(table);
904    }
905  }
906
907  /**
908   * Sets bits in {@code table} matched by this matcher.
909   */
910  @GwtIncompatible("java.util.BitSet")
911  void setBits(BitSet table) {
912    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
913      if (matches((char) c)) {
914        table.set(c);
915      }
916    }
917  }
918
919  // Text processing routines
920
921  /**
922   * Returns {@code true} if a character sequence contains at least one matching character.
923   * Equivalent to {@code !matchesNoneOf(sequence)}.
924   *
925   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
926   * character, until this returns {@code true} or the end is reached.
927   *
928   * @param sequence the character sequence to examine, possibly empty
929   * @return {@code true} if this matcher matches at least one character in the sequence
930   * @since 8.0
931   */
932  public boolean matchesAnyOf(CharSequence sequence) {
933    return !matchesNoneOf(sequence);
934  }
935
936  /**
937   * Returns {@code true} if a character sequence contains only matching characters.
938   *
939   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
940   * character, until this returns {@code false} or the end is reached.
941   *
942   * @param sequence the character sequence to examine, possibly empty
943   * @return {@code true} if this matcher matches every character in the sequence, including when
944   *         the sequence is empty
945   */
946  public boolean matchesAllOf(CharSequence sequence) {
947    for (int i = sequence.length() - 1; i >= 0; i--) {
948      if (!matches(sequence.charAt(i))) {
949        return false;
950      }
951    }
952    return true;
953  }
954
955  /**
956   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
957   * {@code !matchesAnyOf(sequence)}.
958   *
959   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
960   * character, until this returns {@code false} or the end is reached.
961   *
962   * @param sequence the character sequence to examine, possibly empty
963   * @return {@code true} if this matcher matches every character in the sequence, including when
964   *         the sequence is empty
965   */
966  public boolean matchesNoneOf(CharSequence sequence) {
967    return indexIn(sequence) == -1;
968  }
969
970  /**
971   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
972   * matching character is present.
973   *
974   * <p>The default implementation iterates over the sequence in forward order calling {@link
975   * #matches} for each character.
976   *
977   * @param sequence the character sequence to examine from the beginning
978   * @return an index, or {@code -1} if no character matches
979   */
980  public int indexIn(CharSequence sequence) {
981    int length = sequence.length();
982    for (int i = 0; i < length; i++) {
983      if (matches(sequence.charAt(i))) {
984        return i;
985      }
986    }
987    return -1;
988  }
989
990  /**
991   * Returns the index of the first matching character in a character sequence, starting from a
992   * given position, or {@code -1} if no character matches after that position.
993   *
994   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
995   * start}, calling {@link #matches} for each character.
996   *
997   * @param sequence the character sequence to examine
998   * @param start the first index to examine; must be nonnegative and no greater than {@code
999   *        sequence.length()}
1000   * @return the index of the first matching character, guaranteed to be no less than {@code start},
1001   *         or {@code -1} if no character matches
1002   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
1003   *         sequence.length()}
1004   */
1005  public int indexIn(CharSequence sequence, int start) {
1006    int length = sequence.length();
1007    Preconditions.checkPositionIndex(start, length);
1008    for (int i = start; i < length; i++) {
1009      if (matches(sequence.charAt(i))) {
1010        return i;
1011      }
1012    }
1013    return -1;
1014  }
1015
1016  /**
1017   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1018   * matching character is present.
1019   *
1020   * <p>The default implementation iterates over the sequence in reverse order calling {@link
1021   * #matches} for each character.
1022   *
1023   * @param sequence the character sequence to examine from the end
1024   * @return an index, or {@code -1} if no character matches
1025   */
1026  public int lastIndexIn(CharSequence sequence) {
1027    for (int i = sequence.length() - 1; i >= 0; i--) {
1028      if (matches(sequence.charAt(i))) {
1029        return i;
1030      }
1031    }
1032    return -1;
1033  }
1034
1035  /**
1036   * Returns the number of matching characters found in a character sequence.
1037   */
1038  public int countIn(CharSequence sequence) {
1039    int count = 0;
1040    for (int i = 0; i < sequence.length(); i++) {
1041      if (matches(sequence.charAt(i))) {
1042        count++;
1043      }
1044    }
1045    return count;
1046  }
1047
1048  /**
1049   * Returns a string containing all non-matching characters of a character sequence, in order. For
1050   * example: <pre>   {@code
1051   *
1052   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
1053   *
1054   * ... returns {@code "bzr"}.
1055   */
1056  @CheckReturnValue
1057  public String removeFrom(CharSequence sequence) {
1058    String string = sequence.toString();
1059    int pos = indexIn(string);
1060    if (pos == -1) {
1061      return string;
1062    }
1063
1064    char[] chars = string.toCharArray();
1065    int spread = 1;
1066
1067    // This unusual loop comes from extensive benchmarking
1068    OUT: while (true) {
1069      pos++;
1070      while (true) {
1071        if (pos == chars.length) {
1072          break OUT;
1073        }
1074        if (matches(chars[pos])) {
1075          break;
1076        }
1077        chars[pos - spread] = chars[pos];
1078        pos++;
1079      }
1080      spread++;
1081    }
1082    return new String(chars, 0, pos - spread);
1083  }
1084
1085  /**
1086   * Returns a string containing all matching characters of a character sequence, in order. For
1087   * example: <pre>   {@code
1088   *
1089   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1090   *
1091   * ... returns {@code "aaa"}.
1092   */
1093  @CheckReturnValue
1094  public String retainFrom(CharSequence sequence) {
1095    return negate().removeFrom(sequence);
1096  }
1097
1098  /**
1099   * Returns a string copy of the input character sequence, with each character that matches this
1100   * matcher replaced by a given replacement character. For example: <pre>   {@code
1101   *
1102   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1103   *
1104   * ... returns {@code "rodor"}.
1105   *
1106   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1107   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1108   * character.
1109   *
1110   * @param sequence the character sequence to replace matching characters in
1111   * @param replacement the character to append to the result string in place of each matching
1112   *        character in {@code sequence}
1113   * @return the new string
1114   */
1115  @CheckReturnValue
1116  public String replaceFrom(CharSequence sequence, char replacement) {
1117    String string = sequence.toString();
1118    int pos = indexIn(string);
1119    if (pos == -1) {
1120      return string;
1121    }
1122    char[] chars = string.toCharArray();
1123    chars[pos] = replacement;
1124    for (int i = pos + 1; i < chars.length; i++) {
1125      if (matches(chars[i])) {
1126        chars[i] = replacement;
1127      }
1128    }
1129    return new String(chars);
1130  }
1131
1132  /**
1133   * Returns a string copy of the input character sequence, with each character that matches this
1134   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1135   *
1136   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1137   *
1138   * ... returns {@code "yoohoo"}.
1139   *
1140   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1141   * off calling {@link #replaceFrom(CharSequence, char)} directly.
1142   *
1143   * @param sequence the character sequence to replace matching characters in
1144   * @param replacement the characters to append to the result string in place of each matching
1145   *        character in {@code sequence}
1146   * @return the new string
1147   */
1148  @CheckReturnValue
1149  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1150    int replacementLen = replacement.length();
1151    if (replacementLen == 0) {
1152      return removeFrom(sequence);
1153    }
1154    if (replacementLen == 1) {
1155      return replaceFrom(sequence, replacement.charAt(0));
1156    }
1157
1158    String string = sequence.toString();
1159    int pos = indexIn(string);
1160    if (pos == -1) {
1161      return string;
1162    }
1163
1164    int len = string.length();
1165    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1166
1167    int oldpos = 0;
1168    do {
1169      buf.append(string, oldpos, pos);
1170      buf.append(replacement);
1171      oldpos = pos + 1;
1172      pos = indexIn(string, oldpos);
1173    } while (pos != -1);
1174
1175    buf.append(string, oldpos, len);
1176    return buf.toString();
1177  }
1178
1179  /**
1180   * Returns a substring of the input character sequence that omits all characters this matcher
1181   * matches from the beginning and from the end of the string. For example: <pre>   {@code
1182   *
1183   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1184   *
1185   * ... returns {@code "cat"}.
1186   *
1187   * <p>Note that: <pre>   {@code
1188   *
1189   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1190   *
1191   * ... is equivalent to {@link String#trim()}.
1192   */
1193  @CheckReturnValue
1194  public String trimFrom(CharSequence sequence) {
1195    int len = sequence.length();
1196    int first;
1197    int last;
1198
1199    for (first = 0; first < len; first++) {
1200      if (!matches(sequence.charAt(first))) {
1201        break;
1202      }
1203    }
1204    for (last = len - 1; last > first; last--) {
1205      if (!matches(sequence.charAt(last))) {
1206        break;
1207      }
1208    }
1209
1210    return sequence.subSequence(first, last + 1).toString();
1211  }
1212
1213  /**
1214   * Returns a substring of the input character sequence that omits all characters this matcher
1215   * matches from the beginning of the string. For example: <pre> {@code
1216   *
1217   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1218   *
1219   * ... returns {@code "catbab"}.
1220   */
1221  @CheckReturnValue
1222  public String trimLeadingFrom(CharSequence sequence) {
1223    int len = sequence.length();
1224    for (int first = 0; first < len; first++) {
1225      if (!matches(sequence.charAt(first))) {
1226        return sequence.subSequence(first, len).toString();
1227      }
1228    }
1229    return "";
1230  }
1231
1232  /**
1233   * Returns a substring of the input character sequence that omits all characters this matcher
1234   * matches from the end of the string. For example: <pre> {@code
1235   *
1236   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1237   *
1238   * ... returns {@code "abacat"}.
1239   */
1240  @CheckReturnValue
1241  public String trimTrailingFrom(CharSequence sequence) {
1242    int len = sequence.length();
1243    for (int last = len - 1; last >= 0; last--) {
1244      if (!matches(sequence.charAt(last))) {
1245        return sequence.subSequence(0, last + 1).toString();
1246      }
1247    }
1248    return "";
1249  }
1250
1251  /**
1252   * Returns a string copy of the input character sequence, with each group of consecutive
1253   * characters that match this matcher replaced by a single replacement character. For example:
1254   * <pre>   {@code
1255   *
1256   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1257   *
1258   * ... returns {@code "b-p-r"}.
1259   *
1260   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1261   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1262   * character.
1263   *
1264   * @param sequence the character sequence to replace matching groups of characters in
1265   * @param replacement the character to append to the result string in place of each group of
1266   *        matching characters in {@code sequence}
1267   * @return the new string
1268   */
1269  @CheckReturnValue
1270  public String collapseFrom(CharSequence sequence, char replacement) {
1271    // This implementation avoids unnecessary allocation.
1272    int len = sequence.length();
1273    for (int i = 0; i < len; i++) {
1274      char c = sequence.charAt(i);
1275      if (matches(c)) {
1276        if (c == replacement
1277            && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1278          // a no-op replacement
1279          i++;
1280        } else {
1281          StringBuilder builder = new StringBuilder(len)
1282              .append(sequence.subSequence(0, i))
1283              .append(replacement);
1284          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1285        }
1286      }
1287    }
1288    // no replacement needed
1289    return sequence.toString();
1290  }
1291
1292  /**
1293   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1294   * groups of matching characters at the start or end of the sequence are removed without
1295   * replacement.
1296   */
1297  @CheckReturnValue
1298  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1299    // This implementation avoids unnecessary allocation.
1300    int len = sequence.length();
1301    int first;
1302    int last;
1303
1304    for (first = 0; first < len && matches(sequence.charAt(first)); first++) {}
1305    for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {}
1306
1307    return (first == 0 && last == len - 1)
1308        ? collapseFrom(sequence, replacement)
1309        : finishCollapseFrom(
1310              sequence, first, last + 1, replacement,
1311              new StringBuilder(last + 1 - first),
1312              false);
1313  }
1314
1315  private String finishCollapseFrom(
1316      CharSequence sequence, int start, int end, char replacement,
1317      StringBuilder builder, boolean inMatchingGroup) {
1318    for (int i = start; i < end; i++) {
1319      char c = sequence.charAt(i);
1320      if (matches(c)) {
1321        if (!inMatchingGroup) {
1322          builder.append(replacement);
1323          inMatchingGroup = true;
1324        }
1325      } else {
1326        builder.append(c);
1327        inMatchingGroup = false;
1328      }
1329    }
1330    return builder.toString();
1331  }
1332
1333  /**
1334   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
1335   *     instead.
1336   */
1337  @Deprecated
1338  @Override
1339  public boolean apply(Character character) {
1340    return matches(character);
1341  }
1342
1343  /**
1344   * Returns a string representation of this {@code CharMatcher}, such as
1345   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1346   */
1347  @Override
1348  public String toString() {
1349    return description;
1350  }
1351
1352  static final String WHITESPACE_TABLE = ""
1353      + "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
1354      + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
1355      + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
1356      + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
1357  static final int WHITESPACE_MULTIPLIER = 1682554634;
1358  static final int WHITESPACE_SHIFT = Integer.numberOfLeadingZeros(WHITESPACE_TABLE.length() - 1);
1359
1360  /**
1361   * Determines whether a character is whitespace according to the latest Unicode standard, as
1362   * illustrated
1363   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1364   * This is not the same definition used by other Java APIs. (See a
1365   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1366   * definitions of "whitespace"</a>.)
1367   *
1368   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1369   * to date.
1370   */
1371  public static final CharMatcher WHITESPACE = new FastMatcher("WHITESPACE") {
1372    @Override
1373    public boolean matches(char c) {
1374      return WHITESPACE_TABLE.charAt((WHITESPACE_MULTIPLIER * c) >>> WHITESPACE_SHIFT) == c;
1375    }
1376
1377    @GwtIncompatible("java.util.BitSet")
1378    @Override
1379    void setBits(BitSet table) {
1380      for (int i = 0; i < WHITESPACE_TABLE.length(); i++) {
1381        table.set(WHITESPACE_TABLE.charAt(i));
1382      }
1383    }
1384  };
1385}