001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.Arrays;
027import java.util.BitSet;
028
029import javax.annotation.CheckReturnValue;
030
031/**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * <p>See the Guava User Guide article on <a href=
049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050 * {@code CharMatcher}</a>.
051 *
052 * @author Kevin Bourrillion
053 * @since 1.0
054 */
055@Beta // Possibly change from chars to code points; decide constants vs. methods
056@GwtCompatible(emulated = true)
057public abstract class CharMatcher implements Predicate<Character> {
058  // Constants
059  /**
060   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
061   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
062   * discussion of that term.
063   *
064   * @since 2.0
065   */
066  public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() {
067    @Override
068    public boolean matches(char c) {
069      switch (c) {
070        case '\t':
071        case '\n':
072        case '\013':
073        case '\f':
074        case '\r':
075        case ' ':
076        case '\u0085':
077        case '\u1680':
078        case '\u2028':
079        case '\u2029':
080        case '\u205f':
081        case '\u3000':
082          return true;
083        case '\u2007':
084          return false;
085        default:
086          return c >= '\u2000' && c <= '\u200a';
087      }
088    }
089
090    @Override
091    public String toString() {
092      return "CharMatcher.BREAKING_WHITESPACE";
093    }
094  };
095
096  /**
097   * Determines whether a character is ASCII, meaning that its code point is less than 128.
098   */
099  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
100
101  private static class RangesMatcher extends CharMatcher {
102    private final char[] rangeStarts;
103    private final char[] rangeEnds;
104
105    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
106      super(description);
107      this.rangeStarts = rangeStarts;
108      this.rangeEnds = rangeEnds;
109      checkArgument(rangeStarts.length == rangeEnds.length);
110      for (int i = 0; i < rangeStarts.length; i++) {
111        checkArgument(rangeStarts[i] <= rangeEnds[i]);
112        if (i + 1 < rangeStarts.length) {
113          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
114        }
115      }
116    }
117
118    @Override
119    public boolean matches(char c) {
120      int index = Arrays.binarySearch(rangeStarts, c);
121      if (index >= 0) {
122        return true;
123      } else {
124        index = ~index - 1;
125        return index >= 0 && c <= rangeEnds[index];
126      }
127    }
128  }
129
130  // Must be in ascending order.
131  private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6"
132      + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0"
133      + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
134
135  private static final String NINES;
136  static {
137    StringBuilder builder = new StringBuilder(ZEROES.length());
138    for (int i = 0; i < ZEROES.length(); i++) {
139      builder.append((char) (ZEROES.charAt(i) + 9));
140    }
141    NINES = builder.toString();
142  }
143
144  /**
145   * Determines whether a character is a digit according to
146   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
147   */
148  public static final CharMatcher DIGIT = new RangesMatcher(
149      "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray());
150
151  /**
152   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
153   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
154   */
155  public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
156    @Override public boolean matches(char c) {
157      return Character.isDigit(c);
158    }
159  };
160
161  /**
162   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
163   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
164   * inRange('a', 'z').or(inRange('A', 'Z'))}.
165   */
166  public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
167    @Override public boolean matches(char c) {
168      return Character.isLetter(c);
169    }
170  };
171
172  /**
173   * Determines whether a character is a letter or digit according to {@link
174   * Character#isLetterOrDigit(char) Java's definition}.
175   */
176  public static final CharMatcher JAVA_LETTER_OR_DIGIT =
177      new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
178    @Override public boolean matches(char c) {
179      return Character.isLetterOrDigit(c);
180    }
181  };
182
183  /**
184   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
185   * Java's definition}.
186   */
187  public static final CharMatcher JAVA_UPPER_CASE =
188      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
189    @Override public boolean matches(char c) {
190      return Character.isUpperCase(c);
191    }
192  };
193
194  /**
195   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
196   * Java's definition}.
197   */
198  public static final CharMatcher JAVA_LOWER_CASE =
199      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
200    @Override public boolean matches(char c) {
201      return Character.isLowerCase(c);
202    }
203  };
204
205  /**
206   * Determines whether a character is an ISO control character as specified by {@link
207   * Character#isISOControl(char)}.
208   */
209  public static final CharMatcher JAVA_ISO_CONTROL =
210      inRange('\u0000', '\u001f')
211      .or(inRange('\u007f', '\u009f'))
212      .withToString("CharMatcher.JAVA_ISO_CONTROL");
213
214  /**
215   * Determines whether a character is invisible; that is, if its Unicode category is any of
216   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
217   * PRIVATE_USE according to ICU4J.
218   */
219  public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", (
220      "\u0000\u007f\u00ad\u0600\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u206a\u3000\ud800\ufeff"
221      + "\ufff9\ufffa").toCharArray(), (
222      "\u0020\u00a0\u00ad\u0604\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u206f\u3000\uf8ff\ufeff"
223      + "\ufff9\ufffb").toCharArray());
224
225  private static String showCharacter(char c) {
226    String hex = "0123456789ABCDEF";
227    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
228    for (int i = 0; i < 4; i++) {
229      tmp[5 - i] = hex.charAt(c & 0xF);
230      c >>= 4;
231    }
232    return String.copyValueOf(tmp);
233
234  }
235
236  /**
237   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
238   * errs on the side of returning {@code false} (that is, it tends to assume a character is
239   * double-width).
240   *
241   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
242   * date.
243   */
244  public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH",
245      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
246      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
247
248  /** Matches any character. */
249  public static final CharMatcher ANY =
250      new FastMatcher("CharMatcher.ANY") {
251        @Override public boolean matches(char c) {
252          return true;
253        }
254
255        @Override public int indexIn(CharSequence sequence) {
256          return (sequence.length() == 0) ? -1 : 0;
257        }
258
259        @Override public int indexIn(CharSequence sequence, int start) {
260          int length = sequence.length();
261          Preconditions.checkPositionIndex(start, length);
262          return (start == length) ? -1 : start;
263        }
264
265        @Override public int lastIndexIn(CharSequence sequence) {
266          return sequence.length() - 1;
267        }
268
269        @Override public boolean matchesAllOf(CharSequence sequence) {
270          checkNotNull(sequence);
271          return true;
272        }
273
274        @Override public boolean matchesNoneOf(CharSequence sequence) {
275          return sequence.length() == 0;
276        }
277
278        @Override public String removeFrom(CharSequence sequence) {
279          checkNotNull(sequence);
280          return "";
281        }
282
283        @Override public String replaceFrom(CharSequence sequence, char replacement) {
284          char[] array = new char[sequence.length()];
285          Arrays.fill(array, replacement);
286          return new String(array);
287        }
288
289        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
290          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
291          for (int i = 0; i < sequence.length(); i++) {
292            retval.append(replacement);
293          }
294          return retval.toString();
295        }
296
297        @Override public String collapseFrom(CharSequence sequence, char replacement) {
298          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
299        }
300
301        @Override public String trimFrom(CharSequence sequence) {
302          checkNotNull(sequence);
303          return "";
304        }
305
306        @Override public int countIn(CharSequence sequence) {
307          return sequence.length();
308        }
309
310        @Override public CharMatcher and(CharMatcher other) {
311          return checkNotNull(other);
312        }
313
314        @Override public CharMatcher or(CharMatcher other) {
315          checkNotNull(other);
316          return this;
317        }
318
319        @Override public CharMatcher negate() {
320          return NONE;
321        }
322      };
323
324  /** Matches no characters. */
325  public static final CharMatcher NONE =
326      new FastMatcher("CharMatcher.NONE") {
327        @Override public boolean matches(char c) {
328          return false;
329        }
330
331        @Override public int indexIn(CharSequence sequence) {
332          checkNotNull(sequence);
333          return -1;
334        }
335
336        @Override public int indexIn(CharSequence sequence, int start) {
337          int length = sequence.length();
338          Preconditions.checkPositionIndex(start, length);
339          return -1;
340        }
341
342        @Override public int lastIndexIn(CharSequence sequence) {
343          checkNotNull(sequence);
344          return -1;
345        }
346
347        @Override public boolean matchesAllOf(CharSequence sequence) {
348          return sequence.length() == 0;
349        }
350
351        @Override public boolean matchesNoneOf(CharSequence sequence) {
352          checkNotNull(sequence);
353          return true;
354        }
355
356        @Override public String removeFrom(CharSequence sequence) {
357          return sequence.toString();
358        }
359
360        @Override public String replaceFrom(CharSequence sequence, char replacement) {
361          return sequence.toString();
362        }
363
364        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
365          checkNotNull(replacement);
366          return sequence.toString();
367        }
368
369        @Override public String collapseFrom(CharSequence sequence, char replacement) {
370          return sequence.toString();
371        }
372
373        @Override public String trimFrom(CharSequence sequence) {
374          return sequence.toString();
375        }
376
377        @Override
378        public String trimLeadingFrom(CharSequence sequence) {
379          return sequence.toString();
380        }
381
382        @Override
383        public String trimTrailingFrom(CharSequence sequence) {
384          return sequence.toString();
385        }
386
387        @Override public int countIn(CharSequence sequence) {
388          checkNotNull(sequence);
389          return 0;
390        }
391
392        @Override public CharMatcher and(CharMatcher other) {
393          checkNotNull(other);
394          return this;
395        }
396
397        @Override public CharMatcher or(CharMatcher other) {
398          return checkNotNull(other);
399        }
400
401        @Override public CharMatcher negate() {
402          return ANY;
403        }
404      };
405
406  // Static factories
407
408  /**
409   * Returns a {@code char} matcher that matches only one specified character.
410   */
411  public static CharMatcher is(final char match) {
412    String description = "CharMatcher.is('" + showCharacter(match) + "')";
413    return new FastMatcher(description) {
414      @Override public boolean matches(char c) {
415        return c == match;
416      }
417
418      @Override public String replaceFrom(CharSequence sequence, char replacement) {
419        return sequence.toString().replace(match, replacement);
420      }
421
422      @Override public CharMatcher and(CharMatcher other) {
423        return other.matches(match) ? this : NONE;
424      }
425
426      @Override public CharMatcher or(CharMatcher other) {
427        return other.matches(match) ? other : super.or(other);
428      }
429
430      @Override public CharMatcher negate() {
431        return isNot(match);
432      }
433
434      @GwtIncompatible("java.util.BitSet")
435      @Override
436      void setBits(BitSet table) {
437        table.set(match);
438      }
439    };
440  }
441
442  /**
443   * Returns a {@code char} matcher that matches any character except the one specified.
444   *
445   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
446   */
447  public static CharMatcher isNot(final char match) {
448    String description = "CharMatcher.isNot('" + showCharacter(match) + "')";
449    return new FastMatcher(description) {
450      @Override public boolean matches(char c) {
451        return c != match;
452      }
453
454      @Override public CharMatcher and(CharMatcher other) {
455        return other.matches(match) ? super.and(other) : other;
456      }
457
458      @Override public CharMatcher or(CharMatcher other) {
459        return other.matches(match) ? ANY : this;
460      }
461
462      @GwtIncompatible("java.util.BitSet")
463      @Override
464      void setBits(BitSet table) {
465        table.set(0, match);
466        table.set(match + 1, Character.MAX_VALUE + 1);
467      }
468
469      @Override public CharMatcher negate() {
470        return is(match);
471      }
472    };
473  }
474
475  /**
476   * Returns a {@code char} matcher that matches any character present in the given character
477   * sequence.
478   */
479  public static CharMatcher anyOf(final CharSequence sequence) {
480    switch (sequence.length()) {
481      case 0:
482        return NONE;
483      case 1:
484        return is(sequence.charAt(0));
485      case 2:
486        return isEither(sequence.charAt(0), sequence.charAt(1));
487      default:
488        // continue below to handle the general case
489    }
490    // TODO(user): is it potentially worth just going ahead and building a precomputed matcher?
491    final char[] chars = sequence.toString().toCharArray();
492    Arrays.sort(chars);
493    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
494    for (char c : chars) {
495      description.append(showCharacter(c));
496    }
497    description.append("\")");
498    return new CharMatcher(description.toString()) {
499      @Override public boolean matches(char c) {
500        return Arrays.binarySearch(chars, c) >= 0;
501      }
502
503      @Override
504      @GwtIncompatible("java.util.BitSet")
505      void setBits(BitSet table) {
506        for (char c : chars) {
507          table.set(c);
508        }
509      }
510    };
511  }
512
513  private static CharMatcher isEither(
514      final char match1,
515      final char match2) {
516    String description = "CharMatcher.anyOf(\"" +
517        showCharacter(match1) + showCharacter(match2) + "\")";
518    return new FastMatcher(description) {
519      @Override public boolean matches(char c) {
520        return c == match1 || c == match2;
521      }
522
523      @GwtIncompatible("java.util.BitSet")
524      @Override void setBits(BitSet table) {
525        table.set(match1);
526        table.set(match2);
527      }
528    };
529  }
530
531  /**
532   * Returns a {@code char} matcher that matches any character not present in the given character
533   * sequence.
534   */
535  public static CharMatcher noneOf(CharSequence sequence) {
536    return anyOf(sequence).negate();
537  }
538
539  /**
540   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
541   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
542   * CharMatcher.inRange('a', 'z')}.
543   *
544   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
545   */
546  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
547    checkArgument(endInclusive >= startInclusive);
548    String description = "CharMatcher.inRange('" +
549        showCharacter(startInclusive) + "', '" +
550        showCharacter(endInclusive) + "')";
551    return inRange(startInclusive, endInclusive, description);
552  }
553
554  static CharMatcher inRange(final char startInclusive, final char endInclusive,
555      String description) {
556    return new FastMatcher(description) {
557      @Override public boolean matches(char c) {
558        return startInclusive <= c && c <= endInclusive;
559      }
560
561      @GwtIncompatible("java.util.BitSet")
562      @Override void setBits(BitSet table) {
563        table.set(startInclusive, endInclusive + 1);
564      }
565    };
566  }
567
568  /**
569   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
570   * which operates on primitive {@code char} instances instead.
571   */
572  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
573    checkNotNull(predicate);
574    if (predicate instanceof CharMatcher) {
575      return (CharMatcher) predicate;
576    }
577    String description = "CharMatcher.forPredicate(" + predicate + ")";
578    return new CharMatcher(description) {
579      @Override public boolean matches(char c) {
580        return predicate.apply(c);
581      }
582
583      @Override public boolean apply(Character character) {
584        return predicate.apply(checkNotNull(character));
585      }
586    };
587  }
588
589  // State
590  final String description;
591
592  // Constructors
593
594  /**
595   * Sets the {@code toString()} from the given description.
596   */
597  CharMatcher(String description) {
598    this.description = description;
599  }
600
601  /**
602   * Constructor for use by subclasses. When subclassing, you may want to override
603   * {@code toString()} to provide a useful description.
604   */
605  protected CharMatcher() {
606    description = super.toString();
607  }
608
609  // Abstract methods
610
611  /** Determines a true or false value for the given character. */
612  public abstract boolean matches(char c);
613
614  // Non-static factories
615
616  /**
617   * Returns a matcher that matches any character not matched by this matcher.
618   */
619  public CharMatcher negate() {
620    return new NegatedMatcher(this);
621  }
622
623  private static class NegatedMatcher extends CharMatcher {
624    final CharMatcher original;
625
626    NegatedMatcher(String toString, CharMatcher original) {
627      super(toString);
628      this.original = original;
629    }
630
631    NegatedMatcher(CharMatcher original) {
632      this(original + ".negate()", original);
633    }
634
635    @Override public boolean matches(char c) {
636      return !original.matches(c);
637    }
638
639    @Override public boolean matchesAllOf(CharSequence sequence) {
640      return original.matchesNoneOf(sequence);
641    }
642
643    @Override public boolean matchesNoneOf(CharSequence sequence) {
644      return original.matchesAllOf(sequence);
645    }
646
647    @Override public int countIn(CharSequence sequence) {
648      return sequence.length() - original.countIn(sequence);
649    }
650
651    @GwtIncompatible("java.util.BitSet")
652    @Override
653    void setBits(BitSet table) {
654      BitSet tmp = new BitSet();
655      original.setBits(tmp);
656      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
657      table.or(tmp);
658    }
659
660    @Override public CharMatcher negate() {
661      return original;
662    }
663
664    @Override
665    CharMatcher withToString(String description) {
666      return new NegatedMatcher(description, original);
667    }
668  }
669
670  /**
671   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
672   */
673  public CharMatcher and(CharMatcher other) {
674    return new And(this, checkNotNull(other));
675  }
676
677  private static class And extends CharMatcher {
678    final CharMatcher first;
679    final CharMatcher second;
680
681    And(CharMatcher a, CharMatcher b) {
682      this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
683    }
684
685    And(CharMatcher a, CharMatcher b, String description) {
686      super(description);
687      first = checkNotNull(a);
688      second = checkNotNull(b);
689    }
690
691    @Override
692    public boolean matches(char c) {
693      return first.matches(c) && second.matches(c);
694    }
695
696    @GwtIncompatible("java.util.BitSet")
697    @Override
698    void setBits(BitSet table) {
699      BitSet tmp1 = new BitSet();
700      first.setBits(tmp1);
701      BitSet tmp2 = new BitSet();
702      second.setBits(tmp2);
703      tmp1.and(tmp2);
704      table.or(tmp1);
705    }
706
707    @Override
708    CharMatcher withToString(String description) {
709      return new And(first, second, description);
710    }
711  }
712
713  /**
714   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
715   */
716  public CharMatcher or(CharMatcher other) {
717    return new Or(this, checkNotNull(other));
718  }
719
720  private static class Or extends CharMatcher {
721    final CharMatcher first;
722    final CharMatcher second;
723
724    Or(CharMatcher a, CharMatcher b, String description) {
725      super(description);
726      first = checkNotNull(a);
727      second = checkNotNull(b);
728    }
729
730    Or(CharMatcher a, CharMatcher b) {
731      this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
732    }
733
734    @GwtIncompatible("java.util.BitSet")
735    @Override
736    void setBits(BitSet table) {
737      first.setBits(table);
738      second.setBits(table);
739    }
740
741    @Override
742    public boolean matches(char c) {
743      return first.matches(c) || second.matches(c);
744    }
745
746    @Override
747    CharMatcher withToString(String description) {
748      return new Or(first, second, description);
749    }
750  }
751
752  /**
753   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
754   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
755   * worthwhile only if the precomputed matcher is queried many thousands of times.
756   *
757   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
758   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
759   * worthwhile tradeoff in a browser.
760   */
761  public CharMatcher precomputed() {
762    return Platform.precomputeCharMatcher(this);
763  }
764
765  /**
766   * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
767   * but with their {@code toString} method overridden with the new description.
768   *
769   * <p>This is unsupported by default.
770   */
771  CharMatcher withToString(String description) {
772    throw new UnsupportedOperationException();
773  }
774
775  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
776
777  /**
778   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a
779   * method on {@link Platform} so that we can have different behavior in GWT.
780   *
781   * <p>This implementation tries to be smart in a number of ways.  It recognizes cases where
782   * the negation is cheaper to precompute than the matcher itself; it tries to build small
783   * hash tables for matchers that only match a few characters, and so on.  In the worst-case
784   * scenario, it constructs an eight-kilobyte bit array and queries that.
785   * In many situations this produces a matcher which is faster to query than the original.
786   */
787  @GwtIncompatible("java.util.BitSet")
788  CharMatcher precomputedInternal() {
789    final BitSet table = new BitSet();
790    setBits(table);
791    int totalCharacters = table.cardinality();
792    if (totalCharacters * 2 <= DISTINCT_CHARS) {
793      return precomputedPositive(totalCharacters, table, description);
794    } else {
795      // TODO(user): is it worth it to worry about the last character of large matchers?
796      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
797      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
798      String suffix = ".negate()";
799      String negatedDescription = description.endsWith(suffix)
800          ? description.substring(0, description.length() - suffix.length())
801          : description + suffix;
802      return new NegatedFastMatcher(toString(),
803          precomputedPositive(negatedCharacters, table, negatedDescription));
804    }
805  }
806
807  /**
808   * A matcher for which precomputation will not yield any significant benefit.
809   */
810  abstract static class FastMatcher extends CharMatcher {
811    FastMatcher() {
812      super();
813    }
814
815    FastMatcher(String description) {
816      super(description);
817    }
818
819    @Override
820    public final CharMatcher precomputed() {
821      return this;
822    }
823
824    @Override
825    public CharMatcher negate() {
826      return new NegatedFastMatcher(this);
827    }
828  }
829
830  static final class NegatedFastMatcher extends NegatedMatcher {
831    NegatedFastMatcher(CharMatcher original) {
832      super(original);
833    }
834
835    NegatedFastMatcher(String toString, CharMatcher original) {
836      super(toString, original);
837    }
838
839    @Override
840    public final CharMatcher precomputed() {
841      return this;
842    }
843
844    @Override
845    CharMatcher withToString(String description) {
846      return new NegatedFastMatcher(description, original);
847    }
848  }
849
850  /**
851   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
852   */
853  @GwtIncompatible("java.util.BitSet")
854  private static CharMatcher precomputedPositive(
855      int totalCharacters,
856      BitSet table,
857      String description) {
858    switch (totalCharacters) {
859      case 0:
860        return NONE;
861      case 1:
862        return is((char) table.nextSetBit(0));
863      case 2:
864        char c1 = (char) table.nextSetBit(0);
865        char c2 = (char) table.nextSetBit(c1 + 1);
866        return isEither(c1, c2);
867      default:
868        return isSmall(totalCharacters, table.length())
869            ? SmallCharMatcher.from(table, description)
870            : new BitSetMatcher(table, description);
871    }
872  }
873
874  private static boolean isSmall(int totalCharacters, int tableLength) {
875    return totalCharacters <= SmallCharMatcher.MAX_SIZE
876        && tableLength > (totalCharacters * 4 * Character.SIZE);
877        // err on the side of BitSetMatcher
878  }
879
880  @GwtIncompatible("java.util.BitSet")
881  private static class BitSetMatcher extends FastMatcher {
882    private final BitSet table;
883
884    private BitSetMatcher(BitSet table, String description) {
885      super(description);
886      if (table.length() + Long.SIZE < table.size()) {
887        table = (BitSet) table.clone();
888        // If only we could actually call BitSet.trimToSize() ourselves...
889      }
890      this.table = table;
891    }
892
893    @Override public boolean matches(char c) {
894      return table.get(c);
895    }
896
897    @Override
898    void setBits(BitSet bitSet) {
899      bitSet.or(table);
900    }
901  }
902
903  /**
904   * Sets bits in {@code table} matched by this matcher.
905   */
906  @GwtIncompatible("java.util.BitSet")
907  void setBits(BitSet table) {
908    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
909      if (matches((char) c)) {
910        table.set(c);
911      }
912    }
913  }
914
915  // Text processing routines
916
917  /**
918   * Returns {@code true} if a character sequence contains at least one matching character.
919   * Equivalent to {@code !matchesNoneOf(sequence)}.
920   *
921   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
922   * character, until this returns {@code true} or the end is reached.
923   *
924   * @param sequence the character sequence to examine, possibly empty
925   * @return {@code true} if this matcher matches at least one character in the sequence
926   * @since 8.0
927   */
928  public boolean matchesAnyOf(CharSequence sequence) {
929    return !matchesNoneOf(sequence);
930  }
931
932  /**
933   * Returns {@code true} if a character sequence contains only matching characters.
934   *
935   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
936   * character, until this returns {@code false} or the end is reached.
937   *
938   * @param sequence the character sequence to examine, possibly empty
939   * @return {@code true} if this matcher matches every character in the sequence, including when
940   *         the sequence is empty
941   */
942  public boolean matchesAllOf(CharSequence sequence) {
943    for (int i = sequence.length() - 1; i >= 0; i--) {
944      if (!matches(sequence.charAt(i))) {
945        return false;
946      }
947    }
948    return true;
949  }
950
951  /**
952   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
953   * {@code !matchesAnyOf(sequence)}.
954   *
955   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
956   * character, until this returns {@code false} or the end is reached.
957   *
958   * @param sequence the character sequence to examine, possibly empty
959   * @return {@code true} if this matcher matches every character in the sequence, including when
960   *         the sequence is empty
961   */
962  public boolean matchesNoneOf(CharSequence sequence) {
963    return indexIn(sequence) == -1;
964  }
965
966  /**
967   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
968   * matching character is present.
969   *
970   * <p>The default implementation iterates over the sequence in forward order calling {@link
971   * #matches} for each character.
972   *
973   * @param sequence the character sequence to examine from the beginning
974   * @return an index, or {@code -1} if no character matches
975   */
976  public int indexIn(CharSequence sequence) {
977    int length = sequence.length();
978    for (int i = 0; i < length; i++) {
979      if (matches(sequence.charAt(i))) {
980        return i;
981      }
982    }
983    return -1;
984  }
985
986  /**
987   * Returns the index of the first matching character in a character sequence, starting from a
988   * given position, or {@code -1} if no character matches after that position.
989   *
990   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
991   * start}, calling {@link #matches} for each character.
992   *
993   * @param sequence the character sequence to examine
994   * @param start the first index to examine; must be nonnegative and no greater than {@code
995   *        sequence.length()}
996   * @return the index of the first matching character, guaranteed to be no less than {@code start},
997   *         or {@code -1} if no character matches
998   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
999   *         sequence.length()}
1000   */
1001  public int indexIn(CharSequence sequence, int start) {
1002    int length = sequence.length();
1003    Preconditions.checkPositionIndex(start, length);
1004    for (int i = start; i < length; i++) {
1005      if (matches(sequence.charAt(i))) {
1006        return i;
1007      }
1008    }
1009    return -1;
1010  }
1011
1012  /**
1013   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1014   * matching character is present.
1015   *
1016   * <p>The default implementation iterates over the sequence in reverse order calling {@link
1017   * #matches} for each character.
1018   *
1019   * @param sequence the character sequence to examine from the end
1020   * @return an index, or {@code -1} if no character matches
1021   */
1022  public int lastIndexIn(CharSequence sequence) {
1023    for (int i = sequence.length() - 1; i >= 0; i--) {
1024      if (matches(sequence.charAt(i))) {
1025        return i;
1026      }
1027    }
1028    return -1;
1029  }
1030
1031  /**
1032   * Returns the number of matching characters found in a character sequence.
1033   */
1034  public int countIn(CharSequence sequence) {
1035    int count = 0;
1036    for (int i = 0; i < sequence.length(); i++) {
1037      if (matches(sequence.charAt(i))) {
1038        count++;
1039      }
1040    }
1041    return count;
1042  }
1043
1044  /**
1045   * Returns a string containing all non-matching characters of a character sequence, in order. For
1046   * example: <pre>   {@code
1047   *
1048   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
1049   *
1050   * ... returns {@code "bzr"}.
1051   */
1052  @CheckReturnValue
1053  public String removeFrom(CharSequence sequence) {
1054    String string = sequence.toString();
1055    int pos = indexIn(string);
1056    if (pos == -1) {
1057      return string;
1058    }
1059
1060    char[] chars = string.toCharArray();
1061    int spread = 1;
1062
1063    // This unusual loop comes from extensive benchmarking
1064    OUT: while (true) {
1065      pos++;
1066      while (true) {
1067        if (pos == chars.length) {
1068          break OUT;
1069        }
1070        if (matches(chars[pos])) {
1071          break;
1072        }
1073        chars[pos - spread] = chars[pos];
1074        pos++;
1075      }
1076      spread++;
1077    }
1078    return new String(chars, 0, pos - spread);
1079  }
1080
1081  /**
1082   * Returns a string containing all matching characters of a character sequence, in order. For
1083   * example: <pre>   {@code
1084   *
1085   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1086   *
1087   * ... returns {@code "aaa"}.
1088   */
1089  @CheckReturnValue
1090  public String retainFrom(CharSequence sequence) {
1091    return negate().removeFrom(sequence);
1092  }
1093
1094  /**
1095   * Returns a string copy of the input character sequence, with each character that matches this
1096   * matcher replaced by a given replacement character. For example: <pre>   {@code
1097   *
1098   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1099   *
1100   * ... returns {@code "rodor"}.
1101   *
1102   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1103   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1104   * character.
1105   *
1106   * @param sequence the character sequence to replace matching characters in
1107   * @param replacement the character to append to the result string in place of each matching
1108   *        character in {@code sequence}
1109   * @return the new string
1110   */
1111  @CheckReturnValue
1112  public String replaceFrom(CharSequence sequence, char replacement) {
1113    String string = sequence.toString();
1114    int pos = indexIn(string);
1115    if (pos == -1) {
1116      return string;
1117    }
1118    char[] chars = string.toCharArray();
1119    chars[pos] = replacement;
1120    for (int i = pos + 1; i < chars.length; i++) {
1121      if (matches(chars[i])) {
1122        chars[i] = replacement;
1123      }
1124    }
1125    return new String(chars);
1126  }
1127
1128  /**
1129   * Returns a string copy of the input character sequence, with each character that matches this
1130   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1131   *
1132   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1133   *
1134   * ... returns {@code "yoohoo"}.
1135   *
1136   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1137   * off calling {@link #replaceFrom(CharSequence, char)} directly.
1138   *
1139   * @param sequence the character sequence to replace matching characters in
1140   * @param replacement the characters to append to the result string in place of each matching
1141   *        character in {@code sequence}
1142   * @return the new string
1143   */
1144  @CheckReturnValue
1145  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1146    int replacementLen = replacement.length();
1147    if (replacementLen == 0) {
1148      return removeFrom(sequence);
1149    }
1150    if (replacementLen == 1) {
1151      return replaceFrom(sequence, replacement.charAt(0));
1152    }
1153
1154    String string = sequence.toString();
1155    int pos = indexIn(string);
1156    if (pos == -1) {
1157      return string;
1158    }
1159
1160    int len = string.length();
1161    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1162
1163    int oldpos = 0;
1164    do {
1165      buf.append(string, oldpos, pos);
1166      buf.append(replacement);
1167      oldpos = pos + 1;
1168      pos = indexIn(string, oldpos);
1169    } while (pos != -1);
1170
1171    buf.append(string, oldpos, len);
1172    return buf.toString();
1173  }
1174
1175  /**
1176   * Returns a substring of the input character sequence that omits all characters this matcher
1177   * matches from the beginning and from the end of the string. For example: <pre>   {@code
1178   *
1179   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1180   *
1181   * ... returns {@code "cat"}.
1182   *
1183   * <p>Note that: <pre>   {@code
1184   *
1185   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1186   *
1187   * ... is equivalent to {@link String#trim()}.
1188   */
1189  @CheckReturnValue
1190  public String trimFrom(CharSequence sequence) {
1191    int len = sequence.length();
1192    int first;
1193    int last;
1194
1195    for (first = 0; first < len; first++) {
1196      if (!matches(sequence.charAt(first))) {
1197        break;
1198      }
1199    }
1200    for (last = len - 1; last > first; last--) {
1201      if (!matches(sequence.charAt(last))) {
1202        break;
1203      }
1204    }
1205
1206    return sequence.subSequence(first, last + 1).toString();
1207  }
1208
1209  /**
1210   * Returns a substring of the input character sequence that omits all characters this matcher
1211   * matches from the beginning of the string. For example: <pre> {@code
1212   *
1213   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1214   *
1215   * ... returns {@code "catbab"}.
1216   */
1217  @CheckReturnValue
1218  public String trimLeadingFrom(CharSequence sequence) {
1219    int len = sequence.length();
1220    for (int first = 0; first < len; first++) {
1221      if (!matches(sequence.charAt(first))) {
1222        return sequence.subSequence(first, len).toString();
1223      }
1224    }
1225    return "";
1226  }
1227
1228  /**
1229   * Returns a substring of the input character sequence that omits all characters this matcher
1230   * matches from the end of the string. For example: <pre> {@code
1231   *
1232   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1233   *
1234   * ... returns {@code "abacat"}.
1235   */
1236  @CheckReturnValue
1237  public String trimTrailingFrom(CharSequence sequence) {
1238    int len = sequence.length();
1239    for (int last = len - 1; last >= 0; last--) {
1240      if (!matches(sequence.charAt(last))) {
1241        return sequence.subSequence(0, last + 1).toString();
1242      }
1243    }
1244    return "";
1245  }
1246
1247  /**
1248   * Returns a string copy of the input character sequence, with each group of consecutive
1249   * characters that match this matcher replaced by a single replacement character. For example:
1250   * <pre>   {@code
1251   *
1252   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1253   *
1254   * ... returns {@code "b-p-r"}.
1255   *
1256   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1257   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1258   * character.
1259   *
1260   * @param sequence the character sequence to replace matching groups of characters in
1261   * @param replacement the character to append to the result string in place of each group of
1262   *        matching characters in {@code sequence}
1263   * @return the new string
1264   */
1265  @CheckReturnValue
1266  public String collapseFrom(CharSequence sequence, char replacement) {
1267    // This implementation avoids unnecessary allocation.
1268    int len = sequence.length();
1269    for (int i = 0; i < len; i++) {
1270      char c = sequence.charAt(i);
1271      if (matches(c)) {
1272        if (c == replacement
1273            && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1274          // a no-op replacement
1275          i++;
1276        } else {
1277          StringBuilder builder = new StringBuilder(len)
1278              .append(sequence.subSequence(0, i))
1279              .append(replacement);
1280          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1281        }
1282      }
1283    }
1284    // no replacement needed
1285    return sequence.toString();
1286  }
1287
1288  /**
1289   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1290   * groups of matching characters at the start or end of the sequence are removed without
1291   * replacement.
1292   */
1293  @CheckReturnValue
1294  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1295    // This implementation avoids unnecessary allocation.
1296    int len = sequence.length();
1297    int first;
1298    int last;
1299
1300    for (first = 0; first < len && matches(sequence.charAt(first)); first++) {}
1301    for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {}
1302
1303    return (first == 0 && last == len - 1)
1304        ? collapseFrom(sequence, replacement)
1305        : finishCollapseFrom(
1306              sequence, first, last + 1, replacement,
1307              new StringBuilder(last + 1 - first),
1308              false);
1309  }
1310
1311  private String finishCollapseFrom(
1312      CharSequence sequence, int start, int end, char replacement,
1313      StringBuilder builder, boolean inMatchingGroup) {
1314    for (int i = start; i < end; i++) {
1315      char c = sequence.charAt(i);
1316      if (matches(c)) {
1317        if (!inMatchingGroup) {
1318          builder.append(replacement);
1319          inMatchingGroup = true;
1320        }
1321      } else {
1322        builder.append(c);
1323        inMatchingGroup = false;
1324      }
1325    }
1326    return builder.toString();
1327  }
1328
1329  // Predicate interface
1330
1331  /**
1332   * Equivalent to {@link #matches}; provided only to satisfy the {@link Predicate} interface. When
1333   * using a reference of type {@code CharMatcher}, invoke {@link #matches} directly instead.
1334   */
1335  @Override public boolean apply(Character character) {
1336    return matches(character);
1337  }
1338
1339  /**
1340   * Returns a string representation of this {@code CharMatcher}, such as
1341   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1342   */
1343  @Override
1344  public String toString() {
1345    return description;
1346  }
1347
1348  /**
1349   * A special-case CharMatcher for Unicode whitespace characters that is extremely
1350   * efficient both in space required and in time to check for matches.
1351   *
1352   * Implementation details.
1353   * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1354   * so we can construct a lookup table of exactly 79 entries, and just check the character code
1355   * mod 79, and see if that character is in the table.
1356   *
1357   * There is a 1 at the beginning of the table so that the null character is not listed
1358   * as whitespace.
1359   *
1360   * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1361   *
1362   *   * Binary search into the sorted list of characters, i.e., what
1363   *     CharMatcher.anyOf() does</li>
1364   *   * Perfect hash function into a table of size 26 (using an offset table and a special
1365   *     Jenkins hash function)</li>
1366   *   * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1367   *   * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1368   *
1369   * --Christopher Swenson, February 2012.
1370   */
1371  private static final String WHITESPACE_TABLE = "\u0001\u0000\u00a0\u0000\u0000\u0000\u0000\u0000"
1372      + "\u0000\u0009\n\u000b\u000c\r\u0000\u0000\u2028\u2029\u0000\u0000\u0000\u0000\u0000\u202f"
1373      + "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0020\u0000\u0000\u0000\u0000\u0000"
1374      + "\u0000\u0000\u0000\u0000\u0000\u3000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
1375      + "\u0000\u0000\u0085\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
1376      + "\u0000\u0000\u0000\u0000\u0000\u205f\u1680\u0000\u0000\u180e\u0000\u0000\u0000";
1377
1378  /**
1379   * Determines whether a character is whitespace according to the latest Unicode standard, as
1380   * illustrated
1381   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1382   * This is not the same definition used by other Java APIs. (See a
1383   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1384   * definitions of "whitespace"</a>.)
1385   *
1386   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1387   * to date.
1388   */
1389  public static final CharMatcher WHITESPACE = new FastMatcher("CharMatcher.WHITESPACE") {
1390
1391    @Override public boolean matches(char c) {
1392      return WHITESPACE_TABLE.charAt(c % 79) == c;
1393    }
1394  };
1395}