001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.Arrays;
027import java.util.BitSet;
028
029import javax.annotation.CheckReturnValue;
030
031/**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * <p>See the Guava User Guide article on <a href=
049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050 * {@code CharMatcher}</a>.
051 *
052 * @author Kevin Bourrillion
053 * @since 1.0
054 */
055@Beta // Possibly change from chars to code points; decide constants vs. methods
056@GwtCompatible(emulated = true)
057public abstract class CharMatcher implements Predicate<Character> {
058  // Constants
059  /**
060   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
061   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
062   * discussion of that term.
063   *
064   * @since 2.0
065   */
066  public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() {
067    @Override
068    public boolean matches(char c) {
069      switch (c) {
070        case '\t':
071        case '\n':
072        case '\013':
073        case '\f':
074        case '\r':
075        case ' ':
076        case '\u0085':
077        case '\u1680':
078        case '\u2028':
079        case '\u2029':
080        case '\u205f':
081        case '\u3000':
082          return true;
083        case '\u2007':
084          return false;
085        default:
086          return c >= '\u2000' && c <= '\u200a';
087      }
088    }
089
090    @Override
091    public String toString() {
092      return "CharMatcher.BREAKING_WHITESPACE";
093    }
094  };
095
096  /**
097   * Determines whether a character is ASCII, meaning that its code point is less than 128.
098   */
099  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
100
101  private static class RangesMatcher extends CharMatcher {
102    private final char[] rangeStarts;
103    private final char[] rangeEnds;
104
105    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
106      super(description);
107      this.rangeStarts = rangeStarts;
108      this.rangeEnds = rangeEnds;
109      checkArgument(rangeStarts.length == rangeEnds.length);
110      for (int i = 0; i < rangeStarts.length; i++) {
111        checkArgument(rangeStarts[i] <= rangeEnds[i]);
112        if (i + 1 < rangeStarts.length) {
113          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
114        }
115      }
116    }
117
118    @Override
119    public boolean matches(char c) {
120      int index = Arrays.binarySearch(rangeStarts, c);
121      if (index >= 0) {
122        return true;
123      } else {
124        index = ~index - 1;
125        return index >= 0 && c <= rangeEnds[index];
126      }
127    }
128  }
129
130  // Must be in ascending order.
131  private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6"
132      + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0"
133      + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
134
135  private static final String NINES;
136  static {
137    StringBuilder builder = new StringBuilder(ZEROES.length());
138    for (int i = 0; i < ZEROES.length(); i++) {
139      builder.append((char) (ZEROES.charAt(i) + 9));
140    }
141    NINES = builder.toString();
142  }
143
144  /**
145   * Determines whether a character is a digit according to
146   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
147   */
148  public static final CharMatcher DIGIT = new RangesMatcher(
149      "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray());
150
151  /**
152   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
153   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
154   */
155  public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
156    @Override public boolean matches(char c) {
157      return Character.isDigit(c);
158    }
159  };
160
161  /**
162   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
163   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
164   * inRange('a', 'z').or(inRange('A', 'Z'))}.
165   */
166  public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
167    @Override public boolean matches(char c) {
168      return Character.isLetter(c);
169    }
170  };
171
172  /**
173   * Determines whether a character is a letter or digit according to {@link
174   * Character#isLetterOrDigit(char) Java's definition}.
175   */
176  public static final CharMatcher JAVA_LETTER_OR_DIGIT =
177      new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
178    @Override public boolean matches(char c) {
179      return Character.isLetterOrDigit(c);
180    }
181  };
182
183  /**
184   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
185   * Java's definition}.
186   */
187  public static final CharMatcher JAVA_UPPER_CASE =
188      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
189    @Override public boolean matches(char c) {
190      return Character.isUpperCase(c);
191    }
192  };
193
194  /**
195   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
196   * Java's definition}.
197   */
198  public static final CharMatcher JAVA_LOWER_CASE =
199      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
200    @Override public boolean matches(char c) {
201      return Character.isLowerCase(c);
202    }
203  };
204
205  /**
206   * Determines whether a character is an ISO control character as specified by {@link
207   * Character#isISOControl(char)}.
208   */
209  public static final CharMatcher JAVA_ISO_CONTROL =
210      inRange('\u0000', '\u001f')
211      .or(inRange('\u007f', '\u009f'))
212      .withToString("CharMatcher.JAVA_ISO_CONTROL");
213
214  /**
215   * Determines whether a character is invisible; that is, if its Unicode category is any of
216   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
217   * PRIVATE_USE according to ICU4J.
218   */
219  public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", (
220      "\u0000\u007f\u00ad\u0600\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u206a\u3000\ud800\ufeff"
221      + "\ufff9\ufffa").toCharArray(), (
222      "\u0020\u00a0\u00ad\u0604\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u206f\u3000\uf8ff\ufeff"
223      + "\ufff9\ufffb").toCharArray());
224
225  private static String showCharacter(char c) {
226    String hex = "0123456789ABCDEF";
227    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
228    for (int i = 0; i < 4; i++) {
229      tmp[5 - i] = hex.charAt(c & 0xF);
230      c >>= 4;
231    }
232    return String.copyValueOf(tmp);
233
234  }
235
236  /**
237   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
238   * errs on the side of returning {@code false} (that is, it tends to assume a character is
239   * double-width).
240   *
241   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
242   * date.
243   */
244  public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH",
245      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
246      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
247
248  /** Matches any character. */
249  public static final CharMatcher ANY =
250      new FastMatcher("CharMatcher.ANY") {
251        @Override public boolean matches(char c) {
252          return true;
253        }
254
255        @Override public int indexIn(CharSequence sequence) {
256          return (sequence.length() == 0) ? -1 : 0;
257        }
258
259        @Override public int indexIn(CharSequence sequence, int start) {
260          int length = sequence.length();
261          Preconditions.checkPositionIndex(start, length);
262          return (start == length) ? -1 : start;
263        }
264
265        @Override public int lastIndexIn(CharSequence sequence) {
266          return sequence.length() - 1;
267        }
268
269        @Override public boolean matchesAllOf(CharSequence sequence) {
270          checkNotNull(sequence);
271          return true;
272        }
273
274        @Override public boolean matchesNoneOf(CharSequence sequence) {
275          return sequence.length() == 0;
276        }
277
278        @Override public String removeFrom(CharSequence sequence) {
279          checkNotNull(sequence);
280          return "";
281        }
282
283        @Override public String replaceFrom(CharSequence sequence, char replacement) {
284          char[] array = new char[sequence.length()];
285          Arrays.fill(array, replacement);
286          return new String(array);
287        }
288
289        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
290          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
291          for (int i = 0; i < sequence.length(); i++) {
292            retval.append(replacement);
293          }
294          return retval.toString();
295        }
296
297        @Override public String collapseFrom(CharSequence sequence, char replacement) {
298          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
299        }
300
301        @Override public String trimFrom(CharSequence sequence) {
302          checkNotNull(sequence);
303          return "";
304        }
305
306        @Override public int countIn(CharSequence sequence) {
307          return sequence.length();
308        }
309
310        @Override public CharMatcher and(CharMatcher other) {
311          return checkNotNull(other);
312        }
313
314        @Override public CharMatcher or(CharMatcher other) {
315          checkNotNull(other);
316          return this;
317        }
318
319        @Override public CharMatcher negate() {
320          return NONE;
321        }
322      };
323
324  /** Matches no characters. */
325  public static final CharMatcher NONE =
326      new FastMatcher("CharMatcher.NONE") {
327        @Override public boolean matches(char c) {
328          return false;
329        }
330
331        @Override public int indexIn(CharSequence sequence) {
332          checkNotNull(sequence);
333          return -1;
334        }
335
336        @Override public int indexIn(CharSequence sequence, int start) {
337          int length = sequence.length();
338          Preconditions.checkPositionIndex(start, length);
339          return -1;
340        }
341
342        @Override public int lastIndexIn(CharSequence sequence) {
343          checkNotNull(sequence);
344          return -1;
345        }
346
347        @Override public boolean matchesAllOf(CharSequence sequence) {
348          return sequence.length() == 0;
349        }
350
351        @Override public boolean matchesNoneOf(CharSequence sequence) {
352          checkNotNull(sequence);
353          return true;
354        }
355
356        @Override public String removeFrom(CharSequence sequence) {
357          return sequence.toString();
358        }
359
360        @Override public String replaceFrom(CharSequence sequence, char replacement) {
361          return sequence.toString();
362        }
363
364        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
365          checkNotNull(replacement);
366          return sequence.toString();
367        }
368
369        @Override public String collapseFrom(CharSequence sequence, char replacement) {
370          return sequence.toString();
371        }
372
373        @Override public String trimFrom(CharSequence sequence) {
374          return sequence.toString();
375        }
376
377        @Override
378        public String trimLeadingFrom(CharSequence sequence) {
379          return sequence.toString();
380        }
381
382        @Override
383        public String trimTrailingFrom(CharSequence sequence) {
384          return sequence.toString();
385        }
386
387        @Override public int countIn(CharSequence sequence) {
388          checkNotNull(sequence);
389          return 0;
390        }
391
392        @Override public CharMatcher and(CharMatcher other) {
393          checkNotNull(other);
394          return this;
395        }
396
397        @Override public CharMatcher or(CharMatcher other) {
398          return checkNotNull(other);
399        }
400
401        @Override public CharMatcher negate() {
402          return ANY;
403        }
404      };
405
406  // Static factories
407
408  /**
409   * Returns a {@code char} matcher that matches only one specified character.
410   */
411  public static CharMatcher is(final char match) {
412    String description = "CharMatcher.is('" + showCharacter(match) + "')";
413    return new FastMatcher(description) {
414      @Override public boolean matches(char c) {
415        return c == match;
416      }
417
418      @Override public String replaceFrom(CharSequence sequence, char replacement) {
419        return sequence.toString().replace(match, replacement);
420      }
421
422      @Override public CharMatcher and(CharMatcher other) {
423        return other.matches(match) ? this : NONE;
424      }
425
426      @Override public CharMatcher or(CharMatcher other) {
427        return other.matches(match) ? other : super.or(other);
428      }
429
430      @Override public CharMatcher negate() {
431        return isNot(match);
432      }
433
434      @GwtIncompatible("java.util.BitSet")
435      @Override
436      void setBits(BitSet table) {
437        table.set(match);
438      }
439    };
440  }
441
442  /**
443   * Returns a {@code char} matcher that matches any character except the one specified.
444   *
445   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
446   */
447  public static CharMatcher isNot(final char match) {
448    String description = "CharMatcher.isNot(" + Integer.toHexString(match) + ")";
449    return new FastMatcher(description) {
450      @Override public boolean matches(char c) {
451        return c != match;
452      }
453
454      @Override public CharMatcher and(CharMatcher other) {
455        return other.matches(match) ? super.and(other) : other;
456      }
457
458      @Override public CharMatcher or(CharMatcher other) {
459        return other.matches(match) ? ANY : this;
460      }
461
462      @GwtIncompatible("java.util.BitSet")
463      @Override
464      void setBits(BitSet table) {
465        table.set(0, match);
466        table.set(match + 1, Character.MAX_VALUE + 1);
467      }
468
469      @Override public CharMatcher negate() {
470        return is(match);
471      }
472    };
473  }
474
475  /**
476   * Returns a {@code char} matcher that matches any character present in the given character
477   * sequence.
478   */
479  public static CharMatcher anyOf(final CharSequence sequence) {
480    switch (sequence.length()) {
481      case 0:
482        return NONE;
483      case 1:
484        return is(sequence.charAt(0));
485      case 2:
486        return isEither(sequence.charAt(0), sequence.charAt(1));
487      default:
488        // continue below to handle the general case
489    }
490    // TODO(user): is it potentially worth just going ahead and building a precomputed matcher?
491    final char[] chars = sequence.toString().toCharArray();
492    Arrays.sort(chars);
493    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
494    for (char c : chars) {
495      description.append(showCharacter(c));
496    }
497    description.append("\")");
498    return new CharMatcher(description.toString()) {
499      @Override public boolean matches(char c) {
500        return Arrays.binarySearch(chars, c) >= 0;
501      }
502
503      @Override
504      @GwtIncompatible("java.util.BitSet")
505      void setBits(BitSet table) {
506        for (char c : chars) {
507          table.set(c);
508        }
509      }
510    };
511  }
512
513  private static CharMatcher isEither(
514      final char match1,
515      final char match2) {
516    String description = "CharMatcher.anyOf(\"" +
517        showCharacter(match1) + showCharacter(match2) + "\")";
518    return new FastMatcher(description) {
519      @Override public boolean matches(char c) {
520        return c == match1 || c == match2;
521      }
522
523      @GwtIncompatible("java.util.BitSet")
524      @Override void setBits(BitSet table) {
525        table.set(match1);
526        table.set(match2);
527      }
528    };
529  }
530
531  /**
532   * Returns a {@code char} matcher that matches any character not present in the given character
533   * sequence.
534   */
535  public static CharMatcher noneOf(CharSequence sequence) {
536    return anyOf(sequence).negate();
537  }
538
539  /**
540   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
541   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
542   * CharMatcher.inRange('a', 'z')}.
543   *
544   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
545   */
546  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
547    checkArgument(endInclusive >= startInclusive);
548    String description = "CharMatcher.inRange('" +
549        showCharacter(startInclusive) + "', '" +
550        showCharacter(endInclusive) + "')";
551    return inRange(startInclusive, endInclusive, description);
552  }
553
554  static CharMatcher inRange(final char startInclusive, final char endInclusive,
555      String description) {
556    return new FastMatcher(description) {
557      @Override public boolean matches(char c) {
558        return startInclusive <= c && c <= endInclusive;
559      }
560
561      @GwtIncompatible("java.util.BitSet")
562      @Override void setBits(BitSet table) {
563        table.set(startInclusive, endInclusive + 1);
564      }
565    };
566  }
567
568  /**
569   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
570   * which operates on primitive {@code char} instances instead.
571   */
572  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
573    checkNotNull(predicate);
574    if (predicate instanceof CharMatcher) {
575      return (CharMatcher) predicate;
576    }
577    String description = "CharMatcher.forPredicate(" + predicate + ")";
578    return new CharMatcher(description) {
579      @Override public boolean matches(char c) {
580        return predicate.apply(c);
581      }
582
583      @Override public boolean apply(Character character) {
584        return predicate.apply(checkNotNull(character));
585      }
586    };
587  }
588
589  // State
590  final String description;
591
592  // Constructors
593
594  /**
595   * Sets the {@code toString()} from the given description.
596   */
597  CharMatcher(String description) {
598    this.description = description;
599  }
600
601  /**
602   * Constructor for use by subclasses. When subclassing, you may want to override
603   * {@code toString()} to provide a useful description.
604   */
605  protected CharMatcher() {
606    description = super.toString();
607  }
608
609  // Abstract methods
610
  /**
   * Determines a true or false value for the given character.
   *
   * @param c the character to test
   * @return {@code true} if this matcher matches {@code c}
   */
  public abstract boolean matches(char c);
613
614  // Non-static factories
615
616  /**
617   * Returns a matcher that matches any character not matched by this matcher.
618   */
619  public CharMatcher negate() {
620    return new NegatedMatcher(this);
621  }
622
623  private static class NegatedMatcher extends CharMatcher {
624    final CharMatcher original;
625
626    NegatedMatcher(String toString, CharMatcher original) {
627      super(toString);
628      this.original = original;
629    }
630
631    NegatedMatcher(CharMatcher original) {
632      this(original + ".negate()", original);
633    }
634
635    @Override public boolean matches(char c) {
636      return !original.matches(c);
637    }
638
639    @Override public boolean matchesAllOf(CharSequence sequence) {
640      return original.matchesNoneOf(sequence);
641    }
642
643    @Override public boolean matchesNoneOf(CharSequence sequence) {
644      return original.matchesAllOf(sequence);
645    }
646
647    @Override public int countIn(CharSequence sequence) {
648      return sequence.length() - original.countIn(sequence);
649    }
650
651    @GwtIncompatible("java.util.BitSet")
652    @Override
653    void setBits(BitSet table) {
654      BitSet tmp = new BitSet();
655      original.setBits(tmp);
656      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
657      table.or(tmp);
658    }
659
660    @Override public CharMatcher negate() {
661      return original;
662    }
663
664    @Override
665    CharMatcher withToString(String description) {
666      return new NegatedMatcher(description, original);
667    }
668  }
669
670  /**
671   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
672   */
673  public CharMatcher and(CharMatcher other) {
674    return new And(this, checkNotNull(other));
675  }
676
677  private static class And extends CharMatcher {
678    final CharMatcher first;
679    final CharMatcher second;
680
681    And(CharMatcher a, CharMatcher b) {
682      this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
683    }
684
685    And(CharMatcher a, CharMatcher b, String description) {
686      super(description);
687      first = checkNotNull(a);
688      second = checkNotNull(b);
689    }
690
691    @Override
692    public boolean matches(char c) {
693      return first.matches(c) && second.matches(c);
694    }
695
696    @GwtIncompatible("java.util.BitSet")
697    @Override
698    void setBits(BitSet table) {
699      BitSet tmp1 = new BitSet();
700      first.setBits(tmp1);
701      BitSet tmp2 = new BitSet();
702      second.setBits(tmp2);
703      tmp1.and(tmp2);
704      table.or(tmp1);
705    }
706
707    @Override
708    CharMatcher withToString(String description) {
709      return new And(first, second, description);
710    }
711  }
712
713  /**
714   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
715   */
716  public CharMatcher or(CharMatcher other) {
717    return new Or(this, checkNotNull(other));
718  }
719
720  private static class Or extends CharMatcher {
721    final CharMatcher first;
722    final CharMatcher second;
723
724    Or(CharMatcher a, CharMatcher b, String description) {
725      super(description);
726      first = checkNotNull(a);
727      second = checkNotNull(b);
728    }
729
730    Or(CharMatcher a, CharMatcher b) {
731      this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
732    }
733
734    @GwtIncompatible("java.util.BitSet")
735    @Override
736    void setBits(BitSet table) {
737      first.setBits(table);
738      second.setBits(table);
739    }
740
741    @Override
742    public boolean matches(char c) {
743      return first.matches(c) || second.matches(c);
744    }
745
746    @Override
747    CharMatcher withToString(String description) {
748      return new Or(first, second, description);
749    }
750  }
751
752  /**
753   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
754   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
755   * worthwhile only if the precomputed matcher is queried many thousands of times.
756   *
757   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
758   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
759   * worthwhile tradeoff in a browser.
760   */
761  public CharMatcher precomputed() {
762    return Platform.precomputeCharMatcher(this);
763  }
764
  /**
   * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
   * but with their {@code toString} method overridden with the new description.
   *
   * <p>This is unsupported by default.
   *
   * @param description the text the returned matcher should report from {@code toString}
   * @throws UnsupportedOperationException always, in this base implementation
   */
  CharMatcher withToString(String description) {
    throw new UnsupportedOperationException();
  }
774
775  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
776
777  /**
778   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a
779   * method on {@link Platform} so that we can have different behavior in GWT.
780   *
781   * <p>This implementation tries to be smart in a number of ways.  It recognizes cases where
782   * the negation is cheaper to precompute than the matcher itself; it tries to build small
783   * hash tables for matchers that only match a few characters, and so on.  In the worst-case
784   * scenario, it constructs an eight-kilobyte bit array and queries that.
785   * In many situations this produces a matcher which is faster to query than the original.
786   */
787  @GwtIncompatible("java.util.BitSet")
788  CharMatcher precomputedInternal() {
789    final BitSet table = new BitSet();
790    setBits(table);
791    int totalCharacters = table.cardinality();
792    if (totalCharacters * 2 <= DISTINCT_CHARS) {
793      return precomputedPositive(totalCharacters, table, description);
794    } else {
795      // TODO(user): is it worth it to worry about the last character of large matchers?
796      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
797      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
798      return new NegatedFastMatcher(toString(),
799          precomputedPositive(negatedCharacters, table, description + ".negate()"));
800    }
801  }
802
803  /**
804   * A matcher for which precomputation will not yield any significant benefit.
805   */
806  abstract static class FastMatcher extends CharMatcher {
807    FastMatcher() {
808      super();
809    }
810
811    FastMatcher(String description) {
812      super(description);
813    }
814
815    @Override
816    public final CharMatcher precomputed() {
817      return this;
818    }
819
820    @Override
821    public CharMatcher negate() {
822      return new NegatedFastMatcher(this);
823    }
824  }
825
826  static final class NegatedFastMatcher extends NegatedMatcher {
827    NegatedFastMatcher(CharMatcher original) {
828      super(original);
829    }
830
831    NegatedFastMatcher(String toString, CharMatcher original) {
832      super(toString, original);
833    }
834
835    @Override
836    public final CharMatcher precomputed() {
837      return this;
838    }
839
840    @Override
841    CharMatcher withToString(String description) {
842      return new NegatedFastMatcher(description, original);
843    }
844  }
845
846  /**
847   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
848   */
849  @GwtIncompatible("java.util.BitSet")
850  private static CharMatcher precomputedPositive(
851      int totalCharacters,
852      BitSet table,
853      String description) {
854    switch (totalCharacters) {
855      case 0:
856        return NONE;
857      case 1:
858        return is((char) table.nextSetBit(0));
859      case 2:
860        char c1 = (char) table.nextSetBit(0);
861        char c2 = (char) table.nextSetBit(c1 + 1);
862        return isEither(c1, c2);
863      default:
864        return isSmall(totalCharacters, table.length())
865            ? SmallCharMatcher.from(table, description)
866            : new BitSetMatcher(table, description);
867    }
868  }
869
870  private static boolean isSmall(int totalCharacters, int tableLength) {
871    return totalCharacters <= SmallCharMatcher.MAX_SIZE
872        && tableLength > (totalCharacters * Character.SIZE);
873  }
874
875  @GwtIncompatible("java.util.BitSet")
876  private static class BitSetMatcher extends FastMatcher {
877    private final BitSet table;
878
879    private BitSetMatcher(BitSet table, String description) {
880      super(description);
881      if (table.length() + Long.SIZE < table.size()) {
882        table = (BitSet) table.clone();
883        // If only we could actually call BitSet.trimToSize() ourselves...
884      }
885      this.table = table;
886    }
887
888    @Override public boolean matches(char c) {
889      return table.get(c);
890    }
891
892    @Override
893    void setBits(BitSet bitSet) {
894      bitSet.or(table);
895    }
896  }
897
898  /**
899   * Sets bits in {@code table} matched by this matcher.
900   */
901  @GwtIncompatible("java.util.BitSet")
902  void setBits(BitSet table) {
903    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
904      if (matches((char) c)) {
905        table.set(c);
906      }
907    }
908  }
909
910  // Text processing routines
911
912  /**
913   * Returns {@code true} if a character sequence contains at least one matching character.
914   * Equivalent to {@code !matchesNoneOf(sequence)}.
915   *
916   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
917   * character, until this returns {@code true} or the end is reached.
918   *
919   * @param sequence the character sequence to examine, possibly empty
920   * @return {@code true} if this matcher matches at least one character in the sequence
921   * @since 8.0
922   */
923  public boolean matchesAnyOf(CharSequence sequence) {
924    return !matchesNoneOf(sequence);
925  }
926
927  /**
928   * Returns {@code true} if a character sequence contains only matching characters.
929   *
930   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
931   * character, until this returns {@code false} or the end is reached.
932   *
933   * @param sequence the character sequence to examine, possibly empty
934   * @return {@code true} if this matcher matches every character in the sequence, including when
935   *         the sequence is empty
936   */
937  public boolean matchesAllOf(CharSequence sequence) {
938    for (int i = sequence.length() - 1; i >= 0; i--) {
939      if (!matches(sequence.charAt(i))) {
940        return false;
941      }
942    }
943    return true;
944  }
945
946  /**
947   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
948   * {@code !matchesAnyOf(sequence)}.
949   *
950   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
951   * character, until this returns {@code false} or the end is reached.
952   *
953   * @param sequence the character sequence to examine, possibly empty
954   * @return {@code true} if this matcher matches every character in the sequence, including when
955   *         the sequence is empty
956   */
957  public boolean matchesNoneOf(CharSequence sequence) {
958    return indexIn(sequence) == -1;
959  }
960
961  /**
962   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
963   * matching character is present.
964   *
965   * <p>The default implementation iterates over the sequence in forward order calling {@link
966   * #matches} for each character.
967   *
968   * @param sequence the character sequence to examine from the beginning
969   * @return an index, or {@code -1} if no character matches
970   */
971  public int indexIn(CharSequence sequence) {
972    int length = sequence.length();
973    for (int i = 0; i < length; i++) {
974      if (matches(sequence.charAt(i))) {
975        return i;
976      }
977    }
978    return -1;
979  }
980
981  /**
982   * Returns the index of the first matching character in a character sequence, starting from a
983   * given position, or {@code -1} if no character matches after that position.
984   *
985   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
986   * start}, calling {@link #matches} for each character.
987   *
988   * @param sequence the character sequence to examine
989   * @param start the first index to examine; must be nonnegative and no greater than {@code
990   *        sequence.length()}
991   * @return the index of the first matching character, guaranteed to be no less than {@code start},
992   *         or {@code -1} if no character matches
993   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
994   *         sequence.length()}
995   */
996  public int indexIn(CharSequence sequence, int start) {
997    int length = sequence.length();
998    Preconditions.checkPositionIndex(start, length);
999    for (int i = start; i < length; i++) {
1000      if (matches(sequence.charAt(i))) {
1001        return i;
1002      }
1003    }
1004    return -1;
1005  }
1006
1007  /**
1008   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1009   * matching character is present.
1010   *
1011   * <p>The default implementation iterates over the sequence in reverse order calling {@link
1012   * #matches} for each character.
1013   *
1014   * @param sequence the character sequence to examine from the end
1015   * @return an index, or {@code -1} if no character matches
1016   */
1017  public int lastIndexIn(CharSequence sequence) {
1018    for (int i = sequence.length() - 1; i >= 0; i--) {
1019      if (matches(sequence.charAt(i))) {
1020        return i;
1021      }
1022    }
1023    return -1;
1024  }
1025
1026  /**
1027   * Returns the number of matching characters found in a character sequence.
1028   */
1029  public int countIn(CharSequence sequence) {
1030    int count = 0;
1031    for (int i = 0; i < sequence.length(); i++) {
1032      if (matches(sequence.charAt(i))) {
1033        count++;
1034      }
1035    }
1036    return count;
1037  }
1038
  /**
   * Returns a string containing all non-matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
   *
   * ... returns {@code "bzr"}.
   *
   * <p>If no character matches, the result of {@code sequence.toString()} is returned as is.
   * Otherwise the characters are copied into an array and the non-matching ones are compacted
   * towards the front in a single pass.
   *
   * @param sequence the character sequence to remove matching characters from
   * @return the characters of {@code sequence} that this matcher does not match, in their
   *         original order
   */
  @CheckReturnValue
  public String removeFrom(CharSequence sequence) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      // Nothing to remove: no array copy needed.
      return string;
    }

    char[] chars = string.toCharArray();
    // Number of matching characters dropped so far, and therefore how far each surviving
    // character has to move to the left. Starts at 1 for the match found at pos.
    int spread = 1;

    // This unusual loop comes from extensive benchmarking
    OUT: while (true) {
      // Step past the matching character that pos currently points at.
      pos++;
      while (true) {
        if (pos == chars.length) {
          break OUT;
        }
        if (matches(chars[pos])) {
          // Another character to drop: leave the copy loop so spread gets bumped.
          break;
        }
        // Keep this character, shifted left over the dropped ones.
        chars[pos - spread] = chars[pos];
        pos++;
      }
      spread++;
    }
    // pos equals chars.length here, so the kept prefix has length (chars.length - spread).
    return new String(chars, 0, pos - spread);
  }
1075
1076  /**
1077   * Returns a string containing all matching characters of a character sequence, in order. For
1078   * example: <pre>   {@code
1079   *
1080   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1081   *
1082   * ... returns {@code "aaa"}.
1083   */
1084  @CheckReturnValue
1085  public String retainFrom(CharSequence sequence) {
1086    return negate().removeFrom(sequence);
1087  }
1088
1089  /**
1090   * Returns a string copy of the input character sequence, with each character that matches this
1091   * matcher replaced by a given replacement character. For example: <pre>   {@code
1092   *
1093   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1094   *
1095   * ... returns {@code "rodor"}.
1096   *
1097   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1098   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1099   * character.
1100   *
1101   * @param sequence the character sequence to replace matching characters in
1102   * @param replacement the character to append to the result string in place of each matching
1103   *        character in {@code sequence}
1104   * @return the new string
1105   */
1106  @CheckReturnValue
1107  public String replaceFrom(CharSequence sequence, char replacement) {
1108    String string = sequence.toString();
1109    int pos = indexIn(string);
1110    if (pos == -1) {
1111      return string;
1112    }
1113    char[] chars = string.toCharArray();
1114    chars[pos] = replacement;
1115    for (int i = pos + 1; i < chars.length; i++) {
1116      if (matches(chars[i])) {
1117        chars[i] = replacement;
1118      }
1119    }
1120    return new String(chars);
1121  }
1122
1123  /**
1124   * Returns a string copy of the input character sequence, with each character that matches this
1125   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1126   *
1127   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1128   *
1129   * ... returns {@code "yoohoo"}.
1130   *
1131   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1132   * off calling {@link #replaceFrom(CharSequence, char)} directly.
1133   *
1134   * @param sequence the character sequence to replace matching characters in
1135   * @param replacement the characters to append to the result string in place of each matching
1136   *        character in {@code sequence}
1137   * @return the new string
1138   */
1139  @CheckReturnValue
1140  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1141    int replacementLen = replacement.length();
1142    if (replacementLen == 0) {
1143      return removeFrom(sequence);
1144    }
1145    if (replacementLen == 1) {
1146      return replaceFrom(sequence, replacement.charAt(0));
1147    }
1148
1149    String string = sequence.toString();
1150    int pos = indexIn(string);
1151    if (pos == -1) {
1152      return string;
1153    }
1154
1155    int len = string.length();
1156    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1157
1158    int oldpos = 0;
1159    do {
1160      buf.append(string, oldpos, pos);
1161      buf.append(replacement);
1162      oldpos = pos + 1;
1163      pos = indexIn(string, oldpos);
1164    } while (pos != -1);
1165
1166    buf.append(string, oldpos, len);
1167    return buf.toString();
1168  }
1169
1170  /**
1171   * Returns a substring of the input character sequence that omits all characters this matcher
1172   * matches from the beginning and from the end of the string. For example: <pre>   {@code
1173   *
1174   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1175   *
1176   * ... returns {@code "cat"}.
1177   *
1178   * <p>Note that: <pre>   {@code
1179   *
1180   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1181   *
1182   * ... is equivalent to {@link String#trim()}.
1183   */
1184  @CheckReturnValue
1185  public String trimFrom(CharSequence sequence) {
1186    int len = sequence.length();
1187    int first;
1188    int last;
1189
1190    for (first = 0; first < len; first++) {
1191      if (!matches(sequence.charAt(first))) {
1192        break;
1193      }
1194    }
1195    for (last = len - 1; last > first; last--) {
1196      if (!matches(sequence.charAt(last))) {
1197        break;
1198      }
1199    }
1200
1201    return sequence.subSequence(first, last + 1).toString();
1202  }
1203
1204  /**
1205   * Returns a substring of the input character sequence that omits all characters this matcher
1206   * matches from the beginning of the string. For example: <pre> {@code
1207   *
1208   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1209   *
1210   * ... returns {@code "catbab"}.
1211   */
1212  @CheckReturnValue
1213  public String trimLeadingFrom(CharSequence sequence) {
1214    int len = sequence.length();
1215    for (int first = 0; first < len; first++) {
1216      if (!matches(sequence.charAt(first))) {
1217        return sequence.subSequence(first, len).toString();
1218      }
1219    }
1220    return "";
1221  }
1222
1223  /**
1224   * Returns a substring of the input character sequence that omits all characters this matcher
1225   * matches from the end of the string. For example: <pre> {@code
1226   *
1227   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1228   *
1229   * ... returns {@code "abacat"}.
1230   */
1231  @CheckReturnValue
1232  public String trimTrailingFrom(CharSequence sequence) {
1233    int len = sequence.length();
1234    for (int last = len - 1; last >= 0; last--) {
1235      if (!matches(sequence.charAt(last))) {
1236        return sequence.subSequence(0, last + 1).toString();
1237      }
1238    }
1239    return "";
1240  }
1241
1242  /**
1243   * Returns a string copy of the input character sequence, with each group of consecutive
1244   * characters that match this matcher replaced by a single replacement character. For example:
1245   * <pre>   {@code
1246   *
1247   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1248   *
1249   * ... returns {@code "b-p-r"}.
1250   *
1251   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1252   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1253   * character.
1254   *
1255   * @param sequence the character sequence to replace matching groups of characters in
1256   * @param replacement the character to append to the result string in place of each group of
1257   *        matching characters in {@code sequence}
1258   * @return the new string
1259   */
1260  @CheckReturnValue
1261  public String collapseFrom(CharSequence sequence, char replacement) {
1262    // This implementation avoids unnecessary allocation.
1263    int len = sequence.length();
1264    for (int i = 0; i < len; i++) {
1265      char c = sequence.charAt(i);
1266      if (matches(c)) {
1267        if (c == replacement
1268            && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1269          // a no-op replacement
1270          i++;
1271        } else {
1272          StringBuilder builder = new StringBuilder(len)
1273              .append(sequence.subSequence(0, i))
1274              .append(replacement);
1275          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1276        }
1277      }
1278    }
1279    // no replacement needed
1280    return sequence.toString();
1281  }
1282
1283  /**
1284   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1285   * groups of matching characters at the start or end of the sequence are removed without
1286   * replacement.
1287   */
1288  @CheckReturnValue
1289  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1290    // This implementation avoids unnecessary allocation.
1291    int len = sequence.length();
1292    int first;
1293    int last;
1294
1295    for (first = 0; first < len && matches(sequence.charAt(first)); first++) {}
1296    for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {}
1297
1298    return (first == 0 && last == len - 1)
1299        ? collapseFrom(sequence, replacement)
1300        : finishCollapseFrom(
1301              sequence, first, last + 1, replacement,
1302              new StringBuilder(last + 1 - first),
1303              false);
1304  }
1305
1306  private String finishCollapseFrom(
1307      CharSequence sequence, int start, int end, char replacement,
1308      StringBuilder builder, boolean inMatchingGroup) {
1309    for (int i = start; i < end; i++) {
1310      char c = sequence.charAt(i);
1311      if (matches(c)) {
1312        if (!inMatchingGroup) {
1313          builder.append(replacement);
1314          inMatchingGroup = true;
1315        }
1316      } else {
1317        builder.append(c);
1318        inMatchingGroup = false;
1319      }
1320    }
1321    return builder.toString();
1322  }
1323
1324  // Predicate interface
1325
1326  /**
1327   * Equivalent to {@link #matches}; provided only to satisfy the {@link Predicate} interface. When
1328   * using a reference of type {@code CharMatcher}, invoke {@link #matches} directly instead.
1329   */
1330  @Override public boolean apply(Character character) {
1331    return matches(character);
1332  }
1333
1334  /**
1335   * Returns a string representation of this {@code CharMatcher}, such as
1336   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1337   */
1338  @Override
1339  public String toString() {
1340    return description;
1341  }
1342
  /**
   * Lookup table for a special-case CharMatcher for Unicode whitespace characters that is
   * extremely efficient both in space required and in time to check for matches.
   *
   * <p>Implementation details: it turns out that all current (early 2012) Unicode whitespace
   * characters are unique modulo 79, so we can construct a lookup table of exactly 79 entries,
   * and just check the character code mod 79, and see if that character is in the table. Slots
   * that no whitespace character maps to hold U+0000.
   *
   * <p>There is a 1 at the beginning of the table so that the null character is not listed
   * as whitespace.
   *
   * <p>Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
   * <ul>
   *   <li>Binary search into the sorted list of characters, i.e., what
   *     CharMatcher.anyOf() does</li>
   *   <li>Perfect hash function into a table of size 26 (using an offset table and a special
   *     Jenkins hash function)</li>
   *   <li>Perfect-ish hash function that required two lookups into a single table of size 26.</li>
   *   <li>Using a power-of-2 sized hash table (size 64) with linear probing.</li>
   * </ul>
   *
   * <p>--Christopher Swenson, February 2012.
   */
  private static final String WHITESPACE_TABLE = "\u0001\u0000\u00a0\u0000\u0000\u0000\u0000\u0000"
      + "\u0000\u0009\n\u000b\u000c\r\u0000\u0000\u2028\u2029\u0000\u0000\u0000\u0000\u0000\u202f"
      + "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0020\u0000\u0000\u0000\u0000\u0000"
      + "\u0000\u0000\u0000\u0000\u0000\u3000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
      + "\u0000\u0000\u0085\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
      + "\u0000\u0000\u0000\u0000\u0000\u205f\u1680\u0000\u0000\u180e\u0000\u0000\u0000";
1372
1373  /**
1374   * Determines whether a character is whitespace according to the latest Unicode standard, as
1375   * illustrated
1376   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1377   * This is not the same definition used by other Java APIs. (See a
1378   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1379   * definitions of "whitespace"</a>.)
1380   *
1381   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1382   * to date.
1383   */
1384  public static final CharMatcher WHITESPACE = new FastMatcher("CharMatcher.WHITESPACE") {
1385
1386    @Override public boolean matches(char c) {
1387      return WHITESPACE_TABLE.charAt(c % 79) == c;
1388    }
1389  };
1390}