001    /*
002     * Copyright (C) 2008 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.base;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    
022    import com.google.common.annotations.Beta;
023    import com.google.common.annotations.GwtCompatible;
024    
025    import java.util.ArrayList;
026    import java.util.Arrays;
027    import java.util.List;
028    
029    import javax.annotation.CheckReturnValue;
030    
031    /**
032     * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033     * for any {@link Object}. Also offers basic text processing methods based on this function.
034     * Implementations are strongly encouraged to be side-effect-free and immutable.
035     *
036     * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037     * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038     *
039     * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040     * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041     * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042     * treats these just as two separate characters.
043     *
044     * <p>Example usages: <pre>
045     *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046     *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047     *
048     * <p>See the Guava User Guide article on <a href=
049     * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050     * {@code CharMatcher}</a>.
051     *
052     * @author Kevin Bourrillion
053     * @since 1.0
054     */
055    @Beta // Possibly change from chars to code points; decide constants vs. methods
056    @GwtCompatible
057    public abstract class CharMatcher implements Predicate<Character> {
058      // Constants
059      /**
060       * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
061       * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
062       * discussion of that term.
063       *
064       * @since 2.0
065       */
066      public static final CharMatcher BREAKING_WHITESPACE =
067          anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
068              .or(inRange('\u2000', '\u2006'))
069              .or(inRange('\u2008', '\u200a'))
070              .withToString("CharMatcher.BREAKING_WHITESPACE")
071              .precomputed();
072    
073      /**
074       * Determines whether a character is ASCII, meaning that its code point is less than 128.
075       */
076      public static final CharMatcher ASCII = inRange('\0', '\u007f')
077          .withToString("CharMatcher.ASCII");
078    
079      /**
080       * Determines whether a character is a digit according to
081       * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
082       */
083      public static final CharMatcher DIGIT;
084    
085      static {
086        CharMatcher digit = inRange('0', '9');
087        String zeroes =
088            "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
089                + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
090                + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
091        for (char base : zeroes.toCharArray()) {
092          digit = digit.or(inRange(base, (char) (base + 9)));
093        }
094        DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed();
095      }
096    
097      /**
098       * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
099       * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
100       */
101      public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
102        @Override public boolean matches(char c) {
103          return Character.isDigit(c);
104        }
105    
106        @Override public String toString() {
107          return "CharMatcher.JAVA_DIGIT";
108        }
109      };
110    
111      /**
112       * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
113       * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
114       * inRange('a', 'z').or(inRange('A', 'Z'))}.
115       */
116      public static final CharMatcher JAVA_LETTER = new CharMatcher() {
117        @Override public boolean matches(char c) {
118          return Character.isLetter(c);
119        }
120    
121        @Override public String toString() {
122          return "CharMatcher.JAVA_LETTER";
123        }
124      };
125    
126      /**
127       * Determines whether a character is a letter or digit according to {@link
128       * Character#isLetterOrDigit(char) Java's definition}.
129       */
130      public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
131        @Override public boolean matches(char c) {
132          return Character.isLetterOrDigit(c);
133        }
134    
135        @Override public String toString() {
136          return "CharMatcher.JAVA_LETTER_OR_DIGIT";
137        }
138      };
139    
140      /**
141       * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
142       * Java's definition}.
143       */
144      public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
145        @Override public boolean matches(char c) {
146          return Character.isUpperCase(c);
147        }
148    
149        @Override public String toString() {
150          return "CharMatcher.JAVA_UPPER_CASE";
151        }
152      };
153    
154      /**
155       * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
156       * Java's definition}.
157       */
158      public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
159        @Override public boolean matches(char c) {
160          return Character.isLowerCase(c);
161        }
162    
163        @Override public String toString() {
164          return "CharMatcher.JAVA_LOWER_CASE";
165        }
166      };
167    
168      /**
169       * Determines whether a character is an ISO control character as specified by {@link
170       * Character#isISOControl(char)}.
171       */
172      public static final CharMatcher JAVA_ISO_CONTROL =
173          inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'))
174          .withToString("CharMatcher.JAVA_ISO_CONTROL");
175    
176      /**
177       * Determines whether a character is invisible; that is, if its Unicode category is any of
178       * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
179       * PRIVATE_USE according to ICU4J.
180       */
181      public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
182          .or(inRange('\u007f', '\u00a0'))
183          .or(is('\u00ad'))
184          .or(inRange('\u0600', '\u0603'))
185          .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
186          .or(inRange('\u2000', '\u200f'))
187          .or(inRange('\u2028', '\u202f'))
188          .or(inRange('\u205f', '\u2064'))
189          .or(inRange('\u206a', '\u206f'))
190          .or(is('\u3000'))
191          .or(inRange('\ud800', '\uf8ff'))
192          .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
193          .withToString("CharMatcher.INVISIBLE")
194          .precomputed();
195    
196      /**
197       * Determines whether a character is single-width (not double-width). When in doubt, this matcher
198       * errs on the side of returning {@code false} (that is, it tends to assume a character is
199       * double-width).
200       *
201       * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
202       * date.
203       */
204      public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
205          .or(is('\u05be'))
206          .or(inRange('\u05d0', '\u05ea'))
207          .or(is('\u05f3'))
208          .or(is('\u05f4'))
209          .or(inRange('\u0600', '\u06ff'))
210          .or(inRange('\u0750', '\u077f'))
211          .or(inRange('\u0e00', '\u0e7f'))
212          .or(inRange('\u1e00', '\u20af'))
213          .or(inRange('\u2100', '\u213a'))
214          .or(inRange('\ufb50', '\ufdff'))
215          .or(inRange('\ufe70', '\ufeff'))
216          .or(inRange('\uff61', '\uffdc'))
217          .withToString("CharMatcher.SINGLE_WIDTH")
218          .precomputed();
219    
220      /** Matches any character. */
221      public static final CharMatcher ANY =
222          new CharMatcher() {
223            @Override public boolean matches(char c) {
224              return true;
225            }
226    
227            @Override public int indexIn(CharSequence sequence) {
228              return (sequence.length() == 0) ? -1 : 0;
229            }
230    
231            @Override public int indexIn(CharSequence sequence, int start) {
232              int length = sequence.length();
233              Preconditions.checkPositionIndex(start, length);
234              return (start == length) ? -1 : start;
235            }
236    
237            @Override public int lastIndexIn(CharSequence sequence) {
238              return sequence.length() - 1;
239            }
240    
241            @Override public boolean matchesAllOf(CharSequence sequence) {
242              checkNotNull(sequence);
243              return true;
244            }
245    
246            @Override public boolean matchesNoneOf(CharSequence sequence) {
247              return sequence.length() == 0;
248            }
249    
250            @Override public String removeFrom(CharSequence sequence) {
251              checkNotNull(sequence);
252              return "";
253            }
254    
255            @Override public String replaceFrom(CharSequence sequence, char replacement) {
256              char[] array = new char[sequence.length()];
257              Arrays.fill(array, replacement);
258              return new String(array);
259            }
260    
261            @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
262              StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
263              for (int i = 0; i < sequence.length(); i++) {
264                retval.append(replacement);
265              }
266              return retval.toString();
267            }
268    
269            @Override public String collapseFrom(CharSequence sequence, char replacement) {
270              return (sequence.length() == 0) ? "" : String.valueOf(replacement);
271            }
272    
273            @Override public String trimFrom(CharSequence sequence) {
274              checkNotNull(sequence);
275              return "";
276            }
277    
278            @Override public int countIn(CharSequence sequence) {
279              return sequence.length();
280            }
281    
282            @Override public CharMatcher and(CharMatcher other) {
283              return checkNotNull(other);
284            }
285    
286            @Override public CharMatcher or(CharMatcher other) {
287              checkNotNull(other);
288              return this;
289            }
290    
291            @Override public CharMatcher negate() {
292              return NONE;
293            }
294    
295            @Override public CharMatcher precomputed() {
296              return this;
297            }
298    
299            @Override public String toString() {
300              return "CharMatcher.ANY";
301            }
302          };
303    
304      /** Matches no characters. */
305      public static final CharMatcher NONE =
306          new CharMatcher() {
307            @Override public boolean matches(char c) {
308              return false;
309            }
310    
311            @Override public int indexIn(CharSequence sequence) {
312              checkNotNull(sequence);
313              return -1;
314            }
315    
316            @Override public int indexIn(CharSequence sequence, int start) {
317              int length = sequence.length();
318              Preconditions.checkPositionIndex(start, length);
319              return -1;
320            }
321    
322            @Override public int lastIndexIn(CharSequence sequence) {
323              checkNotNull(sequence);
324              return -1;
325            }
326    
327            @Override public boolean matchesAllOf(CharSequence sequence) {
328              return sequence.length() == 0;
329            }
330    
331            @Override public boolean matchesNoneOf(CharSequence sequence) {
332              checkNotNull(sequence);
333              return true;
334            }
335    
336            @Override public String removeFrom(CharSequence sequence) {
337              return sequence.toString();
338            }
339    
340            @Override public String replaceFrom(CharSequence sequence, char replacement) {
341              return sequence.toString();
342            }
343    
344            @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
345              checkNotNull(replacement);
346              return sequence.toString();
347            }
348    
349            @Override public String collapseFrom(CharSequence sequence, char replacement) {
350              return sequence.toString();
351            }
352    
353            @Override public String trimFrom(CharSequence sequence) {
354              return sequence.toString();
355            }
356    
357            @Override public int countIn(CharSequence sequence) {
358              checkNotNull(sequence);
359              return 0;
360            }
361    
362            @Override public CharMatcher and(CharMatcher other) {
363              checkNotNull(other);
364              return this;
365            }
366    
367            @Override public CharMatcher or(CharMatcher other) {
368              return checkNotNull(other);
369            }
370    
371            @Override public CharMatcher negate() {
372              return ANY;
373            }
374    
375            @Override void setBits(LookupTable table) {}
376    
377            @Override public CharMatcher precomputed() {
378              return this;
379            }
380    
381            @Override public String toString() {
382              return "CharMatcher.NONE";
383            }
384          };
385    
386      // Static factories
387    
388      /**
389       * Returns a {@code char} matcher that matches only one specified character.
390       */
391      public static CharMatcher is(final char match) {
392        return new CharMatcher() {
393          @Override public boolean matches(char c) {
394            return c == match;
395          }
396    
397          @Override public String replaceFrom(CharSequence sequence, char replacement) {
398            return sequence.toString().replace(match, replacement);
399          }
400    
401          @Override public CharMatcher and(CharMatcher other) {
402            return other.matches(match) ? this : NONE;
403          }
404    
405          @Override public CharMatcher or(CharMatcher other) {
406            return other.matches(match) ? other : super.or(other);
407          }
408    
409          @Override public CharMatcher negate() {
410            return isNot(match);
411          }
412    
413          @Override void setBits(LookupTable table) {
414            table.set(match);
415          }
416    
417          @Override public CharMatcher precomputed() {
418            return this;
419          }
420    
421          @Override public String toString() {
422            return new StringBuilder("CharMatcher.is(")
423                .append(Integer.toHexString(match))
424                .append(")")
425                .toString();
426          }
427        };
428      }
429    
430      /**
431       * Returns a {@code char} matcher that matches any character except the one specified.
432       *
433       * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
434       */
435      public static CharMatcher isNot(final char match) {
436        return new CharMatcher() {
437          @Override public boolean matches(char c) {
438            return c != match;
439          }
440    
441          @Override public CharMatcher and(CharMatcher other) {
442            return other.matches(match) ? super.and(other) : other;
443          }
444    
445          @Override public CharMatcher or(CharMatcher other) {
446            return other.matches(match) ? ANY : this;
447          }
448    
449          @Override public CharMatcher negate() {
450            return is(match);
451          }
452    
453          @Override public String toString() {
454            return new StringBuilder("CharMatcher.isNot(")
455                .append(Integer.toHexString(match))
456                .append(")")
457                .toString();
458          }
459        };
460      }
461    
462      /**
463       * Returns a {@code char} matcher that matches any character present in the given character
464       * sequence.
465       */
466      public static CharMatcher anyOf(final CharSequence sequence) {
467        switch (sequence.length()) {
468          case 0:
469            return NONE;
470          case 1:
471            return is(sequence.charAt(0));
472          case 2:
473            final char match1 = sequence.charAt(0);
474            final char match2 = sequence.charAt(1);
475            return new CharMatcher() {
476              @Override public boolean matches(char c) {
477                return c == match1 || c == match2;
478              }
479    
480              @Override void setBits(LookupTable table) {
481                table.set(match1);
482                table.set(match2);
483              }
484    
485              @Override public CharMatcher precomputed() {
486                return this;
487              }
488            };
489        }
490    
491        final char[] chars = sequence.toString().toCharArray();
492        Arrays.sort(chars); // not worth collapsing duplicates
493    
494        return new CharMatcher() {
495          @Override public boolean matches(char c) {
496            return Arrays.binarySearch(chars, c) >= 0;
497          }
498    
499          @Override void setBits(LookupTable table) {
500            for (char c : chars) {
501              table.set(c);
502            }
503          }
504    
505          @Override public String toString() {
506            return new StringBuilder("CharMatcher.anyOf(\"").append(chars).append("\")").toString();
507          }
508        };
509      }
510    
511      /**
512       * Returns a {@code char} matcher that matches any character not present in the given character
513       * sequence.
514       */
515      public static CharMatcher noneOf(CharSequence sequence) {
516        return anyOf(sequence).negate();
517      }
518    
519      /**
520       * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
521       * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
522       * CharMatcher.inRange('a', 'z')}.
523       *
524       * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
525       */
526      public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
527        checkArgument(endInclusive >= startInclusive);
528        return new CharMatcher() {
529          @Override public boolean matches(char c) {
530            return startInclusive <= c && c <= endInclusive;
531          }
532    
533          @Override void setBits(LookupTable table) {
534            char c = startInclusive;
535            while (true) {
536              table.set(c);
537              if (c++ == endInclusive) {
538                break;
539              }
540            }
541          }
542    
543          @Override public CharMatcher precomputed() {
544            return this;
545          }
546    
547          @Override public String toString() {
548            return new StringBuilder("CharMatcher.inRange(")
549                .append(Integer.toHexString(startInclusive))
550                .append(", ")
551                .append(Integer.toHexString(endInclusive))
552                .append(")")
553                .toString();
554          }
555        };
556      }
557    
558      /**
559       * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
560       * which operates on primitive {@code char} instances instead.
561       */
562      public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
563        checkNotNull(predicate);
564        if (predicate instanceof CharMatcher) {
565          return (CharMatcher) predicate;
566        }
567        return new CharMatcher() {
568          @Override public boolean matches(char c) {
569            return predicate.apply(c);
570          }
571    
572          @Override public boolean apply(Character character) {
573            return predicate.apply(checkNotNull(character));
574          }
575    
576          @Override public String toString() {
577            return new StringBuilder("CharMatcher.forPredicate(")
578                .append(predicate)
579                .append(')')
580                .toString();
581          }
582        };
583      }
584    
585      // Constructors
586    
587      /**
588       * Constructor for use by subclasses.
589       */
590      protected CharMatcher() {}
591    
592      // Abstract methods
593    
594      /** Determines a true or false value for the given character. */
595      public abstract boolean matches(char c);
596    
597      // Non-static factories
598    
599      /**
600       * Returns a matcher that matches any character not matched by this matcher.
601       */
602      public CharMatcher negate() {
603        final CharMatcher original = this;
604        return new CharMatcher() {
605          @Override public boolean matches(char c) {
606            return !original.matches(c);
607          }
608    
609          @Override public boolean matchesAllOf(CharSequence sequence) {
610            return original.matchesNoneOf(sequence);
611          }
612    
613          @Override public boolean matchesNoneOf(CharSequence sequence) {
614            return original.matchesAllOf(sequence);
615          }
616    
617          @Override public int countIn(CharSequence sequence) {
618            return sequence.length() - original.countIn(sequence);
619          }
620    
621          @Override public CharMatcher negate() {
622            return original;
623          }
624    
625          @Override public String toString() {
626            return original + ".negate()";
627          }
628        };
629      }
630    
631      /**
632       * Returns a matcher that matches any character matched by both this matcher and {@code other}.
633       */
634      public CharMatcher and(CharMatcher other) {
635        return new And(Arrays.asList(this, checkNotNull(other)));
636      }
637    
638      private static class And extends CharMatcher {
639        List<CharMatcher> components;
640    
641        And(List<CharMatcher> components) {
642          this.components = components; // Skip defensive copy (private)
643        }
644    
645        @Override public boolean matches(char c) {
646          for (CharMatcher matcher : components) {
647            if (!matcher.matches(c)) {
648              return false;
649            }
650          }
651          return true;
652        }
653    
654        @Override public CharMatcher and(CharMatcher other) {
655          List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
656          newComponents.add(checkNotNull(other));
657          return new And(newComponents);
658        }
659    
660        @Override public String toString() {
661          StringBuilder builder = new StringBuilder("CharMatcher.and(");
662          Joiner.on(", ").appendTo(builder, components);
663          return builder.append(')').toString();
664        }
665      }
666    
667      /**
668       * Returns a matcher that matches any character matched by either this matcher or {@code other}.
669       */
670      public CharMatcher or(CharMatcher other) {
671        return new Or(Arrays.asList(this, checkNotNull(other)));
672      }
673    
674      private static class Or extends CharMatcher {
675        List<CharMatcher> components;
676    
677        Or(List<CharMatcher> components) {
678          this.components = components; // Skip defensive copy (private)
679        }
680    
681        @Override public boolean matches(char c) {
682          for (CharMatcher matcher : components) {
683            if (matcher.matches(c)) {
684              return true;
685            }
686          }
687          return false;
688        }
689    
690        @Override public CharMatcher or(CharMatcher other) {
691          List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
692          newComponents.add(checkNotNull(other));
693          return new Or(newComponents);
694        }
695    
696        @Override void setBits(LookupTable table) {
697          for (CharMatcher matcher : components) {
698            matcher.setBits(table);
699          }
700        }
701    
702        @Override public String toString() {
703          StringBuilder builder = new StringBuilder("CharMatcher.or(");
704          Joiner.on(", ").appendTo(builder, components);
705          return builder.append(')').toString();
706        }
707      }
708    
709      /**
710       * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
711       * query than the original; your mileage may vary. Precomputation takes time and is likely to be
712       * worthwhile only if the precomputed matcher is queried many thousands of times.
713       *
714       * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
715       * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
716       * worthwhile tradeoff in a browser.
717       */
718      public CharMatcher precomputed() {
719        return Platform.precomputeCharMatcher(this);
720      }
721    
722      /**
723       * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
724       * on {@link Platform} so that we can have different behavior in GWT.
725       *
726       * <p>The default precomputation is to cache the configuration of the original matcher in an
727       * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
728       * than the original.
729       *
730       * <p>The default implementation creates a new bit array and passes it to {@link
731       * #setBits(LookupTable)}.
732       */
733      CharMatcher precomputedInternal() {
734        final LookupTable table = new LookupTable();
735        setBits(table);
736        final CharMatcher outer = this;
737    
738        return new CharMatcher() {
739          @Override public boolean matches(char c) {
740            return table.get(c);
741          }
742    
743          // TODO(kevinb): make methods like negate() smart?
744    
745          @Override public CharMatcher precomputed() {
746            return this;
747          }
748    
749          @Override public String toString() {
750            return outer.toString();
751          }
752        };
753      }
754    
755      CharMatcher withToString(final String toString) {
756        final CharMatcher delegate = this;
757        return new CharMatcher() {
758          @Override
759          public boolean matches(char c) {
760            return delegate.matches(c);
761          }
762    
763          @Override
764          void setBits(LookupTable table) {
765            delegate.setBits(table);
766          }
767    
768          @Override
769          public String toString() {
770            return toString;
771          }
772        };
773      }
774    
775      /**
776       * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
777       * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
778       *
779       * <p>The default implementation loops over every possible character value, invoking {@link
780       * #matches} for each one.
781       */
782      void setBits(LookupTable table) {
783        char c = Character.MIN_VALUE;
784        while (true) {
785          if (matches(c)) {
786            table.set(c);
787          }
788          if (c++ == Character.MAX_VALUE) {
789            break;
790          }
791        }
792      }
793    
794      /**
795       * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
796       *
797       * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
798       * simpler java.util.BitSet.
799       */
800      private static final class LookupTable {
801        int[] data = new int[2048];
802    
803        void set(char index) {
804          data[index >> 5] |= (1 << index);
805        }
806    
807        boolean get(char index) {
808          return (data[index >> 5] & (1 << index)) != 0;
809        }
810      }
811    
812      // Text processing routines
813    
814      /**
815       * Returns {@code true} if a character sequence contains at least one matching character.
816       * Equivalent to {@code !matchesNoneOf(sequence)}.
817       *
818       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
819       * character, until this returns {@code true} or the end is reached.
820       *
821       * @param sequence the character sequence to examine, possibly empty
822       * @return {@code true} if this matcher matches at least one character in the sequence
823       * @since 8.0
824       */
825      public boolean matchesAnyOf(CharSequence sequence) {
826        return !matchesNoneOf(sequence);
827      }
828    
829      /**
830       * Returns {@code true} if a character sequence contains only matching characters.
831       *
832       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
833       * character, until this returns {@code false} or the end is reached.
834       *
835       * @param sequence the character sequence to examine, possibly empty
836       * @return {@code true} if this matcher matches every character in the sequence, including when
837       *         the sequence is empty
838       */
839      public boolean matchesAllOf(CharSequence sequence) {
840        for (int i = sequence.length() - 1; i >= 0; i--) {
841          if (!matches(sequence.charAt(i))) {
842            return false;
843          }
844        }
845        return true;
846      }
847    
848      /**
849       * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
850       * {@code !matchesAnyOf(sequence)}.
851       *
852       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
853       * character, until this returns {@code false} or the end is reached.
854       *
855       * @param sequence the character sequence to examine, possibly empty
856       * @return {@code true} if this matcher matches every character in the sequence, including when
857       *         the sequence is empty
858       */
859      public boolean matchesNoneOf(CharSequence sequence) {
860        return indexIn(sequence) == -1;
861      }
862    
863      /**
864       * Returns the index of the first matching character in a character sequence, or {@code -1} if no
865       * matching character is present.
866       *
867       * <p>The default implementation iterates over the sequence in forward order calling {@link
868       * #matches} for each character.
869       *
870       * @param sequence the character sequence to examine from the beginning
871       * @return an index, or {@code -1} if no character matches
872       */
873      public int indexIn(CharSequence sequence) {
874        int length = sequence.length();
875        for (int i = 0; i < length; i++) {
876          if (matches(sequence.charAt(i))) {
877            return i;
878          }
879        }
880        return -1;
881      }
882    
883      /**
884       * Returns the index of the first matching character in a character sequence, starting from a
885       * given position, or {@code -1} if no character matches after that position.
886       *
887       * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
888       * start}, calling {@link #matches} for each character.
889       *
890       * @param sequence the character sequence to examine
891       * @param start the first index to examine; must be nonnegative and no greater than {@code
892       *        sequence.length()}
893       * @return the index of the first matching character, guaranteed to be no less than {@code start},
894       *         or {@code -1} if no character matches
895       * @throws IndexOutOfBoundsException if start is negative or greater than {@code
896       *         sequence.length()}
897       */
898      public int indexIn(CharSequence sequence, int start) {
899        int length = sequence.length();
900        Preconditions.checkPositionIndex(start, length);
901        for (int i = start; i < length; i++) {
902          if (matches(sequence.charAt(i))) {
903            return i;
904          }
905        }
906        return -1;
907      }
908    
909      /**
910       * Returns the index of the last matching character in a character sequence, or {@code -1} if no
911       * matching character is present.
912       *
913       * <p>The default implementation iterates over the sequence in reverse order calling {@link
914       * #matches} for each character.
915       *
916       * @param sequence the character sequence to examine from the end
917       * @return an index, or {@code -1} if no character matches
918       */
919      public int lastIndexIn(CharSequence sequence) {
920        for (int i = sequence.length() - 1; i >= 0; i--) {
921          if (matches(sequence.charAt(i))) {
922            return i;
923          }
924        }
925        return -1;
926      }
927    
928      /**
929       * Returns the number of matching characters found in a character sequence.
930       */
931      public int countIn(CharSequence sequence) {
932        int count = 0;
933        for (int i = 0; i < sequence.length(); i++) {
934          if (matches(sequence.charAt(i))) {
935            count++;
936          }
937        }
938        return count;
939      }
940    
941      /**
942       * Returns a string containing all non-matching characters of a character sequence, in order. For
943       * example: <pre>   {@code
944       *
945       *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
946       *
947       * ... returns {@code "bzr"}.
948       */
949      @CheckReturnValue
950      public String removeFrom(CharSequence sequence) {
951        String string = sequence.toString();
952        int pos = indexIn(string);
953        if (pos == -1) {
954          return string;
955        }
956    
957        char[] chars = string.toCharArray();
958        int spread = 1;
959    
960        // This unusual loop comes from extensive benchmarking
961        OUT: while (true) {
962          pos++;
963          while (true) {
964            if (pos == chars.length) {
965              break OUT;
966            }
967            if (matches(chars[pos])) {
968              break;
969            }
970            chars[pos - spread] = chars[pos];
971            pos++;
972          }
973          spread++;
974        }
975        return new String(chars, 0, pos - spread);
976      }
977    
978      /**
979       * Returns a string containing all matching characters of a character sequence, in order. For
980       * example: <pre>   {@code
981       *
982       *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
983       *
984       * ... returns {@code "aaa"}.
985       */
986      @CheckReturnValue
987      public String retainFrom(CharSequence sequence) {
988        return negate().removeFrom(sequence);
989      }
990    
991      /**
992       * Returns a string copy of the input character sequence, with each character that matches this
993       * matcher replaced by a given replacement character. For example: <pre>   {@code
994       *
995       *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
996       *
997       * ... returns {@code "rodor"}.
998       *
999       * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1000       * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1001       * character.
1002       *
1003       * @param sequence the character sequence to replace matching characters in
1004       * @param replacement the character to append to the result string in place of each matching
1005       *        character in {@code sequence}
1006       * @return the new string
1007       */
1008      @CheckReturnValue
1009      public String replaceFrom(CharSequence sequence, char replacement) {
1010        String string = sequence.toString();
1011        int pos = indexIn(string);
1012        if (pos == -1) {
1013          return string;
1014        }
1015        char[] chars = string.toCharArray();
1016        chars[pos] = replacement;
1017        for (int i = pos + 1; i < chars.length; i++) {
1018          if (matches(chars[i])) {
1019            chars[i] = replacement;
1020          }
1021        }
1022        return new String(chars);
1023      }
1024    
1025      /**
1026       * Returns a string copy of the input character sequence, with each character that matches this
1027       * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1028       *
1029       *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1030       *
1031       * ... returns {@code "yoohoo"}.
1032       *
1033       * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1034       * off calling {@link #replaceFrom(CharSequence, char)} directly.
1035       *
1036       * @param sequence the character sequence to replace matching characters in
1037       * @param replacement the characters to append to the result string in place of each matching
1038       *        character in {@code sequence}
1039       * @return the new string
1040       */
1041      @CheckReturnValue
1042      public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1043        int replacementLen = replacement.length();
1044        if (replacementLen == 0) {
1045          return removeFrom(sequence);
1046        }
1047        if (replacementLen == 1) {
1048          return replaceFrom(sequence, replacement.charAt(0));
1049        }
1050    
1051        String string = sequence.toString();
1052        int pos = indexIn(string);
1053        if (pos == -1) {
1054          return string;
1055        }
1056    
1057        int len = string.length();
1058        StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1059    
1060        int oldpos = 0;
1061        do {
1062          buf.append(string, oldpos, pos);
1063          buf.append(replacement);
1064          oldpos = pos + 1;
1065          pos = indexIn(string, oldpos);
1066        } while (pos != -1);
1067    
1068        buf.append(string, oldpos, len);
1069        return buf.toString();
1070      }
1071    
1072      /**
1073       * Returns a substring of the input character sequence that omits all characters this matcher
1074       * matches from the beginning and from the end of the string. For example: <pre>   {@code
1075       *
1076       *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1077       *
1078       * ... returns {@code "cat"}.
1079       *
1080       * <p>Note that: <pre>   {@code
1081       *
1082       *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1083       *
1084       * ... is equivalent to {@link String#trim()}.
1085       */
1086      @CheckReturnValue
1087      public String trimFrom(CharSequence sequence) {
1088        int len = sequence.length();
1089        int first;
1090        int last;
1091    
1092        for (first = 0; first < len; first++) {
1093          if (!matches(sequence.charAt(first))) {
1094            break;
1095          }
1096        }
1097        for (last = len - 1; last > first; last--) {
1098          if (!matches(sequence.charAt(last))) {
1099            break;
1100          }
1101        }
1102    
1103        return sequence.subSequence(first, last + 1).toString();
1104      }
1105    
1106      /**
1107       * Returns a substring of the input character sequence that omits all characters this matcher
1108       * matches from the beginning of the string. For example: <pre> {@code
1109       *
1110       *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1111       *
1112       * ... returns {@code "catbab"}.
1113       */
1114      @CheckReturnValue
1115      public String trimLeadingFrom(CharSequence sequence) {
1116        int len = sequence.length();
1117        int first;
1118    
1119        for (first = 0; first < len; first++) {
1120          if (!matches(sequence.charAt(first))) {
1121            break;
1122          }
1123        }
1124    
1125        return sequence.subSequence(first, len).toString();
1126      }
1127    
1128      /**
1129       * Returns a substring of the input character sequence that omits all characters this matcher
1130       * matches from the end of the string. For example: <pre> {@code
1131       *
1132       *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1133       *
1134       * ... returns {@code "abacat"}.
1135       */
1136      @CheckReturnValue
1137      public String trimTrailingFrom(CharSequence sequence) {
1138        int len = sequence.length();
1139        int last;
1140    
1141        for (last = len - 1; last >= 0; last--) {
1142          if (!matches(sequence.charAt(last))) {
1143            break;
1144          }
1145        }
1146    
1147        return sequence.subSequence(0, last + 1).toString();
1148      }
1149    
1150      /**
1151       * Returns a string copy of the input character sequence, with each group of consecutive
1152       * characters that match this matcher replaced by a single replacement character. For example:
1153       * <pre>   {@code
1154       *
1155       *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1156       *
1157       * ... returns {@code "b-p-r"}.
1158       *
1159       * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1160       * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1161       * character.
1162       *
1163       * @param sequence the character sequence to replace matching groups of characters in
1164       * @param replacement the character to append to the result string in place of each group of
1165       *        matching characters in {@code sequence}
1166       * @return the new string
1167       */
1168      @CheckReturnValue
1169      public String collapseFrom(CharSequence sequence, char replacement) {
1170        int first = indexIn(sequence);
1171        if (first == -1) {
1172          return sequence.toString();
1173        }
1174    
1175        // TODO(kevinb): see if this implementation can be made faster
1176        StringBuilder builder = new StringBuilder(sequence.length())
1177            .append(sequence.subSequence(0, first))
1178            .append(replacement);
1179        boolean in = true;
1180        for (int i = first + 1; i < sequence.length(); i++) {
1181          char c = sequence.charAt(i);
1182          if (matches(c)) {
1183            if (!in) {
1184              builder.append(replacement);
1185              in = true;
1186            }
1187          } else {
1188            builder.append(c);
1189            in = false;
1190          }
1191        }
1192        return builder.toString();
1193      }
1194    
1195      /**
1196       * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1197       * groups of matching characters at the start or end of the sequence are removed without
1198       * replacement.
1199       */
1200      @CheckReturnValue
1201      public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1202        int first = negate().indexIn(sequence);
1203        if (first == -1) {
1204          return ""; // everything matches. nothing's left.
1205        }
1206        StringBuilder builder = new StringBuilder(sequence.length());
1207        boolean inMatchingGroup = false;
1208        for (int i = first; i < sequence.length(); i++) {
1209          char c = sequence.charAt(i);
1210          if (matches(c)) {
1211            inMatchingGroup = true;
1212          } else {
1213            if (inMatchingGroup) {
1214              builder.append(replacement);
1215              inMatchingGroup = false;
1216            }
1217            builder.append(c);
1218          }
1219        }
1220        return builder.toString();
1221      }
1222    
1223      // Predicate interface
1224    
1225      /**
1226       * Returns {@code true} if this matcher matches the given character.
1227       *
1228       * @throws NullPointerException if {@code character} is null
1229       */
1230      @Override public boolean apply(Character character) {
1231        return matches(character);
1232      }
1233    
1234      /**
1235       * Returns a string representation of this {@code CharMatcher}, such as
1236       * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1237       */
1238      @Override
1239      public String toString() {
1240        return super.toString();
1241      }
1242    
1243      /**
1244       * Determines whether a character is whitespace according to the latest Unicode standard, as
1245       * illustrated
1246       * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1247       * This is not the same definition used by other Java APIs. (See a
1248       * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1249       * definitions of "whitespace"</a>.)
1250       *
1251       * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1252       * to date.
1253       */
1254      public static final CharMatcher WHITESPACE = new CharMatcher() {
1255        /**
1256         * A special-case CharMatcher for Unicode whitespace characters that is extremely
1257         * efficient both in space required and in time to check for matches.
1258         *
1259         * Implementation details.
1260         * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1261         * so we can construct a lookup table of exactly 79 entries, and just check the character code
1262         * mod 79, and see if that character is in the table.
1263         *
1264         * There is a 1 at the beginning of the table so that the null character is not listed
1265         * as whitespace.
1266         *
1267         * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1268         *
1269         *   * Binary search into the sorted list of characters, i.e., what
1270         *     CharMatcher.anyOf() does</li>
1271         *   * Perfect hash function into a table of size 26 (using an offset table and a special
1272         *     Jenkins hash function)</li>
1273         *   * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1274         *   * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1275         *
1276         * --Christopher Swenson, February 2012.
1277         */
1278    
1279        // Mod-79 lookup table.
1280        private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1281            8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1282            12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1283            8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1284    
1285        @Override public boolean matches(char c) {
1286          return table[c % 79] == c;
1287        }
1288    
1289        @Override public CharMatcher precomputed() {
1290          return this;
1291        }
1292    
1293        @Override public String toString() {
1294          return "CharMatcher.WHITESPACE";
1295        }
1296      };
1297    }