001    /*
002     * Copyright (C) 2008 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.base;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    
022    import com.google.common.annotations.Beta;
023    import com.google.common.annotations.GwtCompatible;
024    
025    import java.util.Arrays;
026    import javax.annotation.CheckReturnValue;
027    
028    /**
029     * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
030     * for any {@link Object}. Also offers basic text processing methods based on this function.
031     * Implementations are strongly encouraged to be side-effect-free and immutable.
032     *
033     * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
034     * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
035     *
036     * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
037     * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
038     * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
039     * treats these just as two separate characters.
040     *
041     * <p>Example usages: <pre>
042     *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
043     *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
044     *
045     * <p>See the Guava User Guide article on <a href=
046     * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
047     * {@code CharMatcher}</a>.
048     *
049     * @author Kevin Bourrillion
050     * @since 1.0
051     */
052    @Beta // Possibly change from chars to code points; decide constants vs. methods
053    @GwtCompatible
054    public abstract class CharMatcher implements Predicate<Character> {
055      // Constants
056      /**
057       * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
058       * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
059       * discussion of that term.
060       *
061       * @since 2.0
062       */
063      public static final CharMatcher BREAKING_WHITESPACE =
064          anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
065              .or(inRange('\u2000', '\u2006'))
066              .or(inRange('\u2008', '\u200a'))
067              .withToString("CharMatcher.BREAKING_WHITESPACE")
068              .precomputed();
069    
070      /**
071       * Determines whether a character is ASCII, meaning that its code point is less than 128.
072       */
073      public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
074    
075      /**
076       * Determines whether a character is a digit according to
077       * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
078       */
079      public static final CharMatcher DIGIT;
080    
081      static {
082        CharMatcher digit = inRange('0', '9');
083        String zeroes =
084            "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
085                + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
086                + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
087        for (char base : zeroes.toCharArray()) {
088          digit = digit.or(inRange(base, (char) (base + 9)));
089        }
090        DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed();
091      }
092    
093      /**
094       * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
095       * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
096       */
097      public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
098        @Override public boolean matches(char c) {
099          return Character.isDigit(c);
100        }
101      };
102    
103      /**
104       * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
105       * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
106       * inRange('a', 'z').or(inRange('A', 'Z'))}.
107       */
108      public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
109        @Override public boolean matches(char c) {
110          return Character.isLetter(c);
111        }
112    
113        @Override public CharMatcher precomputed() {
114          return this;
115        }
116      };
117    
118      /**
119       * Determines whether a character is a letter or digit according to {@link
120       * Character#isLetterOrDigit(char) Java's definition}.
121       */
122      public static final CharMatcher JAVA_LETTER_OR_DIGIT =
123          new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
124        @Override public boolean matches(char c) {
125          return Character.isLetterOrDigit(c);
126        }
127      };
128    
129      /**
130       * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
131       * Java's definition}.
132       */
133      public static final CharMatcher JAVA_UPPER_CASE =
134          new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
135        @Override public boolean matches(char c) {
136          return Character.isUpperCase(c);
137        }
138      };
139    
140      /**
141       * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
142       * Java's definition}.
143       */
144      public static final CharMatcher JAVA_LOWER_CASE =
145          new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
146        @Override public boolean matches(char c) {
147          return Character.isLowerCase(c);
148        }
149      };
150    
151      /**
152       * Determines whether a character is an ISO control character as specified by {@link
153       * Character#isISOControl(char)}.
154       */
155      public static final CharMatcher JAVA_ISO_CONTROL =
156          inRange('\u0000', '\u001f')
157          .or(inRange('\u007f', '\u009f'))
158          .withToString("CharMatcher.JAVA_ISO_CONTROL");
159    
160      /**
161       * Determines whether a character is invisible; that is, if its Unicode category is any of
162       * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
163       * PRIVATE_USE according to ICU4J.
164       */
165      public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
166          .or(inRange('\u007f', '\u00a0'))
167          .or(is('\u00ad'))
168          .or(inRange('\u0600', '\u0604'))
169          .or(anyOf("\u06dd\u070f\u1680\u180e"))
170          .or(inRange('\u2000', '\u200f'))
171          .or(inRange('\u2028', '\u202f'))
172          .or(inRange('\u205f', '\u2064'))
173          .or(inRange('\u206a', '\u206f'))
174          .or(is('\u3000'))
175          .or(inRange('\ud800', '\uf8ff'))
176          .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
177          .withToString("CharMatcher.INVISIBLE")
178          .precomputed();
179    
180      /**
181       * Determines whether a character is single-width (not double-width). When in doubt, this matcher
182       * errs on the side of returning {@code false} (that is, it tends to assume a character is
183       * double-width).
184       *
185       * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
186       * date.
187       */
188      public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
189          .or(is('\u05be'))
190          .or(inRange('\u05d0', '\u05ea'))
191          .or(is('\u05f3'))
192          .or(is('\u05f4'))
193          .or(inRange('\u0600', '\u06ff'))
194          .or(inRange('\u0750', '\u077f'))
195          .or(inRange('\u0e00', '\u0e7f'))
196          .or(inRange('\u1e00', '\u20af'))
197          .or(inRange('\u2100', '\u213a'))
198          .or(inRange('\ufb50', '\ufdff'))
199          .or(inRange('\ufe70', '\ufeff'))
200          .or(inRange('\uff61', '\uffdc'))
201          .withToString("CharMatcher.SINGLE_WIDTH")
202          .precomputed();
203    
204      /** Matches any character. */
205      public static final CharMatcher ANY =
206          new CharMatcher("CharMatcher.ANY") {
207            @Override public boolean matches(char c) {
208              return true;
209            }
210    
211            @Override public int indexIn(CharSequence sequence) {
212              return (sequence.length() == 0) ? -1 : 0;
213            }
214    
215            @Override public int indexIn(CharSequence sequence, int start) {
216              int length = sequence.length();
217              Preconditions.checkPositionIndex(start, length);
218              return (start == length) ? -1 : start;
219            }
220    
221            @Override public int lastIndexIn(CharSequence sequence) {
222              return sequence.length() - 1;
223            }
224    
225            @Override public boolean matchesAllOf(CharSequence sequence) {
226              checkNotNull(sequence);
227              return true;
228            }
229    
230            @Override public boolean matchesNoneOf(CharSequence sequence) {
231              return sequence.length() == 0;
232            }
233    
234            @Override public String removeFrom(CharSequence sequence) {
235              checkNotNull(sequence);
236              return "";
237            }
238    
239            @Override public String replaceFrom(CharSequence sequence, char replacement) {
240              char[] array = new char[sequence.length()];
241              Arrays.fill(array, replacement);
242              return new String(array);
243            }
244    
245            @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
246              StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
247              for (int i = 0; i < sequence.length(); i++) {
248                retval.append(replacement);
249              }
250              return retval.toString();
251            }
252    
253            @Override public String collapseFrom(CharSequence sequence, char replacement) {
254              return (sequence.length() == 0) ? "" : String.valueOf(replacement);
255            }
256    
257            @Override public String trimFrom(CharSequence sequence) {
258              checkNotNull(sequence);
259              return "";
260            }
261    
262            @Override public int countIn(CharSequence sequence) {
263              return sequence.length();
264            }
265    
266            @Override public CharMatcher and(CharMatcher other) {
267              return checkNotNull(other);
268            }
269    
270            @Override public CharMatcher or(CharMatcher other) {
271              checkNotNull(other);
272              return this;
273            }
274    
275            @Override public CharMatcher negate() {
276              return NONE;
277            }
278    
279            @Override public CharMatcher precomputed() {
280              return this;
281            }
282          };
283    
284      /** Matches no characters. */
285      public static final CharMatcher NONE =
286          new CharMatcher("CharMatcher.NONE") {
287            @Override public boolean matches(char c) {
288              return false;
289            }
290    
291            @Override public int indexIn(CharSequence sequence) {
292              checkNotNull(sequence);
293              return -1;
294            }
295    
296            @Override public int indexIn(CharSequence sequence, int start) {
297              int length = sequence.length();
298              Preconditions.checkPositionIndex(start, length);
299              return -1;
300            }
301    
302            @Override public int lastIndexIn(CharSequence sequence) {
303              checkNotNull(sequence);
304              return -1;
305            }
306    
307            @Override public boolean matchesAllOf(CharSequence sequence) {
308              return sequence.length() == 0;
309            }
310    
311            @Override public boolean matchesNoneOf(CharSequence sequence) {
312              checkNotNull(sequence);
313              return true;
314            }
315    
316            @Override public String removeFrom(CharSequence sequence) {
317              return sequence.toString();
318            }
319    
320            @Override public String replaceFrom(CharSequence sequence, char replacement) {
321              return sequence.toString();
322            }
323    
324            @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
325              checkNotNull(replacement);
326              return sequence.toString();
327            }
328    
329            @Override public String collapseFrom(CharSequence sequence, char replacement) {
330              return sequence.toString();
331            }
332    
333            @Override public String trimFrom(CharSequence sequence) {
334              return sequence.toString();
335            }
336    
337            @Override public int countIn(CharSequence sequence) {
338              checkNotNull(sequence);
339              return 0;
340            }
341    
342            @Override public CharMatcher and(CharMatcher other) {
343              checkNotNull(other);
344              return this;
345            }
346    
347            @Override public CharMatcher or(CharMatcher other) {
348              return checkNotNull(other);
349            }
350    
351            @Override public CharMatcher negate() {
352              return ANY;
353            }
354    
355            @Override void setBits(LookupTable table) {}
356    
357            @Override public CharMatcher precomputed() {
358              return this;
359            }
360          };
361    
362      // Static factories
363    
364      /**
365       * Returns a {@code char} matcher that matches only one specified character.
366       */
367      public static CharMatcher is(final char match) {
368        String description = new StringBuilder("CharMatcher.is(")
369            .append(Integer.toHexString(match))
370            .append(")")
371            .toString();
372        return new CharMatcher(description) {
373          @Override public boolean matches(char c) {
374            return c == match;
375          }
376    
377          @Override public String replaceFrom(CharSequence sequence, char replacement) {
378            return sequence.toString().replace(match, replacement);
379          }
380    
381          @Override public CharMatcher and(CharMatcher other) {
382            return other.matches(match) ? this : NONE;
383          }
384    
385          @Override public CharMatcher or(CharMatcher other) {
386            return other.matches(match) ? other : super.or(other);
387          }
388    
389          @Override public CharMatcher negate() {
390            return isNot(match);
391          }
392    
393          @Override void setBits(LookupTable table) {
394            table.set(match);
395          }
396    
397          @Override public CharMatcher precomputed() {
398            return this;
399          }
400        };
401      }
402    
403      /**
404       * Returns a {@code char} matcher that matches any character except the one specified.
405       *
406       * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
407       */
408      public static CharMatcher isNot(final char match) {
409        String description = new StringBuilder("CharMatcher.isNot(")
410            .append(Integer.toHexString(match))
411            .append(")")
412            .toString();
413        return new CharMatcher(description) {
414          @Override public boolean matches(char c) {
415            return c != match;
416          }
417    
418          @Override public CharMatcher and(CharMatcher other) {
419            return other.matches(match) ? super.and(other) : other;
420          }
421    
422          @Override public CharMatcher or(CharMatcher other) {
423            return other.matches(match) ? ANY : this;
424          }
425    
426          @Override public CharMatcher negate() {
427            return is(match);
428          }
429        };
430      }
431    
432      /**
433       * Returns a {@code char} matcher that matches any character present in the given character
434       * sequence.
435       */
436      public static CharMatcher anyOf(final CharSequence sequence) {
437        switch (sequence.length()) {
438          case 0:
439            return NONE;
440          case 1:
441            return is(sequence.charAt(0));
442          case 2:
443            final char match1 = sequence.charAt(0);
444            final char match2 = sequence.charAt(1);
445            return new CharMatcher(
446                new StringBuilder("CharMatcher.anyOf(\"").append(sequence).append("\")").toString()) {
447              @Override public boolean matches(char c) {
448                return c == match1 || c == match2;
449              }
450    
451              @Override void setBits(LookupTable table) {
452                table.set(match1);
453                table.set(match2);
454              }
455    
456              @Override public CharMatcher precomputed() {
457                return this;
458              }
459            };
460        }
461        final char[] chars = sequence.toString().toCharArray();
462        Arrays.sort(chars);
463    
464        return new CharMatcher(new StringBuilder("CharMatcher.anyOf(\"").append(chars)
465            .append("\")").toString()) {
466              @Override public boolean matches(char c) {
467                return Arrays.binarySearch(chars, c) >= 0;
468              }
469        };
470      }
471    
472      /**
473       * Returns a {@code char} matcher that matches any character not present in the given character
474       * sequence.
475       */
476      public static CharMatcher noneOf(CharSequence sequence) {
477        return anyOf(sequence).negate();
478      }
479    
480      /**
481       * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
482       * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
483       * CharMatcher.inRange('a', 'z')}.
484       *
485       * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
486       */
487      public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
488        checkArgument(endInclusive >= startInclusive);
489        String description = new StringBuilder("CharMatcher.inRange(")
490            .append(Integer.toHexString(startInclusive))
491            .append(", ")
492            .append(Integer.toHexString(endInclusive))
493            .append(")")
494            .toString();
495        return inRange(startInclusive, endInclusive, description);
496      }
497    
498      static CharMatcher inRange(final char startInclusive, final char endInclusive,
499          String description) {
500        return new CharMatcher(description) {
501          @Override public boolean matches(char c) {
502            return startInclusive <= c && c <= endInclusive;
503          }
504    
505          @Override void setBits(LookupTable table) {
506            char c = startInclusive;
507            while (true) {
508              table.set(c);
509              if (c++ == endInclusive) {
510                break;
511              }
512            }
513          }
514    
515          @Override public CharMatcher precomputed() {
516            return this;
517          }
518        };
519      }
520    
521      /**
522       * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
523       * which operates on primitive {@code char} instances instead.
524       */
525      public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
526        checkNotNull(predicate);
527        if (predicate instanceof CharMatcher) {
528          return (CharMatcher) predicate;
529        }
530        String description = new StringBuilder("CharMatcher.forPredicate(")
531            .append(predicate)
532            .append(')')
533            .toString();
534        return new CharMatcher(description) {
535          @Override public boolean matches(char c) {
536            return predicate.apply(c);
537          }
538    
539          @Override public boolean apply(Character character) {
540            return predicate.apply(checkNotNull(character));
541          }
542        };
543      }
544    
545      // State
546      final String description;
547    
548      // Constructors
549    
550      /**
551       * Sets the {@code toString()} from the given description.
552       */
553      CharMatcher(String description) {
554        this.description = description;
555      }
556    
557      /**
558       * Constructor for use by subclasses. When subclassing, you may want to override
559       * {@code toString()} to provide a useful description.
560       */
561      protected CharMatcher() {
562        description = "UnknownCharMatcher";
563      }
564    
565      // Abstract methods
566    
567      /** Determines a true or false value for the given character. */
568      public abstract boolean matches(char c);
569    
570      // Non-static factories
571    
572      /**
573       * Returns a matcher that matches any character not matched by this matcher.
574       */
575      public CharMatcher negate() {
576        final CharMatcher original = this;
577        return new CharMatcher(original + ".negate()") {
578          @Override public boolean matches(char c) {
579            return !original.matches(c);
580          }
581    
582          @Override public boolean matchesAllOf(CharSequence sequence) {
583            return original.matchesNoneOf(sequence);
584          }
585    
586          @Override public boolean matchesNoneOf(CharSequence sequence) {
587            return original.matchesAllOf(sequence);
588          }
589    
590          @Override public int countIn(CharSequence sequence) {
591            return sequence.length() - original.countIn(sequence);
592          }
593    
594          @Override public CharMatcher negate() {
595            return original;
596          }
597        };
598      }
599    
600      /**
601       * Returns a matcher that matches any character matched by both this matcher and {@code other}.
602       */
603      public CharMatcher and(CharMatcher other) {
604        return new And(this, checkNotNull(other));
605      }
606    
607      private static class And extends CharMatcher {
608        final CharMatcher first;
609        final CharMatcher second;
610    
611        And(CharMatcher a, CharMatcher b) {
612          this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
613        }
614    
615        And(CharMatcher a, CharMatcher b, String description) {
616          super(description);
617          first = checkNotNull(a);
618          second = checkNotNull(b);
619        }
620    
621        @Override
622        public CharMatcher and(CharMatcher other) {
623          return new And(this, other);
624        }
625    
626        @Override
627        public boolean matches(char c) {
628          return first.matches(c) && second.matches(c);
629        }
630    
631        @Override
632        CharMatcher withToString(String description) {
633          return new And(first, second, description);
634        }
635      }
636    
637      /**
638       * Returns a matcher that matches any character matched by either this matcher or {@code other}.
639       */
640      public CharMatcher or(CharMatcher other) {
641        return new Or(this, checkNotNull(other));
642      }
643    
644      private static class Or extends CharMatcher {
645        final CharMatcher first;
646        final CharMatcher second;
647    
648        Or(CharMatcher a, CharMatcher b, String description) {
649          super(description);
650          first = checkNotNull(a);
651          second = checkNotNull(b);
652        }
653    
654        Or(CharMatcher a, CharMatcher b) {
655          this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
656        }
657    
658        @Override
659        public CharMatcher or(CharMatcher other) {
660          return new Or(this, checkNotNull(other));
661        }
662    
663        @Override
664        public boolean matches(char c) {
665          return first.matches(c) || second.matches(c);
666        }
667    
668        @Override
669        CharMatcher withToString(String description) {
670          return new Or(first, second, description);
671        }
672      }
673    
674      /**
675       * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
676       * query than the original; your mileage may vary. Precomputation takes time and is likely to be
677       * worthwhile only if the precomputed matcher is queried many thousands of times.
678       *
679       * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
680       * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
681       * worthwhile tradeoff in a browser.
682       */
683      public CharMatcher precomputed() {
684        return Platform.precomputeCharMatcher(this);
685      }
686    
687      /**
688       * Construct an array of all possible chars in the slowest way possible.
689       */
690      char[] slowGetChars() {
691        char[] allChars = new char[65536];
692        int size = 0;
693        for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
694          if (matches((char) c)) {
695            allChars[size++] = (char) c;
696          }
697        }
698        char[] retValue = new char[size];
699        System.arraycopy(allChars, 0, retValue, 0, size);
700        return retValue;
701      }
702    
703      /**
704       * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
705       * on {@link Platform} so that we can have different behavior in GWT.
706       *
707       * <p>If the number of matched characters is small enough, we try to build a small hash
708       * table to contain all of the characters. Otherwise, we record the characters in eight-kilobyte
709       * bit array. In many situations this produces a matcher which is faster to query
710       * than the original.
711       */
712      CharMatcher precomputedInternal() {
713        final char[] chars = slowGetChars();
714        int totalCharacters = chars.length;
715        if (totalCharacters == 0) {
716          return NONE;
717        } else if (totalCharacters == 1) {
718          return is(chars[0]);
719        } else if (totalCharacters < SmallCharMatcher.MAX_SIZE) {
720          return SmallCharMatcher.from(chars, toString());
721        } else if (totalCharacters < MediumCharMatcher.MAX_SIZE) {
722          return MediumCharMatcher.from(chars, toString());
723        }
724        // Otherwise, make the full lookup table.
725        final LookupTable table = new LookupTable();
726        setBits(table);
727        final CharMatcher outer = this;
728    
729        return new CharMatcher(outer.toString()) {
730          @Override public boolean matches(char c) {
731            return table.get(c);
732          }
733    
734          // TODO(kevinb): make methods like negate() smart?
735    
736          @Override public CharMatcher precomputed() {
737            return this;
738          }
739        };
740      }
741    
742      /**
743       * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
744       * but with their {@code toString} method overridden with the new description.
745       *
746       * <p>This is unsupported by default.
747       */
748      CharMatcher withToString(String description) {
749        throw new UnsupportedOperationException();
750    
751      }
752    
753      /**
754       * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
755       * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
756       *
757       * <p>The default implementation loops over every possible character value, invoking {@link
758       * #matches} for each one.
759       */
760      void setBits(LookupTable table) {
761        char c = Character.MIN_VALUE;
762        while (true) {
763          if (matches(c)) {
764            table.set(c);
765          }
766          if (c++ == Character.MAX_VALUE) {
767            break;
768          }
769        }
770      }
771    
772      /**
773       * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
774       *
775       * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
776       * simpler java.util.BitSet.
777       */
778      private static final class LookupTable {
779        int[] data = new int[2048];
780    
781        void set(char index) {
782          data[index >> 5] |= (1 << index);
783        }
784    
785        boolean get(char index) {
786          return (data[index >> 5] & (1 << index)) != 0;
787        }
788      }
789    
790      // Text processing routines
791    
792      /**
793       * Returns {@code true} if a character sequence contains at least one matching character.
794       * Equivalent to {@code !matchesNoneOf(sequence)}.
795       *
796       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
797       * character, until this returns {@code true} or the end is reached.
798       *
799       * @param sequence the character sequence to examine, possibly empty
800       * @return {@code true} if this matcher matches at least one character in the sequence
801       * @since 8.0
802       */
803      public boolean matchesAnyOf(CharSequence sequence) {
804        return !matchesNoneOf(sequence);
805      }
806    
807      /**
808       * Returns {@code true} if a character sequence contains only matching characters.
809       *
810       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
811       * character, until this returns {@code false} or the end is reached.
812       *
813       * @param sequence the character sequence to examine, possibly empty
814       * @return {@code true} if this matcher matches every character in the sequence, including when
815       *         the sequence is empty
816       */
817      public boolean matchesAllOf(CharSequence sequence) {
818        for (int i = sequence.length() - 1; i >= 0; i--) {
819          if (!matches(sequence.charAt(i))) {
820            return false;
821          }
822        }
823        return true;
824      }
825    
826      /**
827       * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
828       * {@code !matchesAnyOf(sequence)}.
829       *
830       * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
831       * character, until this returns {@code false} or the end is reached.
832       *
833       * @param sequence the character sequence to examine, possibly empty
834       * @return {@code true} if this matcher matches every character in the sequence, including when
835       *         the sequence is empty
836       */
837      public boolean matchesNoneOf(CharSequence sequence) {
838        return indexIn(sequence) == -1;
839      }
840    
841      /**
842       * Returns the index of the first matching character in a character sequence, or {@code -1} if no
843       * matching character is present.
844       *
845       * <p>The default implementation iterates over the sequence in forward order calling {@link
846       * #matches} for each character.
847       *
848       * @param sequence the character sequence to examine from the beginning
849       * @return an index, or {@code -1} if no character matches
850       */
851      public int indexIn(CharSequence sequence) {
852        int length = sequence.length();
853        for (int i = 0; i < length; i++) {
854          if (matches(sequence.charAt(i))) {
855            return i;
856          }
857        }
858        return -1;
859      }
860    
861      /**
862       * Returns the index of the first matching character in a character sequence, starting from a
863       * given position, or {@code -1} if no character matches after that position.
864       *
865       * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
866       * start}, calling {@link #matches} for each character.
867       *
868       * @param sequence the character sequence to examine
869       * @param start the first index to examine; must be nonnegative and no greater than {@code
870       *        sequence.length()}
871       * @return the index of the first matching character, guaranteed to be no less than {@code start},
872       *         or {@code -1} if no character matches
873       * @throws IndexOutOfBoundsException if start is negative or greater than {@code
874       *         sequence.length()}
875       */
876      public int indexIn(CharSequence sequence, int start) {
877        int length = sequence.length();
878        Preconditions.checkPositionIndex(start, length);
879        for (int i = start; i < length; i++) {
880          if (matches(sequence.charAt(i))) {
881            return i;
882          }
883        }
884        return -1;
885      }
886    
887      /**
888       * Returns the index of the last matching character in a character sequence, or {@code -1} if no
889       * matching character is present.
890       *
891       * <p>The default implementation iterates over the sequence in reverse order calling {@link
892       * #matches} for each character.
893       *
894       * @param sequence the character sequence to examine from the end
895       * @return an index, or {@code -1} if no character matches
896       */
897      public int lastIndexIn(CharSequence sequence) {
898        for (int i = sequence.length() - 1; i >= 0; i--) {
899          if (matches(sequence.charAt(i))) {
900            return i;
901          }
902        }
903        return -1;
904      }
905    
906      /**
907       * Returns the number of matching characters found in a character sequence.
908       */
909      public int countIn(CharSequence sequence) {
910        int count = 0;
911        for (int i = 0; i < sequence.length(); i++) {
912          if (matches(sequence.charAt(i))) {
913            count++;
914          }
915        }
916        return count;
917      }
918    
919      /**
920       * Returns a string containing all non-matching characters of a character sequence, in order. For
921       * example: <pre>   {@code
922       *
923       *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
924       *
925       * ... returns {@code "bzr"}.
926       */
927      @CheckReturnValue
928      public String removeFrom(CharSequence sequence) {
929        String string = sequence.toString();
930        int pos = indexIn(string);
931        if (pos == -1) {
932          return string;
933        }
934    
935        char[] chars = string.toCharArray();
936        int spread = 1;
937    
938        // This unusual loop comes from extensive benchmarking
939        OUT: while (true) {
940          pos++;
941          while (true) {
942            if (pos == chars.length) {
943              break OUT;
944            }
945            if (matches(chars[pos])) {
946              break;
947            }
948            chars[pos - spread] = chars[pos];
949            pos++;
950          }
951          spread++;
952        }
953        return new String(chars, 0, pos - spread);
954      }
955    
956      /**
957       * Returns a string containing all matching characters of a character sequence, in order. For
958       * example: <pre>   {@code
959       *
960       *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
961       *
962       * ... returns {@code "aaa"}.
963       */
964      @CheckReturnValue
965      public String retainFrom(CharSequence sequence) {
966        return negate().removeFrom(sequence);
967      }
968    
969      /**
970       * Returns a string copy of the input character sequence, with each character that matches this
971       * matcher replaced by a given replacement character. For example: <pre>   {@code
972       *
973       *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
974       *
975       * ... returns {@code "rodor"}.
976       *
977       * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
978       * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
979       * character.
980       *
981       * @param sequence the character sequence to replace matching characters in
982       * @param replacement the character to append to the result string in place of each matching
983       *        character in {@code sequence}
984       * @return the new string
985       */
986      @CheckReturnValue
987      public String replaceFrom(CharSequence sequence, char replacement) {
988        String string = sequence.toString();
989        int pos = indexIn(string);
990        if (pos == -1) {
991          return string;
992        }
993        char[] chars = string.toCharArray();
994        chars[pos] = replacement;
995        for (int i = pos + 1; i < chars.length; i++) {
996          if (matches(chars[i])) {
997            chars[i] = replacement;
998          }
999        }
1000        return new String(chars);
1001      }
1002    
1003      /**
1004       * Returns a string copy of the input character sequence, with each character that matches this
1005       * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1006       *
1007       *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1008       *
1009       * ... returns {@code "yoohoo"}.
1010       *
1011       * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1012       * off calling {@link #replaceFrom(CharSequence, char)} directly.
1013       *
1014       * @param sequence the character sequence to replace matching characters in
1015       * @param replacement the characters to append to the result string in place of each matching
1016       *        character in {@code sequence}
1017       * @return the new string
1018       */
1019      @CheckReturnValue
1020      public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1021        int replacementLen = replacement.length();
1022        if (replacementLen == 0) {
1023          return removeFrom(sequence);
1024        }
1025        if (replacementLen == 1) {
1026          return replaceFrom(sequence, replacement.charAt(0));
1027        }
1028    
1029        String string = sequence.toString();
1030        int pos = indexIn(string);
1031        if (pos == -1) {
1032          return string;
1033        }
1034    
1035        int len = string.length();
1036        StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1037    
1038        int oldpos = 0;
1039        do {
1040          buf.append(string, oldpos, pos);
1041          buf.append(replacement);
1042          oldpos = pos + 1;
1043          pos = indexIn(string, oldpos);
1044        } while (pos != -1);
1045    
1046        buf.append(string, oldpos, len);
1047        return buf.toString();
1048      }
1049    
1050      /**
1051       * Returns a substring of the input character sequence that omits all characters this matcher
1052       * matches from the beginning and from the end of the string. For example: <pre>   {@code
1053       *
1054       *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1055       *
1056       * ... returns {@code "cat"}.
1057       *
1058       * <p>Note that: <pre>   {@code
1059       *
1060       *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1061       *
1062       * ... is equivalent to {@link String#trim()}.
1063       */
1064      @CheckReturnValue
1065      public String trimFrom(CharSequence sequence) {
1066        int len = sequence.length();
1067        int first;
1068        int last;
1069    
1070        for (first = 0; first < len; first++) {
1071          if (!matches(sequence.charAt(first))) {
1072            break;
1073          }
1074        }
1075        for (last = len - 1; last > first; last--) {
1076          if (!matches(sequence.charAt(last))) {
1077            break;
1078          }
1079        }
1080    
1081        return sequence.subSequence(first, last + 1).toString();
1082      }
1083    
1084      /**
1085       * Returns a substring of the input character sequence that omits all characters this matcher
1086       * matches from the beginning of the string. For example: <pre> {@code
1087       *
1088       *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1089       *
1090       * ... returns {@code "catbab"}.
1091       */
1092      @CheckReturnValue
1093      public String trimLeadingFrom(CharSequence sequence) {
1094        int len = sequence.length();
1095        int first;
1096    
1097        for (first = 0; first < len; first++) {
1098          if (!matches(sequence.charAt(first))) {
1099            break;
1100          }
1101        }
1102    
1103        return sequence.subSequence(first, len).toString();
1104      }
1105    
1106      /**
1107       * Returns a substring of the input character sequence that omits all characters this matcher
1108       * matches from the end of the string. For example: <pre> {@code
1109       *
1110       *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1111       *
1112       * ... returns {@code "abacat"}.
1113       */
1114      @CheckReturnValue
1115      public String trimTrailingFrom(CharSequence sequence) {
1116        int len = sequence.length();
1117        int last;
1118    
1119        for (last = len - 1; last >= 0; last--) {
1120          if (!matches(sequence.charAt(last))) {
1121            break;
1122          }
1123        }
1124    
1125        return sequence.subSequence(0, last + 1).toString();
1126      }
1127    
1128      /**
1129       * Returns a string copy of the input character sequence, with each group of consecutive
1130       * characters that match this matcher replaced by a single replacement character. For example:
1131       * <pre>   {@code
1132       *
1133       *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1134       *
1135       * ... returns {@code "b-p-r"}.
1136       *
1137       * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1138       * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1139       * character.
1140       *
1141       * @param sequence the character sequence to replace matching groups of characters in
1142       * @param replacement the character to append to the result string in place of each group of
1143       *        matching characters in {@code sequence}
1144       * @return the new string
1145       */
1146      @CheckReturnValue
1147      public String collapseFrom(CharSequence sequence, char replacement) {
1148        int first = indexIn(sequence);
1149        if (first == -1) {
1150          return sequence.toString();
1151        }
1152    
1153        // TODO(kevinb): see if this implementation can be made faster
1154        StringBuilder builder = new StringBuilder(sequence.length())
1155            .append(sequence.subSequence(0, first))
1156            .append(replacement);
1157        boolean in = true;
1158        for (int i = first + 1; i < sequence.length(); i++) {
1159          char c = sequence.charAt(i);
1160          if (matches(c)) {
1161            if (!in) {
1162              builder.append(replacement);
1163              in = true;
1164            }
1165          } else {
1166            builder.append(c);
1167            in = false;
1168          }
1169        }
1170        return builder.toString();
1171      }
1172    
1173      /**
1174       * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1175       * groups of matching characters at the start or end of the sequence are removed without
1176       * replacement.
1177       */
1178      @CheckReturnValue
1179      public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1180        int first = negate().indexIn(sequence);
1181        if (first == -1) {
1182          return ""; // everything matches. nothing's left.
1183        }
1184        StringBuilder builder = new StringBuilder(sequence.length());
1185        boolean inMatchingGroup = false;
1186        for (int i = first; i < sequence.length(); i++) {
1187          char c = sequence.charAt(i);
1188          if (matches(c)) {
1189            inMatchingGroup = true;
1190          } else {
1191            if (inMatchingGroup) {
1192              builder.append(replacement);
1193              inMatchingGroup = false;
1194            }
1195            builder.append(c);
1196          }
1197        }
1198        return builder.toString();
1199      }
1200    
1201      // Predicate interface
1202    
1203      /**
1204       * Returns {@code true} if this matcher matches the given character.
1205       *
1206       * @throws NullPointerException if {@code character} is null
1207       */
1208      @Override public boolean apply(Character character) {
1209        return matches(character);
1210      }
1211    
1212      /**
1213       * Returns a string representation of this {@code CharMatcher}, such as
1214       * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1215       */
1216      @Override
1217      public String toString() {
1218        return description;
1219      }
1220    
1221      /**
1222       * Determines whether a character is whitespace according to the latest Unicode standard, as
1223       * illustrated
1224       * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1225       * This is not the same definition used by other Java APIs. (See a
1226       * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1227       * definitions of "whitespace"</a>.)
1228       *
1229       * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1230       * to date.
1231       */
1232      public static final CharMatcher WHITESPACE = new CharMatcher("CharMatcher.WHITESPACE") {
1233        /**
1234         * A special-case CharMatcher for Unicode whitespace characters that is extremely
1235         * efficient both in space required and in time to check for matches.
1236         *
1237         * Implementation details.
1238         * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1239         * so we can construct a lookup table of exactly 79 entries, and just check the character code
1240         * mod 79, and see if that character is in the table.
1241         *
1242         * There is a 1 at the beginning of the table so that the null character is not listed
1243         * as whitespace.
1244         *
1245         * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1246         *
1247         *   * Binary search into the sorted list of characters, i.e., what
1248         *     CharMatcher.anyOf() does</li>
1249         *   * Perfect hash function into a table of size 26 (using an offset table and a special
1250         *     Jenkins hash function)</li>
1251         *   * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1252         *   * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1253         *
1254         * --Christopher Swenson, February 2012.
1255         */
1256    
1257        // Mod-79 lookup table.
1258        private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1259            8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1260            12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1261            8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1262    
1263        @Override public boolean matches(char c) {
1264          return table[c % 79] == c;
1265        }
1266    
1267        @Override public CharMatcher precomputed() {
1268          return this;
1269        }
1270      };
1271    }