001 /*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021
022 import com.google.common.annotations.Beta;
023 import com.google.common.annotations.GwtCompatible;
024
025 import java.util.ArrayList;
026 import java.util.Arrays;
027 import java.util.List;
028
029 import javax.annotation.CheckReturnValue;
030
031 /**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * <p>See the Guava User Guide article on <a href=
049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050 * {@code CharMatcher}</a>.
051 *
052 * @author Kevin Bourrillion
053 * @since 1.0
054 */
055 @Beta // Possibly change from chars to code points; decide constants vs. methods
056 @GwtCompatible
057 public abstract class CharMatcher implements Predicate<Character> {
058 // Constants
059 /**
060 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
061 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
062 * discussion of that term.
063 *
064 * @since 2.0
065 */
066 public static final CharMatcher BREAKING_WHITESPACE =
067 anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
068 .or(inRange('\u2000', '\u2006'))
069 .or(inRange('\u2008', '\u200a'))
070 .withToString("CharMatcher.BREAKING_WHITESPACE")
071 .precomputed();
072
073 /**
074 * Determines whether a character is ASCII, meaning that its code point is less than 128.
075 */
076 public static final CharMatcher ASCII = inRange('\0', '\u007f')
077 .withToString("CharMatcher.ASCII");
078
079 /**
080 * Determines whether a character is a digit according to
081 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
082 */
083 public static final CharMatcher DIGIT;
084
085 static {
086 CharMatcher digit = inRange('0', '9');
087 String zeroes =
088 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
089 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
090 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
091 for (char base : zeroes.toCharArray()) {
092 digit = digit.or(inRange(base, (char) (base + 9)));
093 }
094 DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed();
095 }
096
097 /**
098 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
099 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
100 */
101 public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
102 @Override public boolean matches(char c) {
103 return Character.isDigit(c);
104 }
105
106 @Override public String toString() {
107 return "CharMatcher.JAVA_DIGIT";
108 }
109 };
110
111 /**
112 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
113 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
114 * inRange('a', 'z').or(inRange('A', 'Z'))}.
115 */
116 public static final CharMatcher JAVA_LETTER = new CharMatcher() {
117 @Override public boolean matches(char c) {
118 return Character.isLetter(c);
119 }
120
121 @Override public String toString() {
122 return "CharMatcher.JAVA_LETTER";
123 }
124 };
125
126 /**
127 * Determines whether a character is a letter or digit according to {@link
128 * Character#isLetterOrDigit(char) Java's definition}.
129 */
130 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
131 @Override public boolean matches(char c) {
132 return Character.isLetterOrDigit(c);
133 }
134
135 @Override public String toString() {
136 return "CharMatcher.JAVA_LETTER_OR_DIGIT";
137 }
138 };
139
140 /**
141 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
142 * Java's definition}.
143 */
144 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
145 @Override public boolean matches(char c) {
146 return Character.isUpperCase(c);
147 }
148
149 @Override public String toString() {
150 return "CharMatcher.JAVA_UPPER_CASE";
151 }
152 };
153
154 /**
155 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
156 * Java's definition}.
157 */
158 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
159 @Override public boolean matches(char c) {
160 return Character.isLowerCase(c);
161 }
162
163 @Override public String toString() {
164 return "CharMatcher.JAVA_LOWER_CASE";
165 }
166 };
167
168 /**
169 * Determines whether a character is an ISO control character as specified by {@link
170 * Character#isISOControl(char)}.
171 */
172 public static final CharMatcher JAVA_ISO_CONTROL =
173 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'))
174 .withToString("CharMatcher.JAVA_ISO_CONTROL");
175
176 /**
177 * Determines whether a character is invisible; that is, if its Unicode category is any of
178 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
179 * PRIVATE_USE according to ICU4J.
180 */
181 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
182 .or(inRange('\u007f', '\u00a0'))
183 .or(is('\u00ad'))
184 .or(inRange('\u0600', '\u0603'))
185 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
186 .or(inRange('\u2000', '\u200f'))
187 .or(inRange('\u2028', '\u202f'))
188 .or(inRange('\u205f', '\u2064'))
189 .or(inRange('\u206a', '\u206f'))
190 .or(is('\u3000'))
191 .or(inRange('\ud800', '\uf8ff'))
192 .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
193 .withToString("CharMatcher.INVISIBLE")
194 .precomputed();
195
196 /**
197 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
198 * errs on the side of returning {@code false} (that is, it tends to assume a character is
199 * double-width).
200 *
201 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
202 * date.
203 */
204 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
205 .or(is('\u05be'))
206 .or(inRange('\u05d0', '\u05ea'))
207 .or(is('\u05f3'))
208 .or(is('\u05f4'))
209 .or(inRange('\u0600', '\u06ff'))
210 .or(inRange('\u0750', '\u077f'))
211 .or(inRange('\u0e00', '\u0e7f'))
212 .or(inRange('\u1e00', '\u20af'))
213 .or(inRange('\u2100', '\u213a'))
214 .or(inRange('\ufb50', '\ufdff'))
215 .or(inRange('\ufe70', '\ufeff'))
216 .or(inRange('\uff61', '\uffdc'))
217 .withToString("CharMatcher.SINGLE_WIDTH")
218 .precomputed();
219
220 /** Matches any character. */
221 public static final CharMatcher ANY =
222 new CharMatcher() {
223 @Override public boolean matches(char c) {
224 return true;
225 }
226
227 @Override public int indexIn(CharSequence sequence) {
228 return (sequence.length() == 0) ? -1 : 0;
229 }
230
231 @Override public int indexIn(CharSequence sequence, int start) {
232 int length = sequence.length();
233 Preconditions.checkPositionIndex(start, length);
234 return (start == length) ? -1 : start;
235 }
236
237 @Override public int lastIndexIn(CharSequence sequence) {
238 return sequence.length() - 1;
239 }
240
241 @Override public boolean matchesAllOf(CharSequence sequence) {
242 checkNotNull(sequence);
243 return true;
244 }
245
246 @Override public boolean matchesNoneOf(CharSequence sequence) {
247 return sequence.length() == 0;
248 }
249
250 @Override public String removeFrom(CharSequence sequence) {
251 checkNotNull(sequence);
252 return "";
253 }
254
255 @Override public String replaceFrom(CharSequence sequence, char replacement) {
256 char[] array = new char[sequence.length()];
257 Arrays.fill(array, replacement);
258 return new String(array);
259 }
260
261 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
262 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
263 for (int i = 0; i < sequence.length(); i++) {
264 retval.append(replacement);
265 }
266 return retval.toString();
267 }
268
269 @Override public String collapseFrom(CharSequence sequence, char replacement) {
270 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
271 }
272
273 @Override public String trimFrom(CharSequence sequence) {
274 checkNotNull(sequence);
275 return "";
276 }
277
278 @Override public int countIn(CharSequence sequence) {
279 return sequence.length();
280 }
281
282 @Override public CharMatcher and(CharMatcher other) {
283 return checkNotNull(other);
284 }
285
286 @Override public CharMatcher or(CharMatcher other) {
287 checkNotNull(other);
288 return this;
289 }
290
291 @Override public CharMatcher negate() {
292 return NONE;
293 }
294
295 @Override public CharMatcher precomputed() {
296 return this;
297 }
298
299 @Override public String toString() {
300 return "CharMatcher.ANY";
301 }
302 };
303
304 /** Matches no characters. */
305 public static final CharMatcher NONE =
306 new CharMatcher() {
307 @Override public boolean matches(char c) {
308 return false;
309 }
310
311 @Override public int indexIn(CharSequence sequence) {
312 checkNotNull(sequence);
313 return -1;
314 }
315
316 @Override public int indexIn(CharSequence sequence, int start) {
317 int length = sequence.length();
318 Preconditions.checkPositionIndex(start, length);
319 return -1;
320 }
321
322 @Override public int lastIndexIn(CharSequence sequence) {
323 checkNotNull(sequence);
324 return -1;
325 }
326
327 @Override public boolean matchesAllOf(CharSequence sequence) {
328 return sequence.length() == 0;
329 }
330
331 @Override public boolean matchesNoneOf(CharSequence sequence) {
332 checkNotNull(sequence);
333 return true;
334 }
335
336 @Override public String removeFrom(CharSequence sequence) {
337 return sequence.toString();
338 }
339
340 @Override public String replaceFrom(CharSequence sequence, char replacement) {
341 return sequence.toString();
342 }
343
344 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
345 checkNotNull(replacement);
346 return sequence.toString();
347 }
348
349 @Override public String collapseFrom(CharSequence sequence, char replacement) {
350 return sequence.toString();
351 }
352
353 @Override public String trimFrom(CharSequence sequence) {
354 return sequence.toString();
355 }
356
357 @Override public int countIn(CharSequence sequence) {
358 checkNotNull(sequence);
359 return 0;
360 }
361
362 @Override public CharMatcher and(CharMatcher other) {
363 checkNotNull(other);
364 return this;
365 }
366
367 @Override public CharMatcher or(CharMatcher other) {
368 return checkNotNull(other);
369 }
370
371 @Override public CharMatcher negate() {
372 return ANY;
373 }
374
375 @Override void setBits(LookupTable table) {}
376
377 @Override public CharMatcher precomputed() {
378 return this;
379 }
380
381 @Override public String toString() {
382 return "CharMatcher.NONE";
383 }
384 };
385
386 // Static factories
387
388 /**
389 * Returns a {@code char} matcher that matches only one specified character.
390 */
391 public static CharMatcher is(final char match) {
392 return new CharMatcher() {
393 @Override public boolean matches(char c) {
394 return c == match;
395 }
396
397 @Override public String replaceFrom(CharSequence sequence, char replacement) {
398 return sequence.toString().replace(match, replacement);
399 }
400
401 @Override public CharMatcher and(CharMatcher other) {
402 return other.matches(match) ? this : NONE;
403 }
404
405 @Override public CharMatcher or(CharMatcher other) {
406 return other.matches(match) ? other : super.or(other);
407 }
408
409 @Override public CharMatcher negate() {
410 return isNot(match);
411 }
412
413 @Override void setBits(LookupTable table) {
414 table.set(match);
415 }
416
417 @Override public CharMatcher precomputed() {
418 return this;
419 }
420
421 @Override public String toString() {
422 return new StringBuilder("CharMatcher.is(")
423 .append(Integer.toHexString(match))
424 .append(")")
425 .toString();
426 }
427 };
428 }
429
430 /**
431 * Returns a {@code char} matcher that matches any character except the one specified.
432 *
433 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
434 */
435 public static CharMatcher isNot(final char match) {
436 return new CharMatcher() {
437 @Override public boolean matches(char c) {
438 return c != match;
439 }
440
441 @Override public CharMatcher and(CharMatcher other) {
442 return other.matches(match) ? super.and(other) : other;
443 }
444
445 @Override public CharMatcher or(CharMatcher other) {
446 return other.matches(match) ? ANY : this;
447 }
448
449 @Override public CharMatcher negate() {
450 return is(match);
451 }
452
453 @Override public String toString() {
454 return new StringBuilder("CharMatcher.isNot(")
455 .append(Integer.toHexString(match))
456 .append(")")
457 .toString();
458 }
459 };
460 }
461
462 /**
463 * Returns a {@code char} matcher that matches any character present in the given character
464 * sequence.
465 */
466 public static CharMatcher anyOf(final CharSequence sequence) {
467 switch (sequence.length()) {
468 case 0:
469 return NONE;
470 case 1:
471 return is(sequence.charAt(0));
472 case 2:
473 final char match1 = sequence.charAt(0);
474 final char match2 = sequence.charAt(1);
475 return new CharMatcher() {
476 @Override public boolean matches(char c) {
477 return c == match1 || c == match2;
478 }
479
480 @Override void setBits(LookupTable table) {
481 table.set(match1);
482 table.set(match2);
483 }
484
485 @Override public CharMatcher precomputed() {
486 return this;
487 }
488 };
489 }
490
491 final char[] chars = sequence.toString().toCharArray();
492 Arrays.sort(chars); // not worth collapsing duplicates
493
494 return new CharMatcher() {
495 @Override public boolean matches(char c) {
496 return Arrays.binarySearch(chars, c) >= 0;
497 }
498
499 @Override void setBits(LookupTable table) {
500 for (char c : chars) {
501 table.set(c);
502 }
503 }
504
505 @Override public String toString() {
506 return new StringBuilder("CharMatcher.anyOf(\"").append(chars).append("\")").toString();
507 }
508 };
509 }
510
511 /**
512 * Returns a {@code char} matcher that matches any character not present in the given character
513 * sequence.
514 */
515 public static CharMatcher noneOf(CharSequence sequence) {
516 return anyOf(sequence).negate();
517 }
518
519 /**
520 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
521 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
522 * CharMatcher.inRange('a', 'z')}.
523 *
524 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
525 */
526 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
527 checkArgument(endInclusive >= startInclusive);
528 return new CharMatcher() {
529 @Override public boolean matches(char c) {
530 return startInclusive <= c && c <= endInclusive;
531 }
532
533 @Override void setBits(LookupTable table) {
534 char c = startInclusive;
535 while (true) {
536 table.set(c);
537 if (c++ == endInclusive) {
538 break;
539 }
540 }
541 }
542
543 @Override public CharMatcher precomputed() {
544 return this;
545 }
546
547 @Override public String toString() {
548 return new StringBuilder("CharMatcher.inRange(")
549 .append(Integer.toHexString(startInclusive))
550 .append(", ")
551 .append(Integer.toHexString(endInclusive))
552 .append(")")
553 .toString();
554 }
555 };
556 }
557
558 /**
559 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
560 * which operates on primitive {@code char} instances instead.
561 */
562 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
563 checkNotNull(predicate);
564 if (predicate instanceof CharMatcher) {
565 return (CharMatcher) predicate;
566 }
567 return new CharMatcher() {
568 @Override public boolean matches(char c) {
569 return predicate.apply(c);
570 }
571
572 @Override public boolean apply(Character character) {
573 return predicate.apply(checkNotNull(character));
574 }
575
576 @Override public String toString() {
577 return new StringBuilder("CharMatcher.forPredicate(")
578 .append(predicate)
579 .append(')')
580 .toString();
581 }
582 };
583 }
584
585 // Constructors
586
587 /**
588 * Constructor for use by subclasses.
589 */
590 protected CharMatcher() {}
591
592 // Abstract methods
593
594 /** Determines a true or false value for the given character. */
595 public abstract boolean matches(char c);
596
597 // Non-static factories
598
599 /**
600 * Returns a matcher that matches any character not matched by this matcher.
601 */
602 public CharMatcher negate() {
603 final CharMatcher original = this;
604 return new CharMatcher() {
605 @Override public boolean matches(char c) {
606 return !original.matches(c);
607 }
608
609 @Override public boolean matchesAllOf(CharSequence sequence) {
610 return original.matchesNoneOf(sequence);
611 }
612
613 @Override public boolean matchesNoneOf(CharSequence sequence) {
614 return original.matchesAllOf(sequence);
615 }
616
617 @Override public int countIn(CharSequence sequence) {
618 return sequence.length() - original.countIn(sequence);
619 }
620
621 @Override public CharMatcher negate() {
622 return original;
623 }
624
625 @Override public String toString() {
626 return original + ".negate()";
627 }
628 };
629 }
630
631 /**
632 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
633 */
634 public CharMatcher and(CharMatcher other) {
635 return new And(Arrays.asList(this, checkNotNull(other)));
636 }
637
638 private static class And extends CharMatcher {
639 List<CharMatcher> components;
640
641 And(List<CharMatcher> components) {
642 this.components = components; // Skip defensive copy (private)
643 }
644
645 @Override public boolean matches(char c) {
646 for (CharMatcher matcher : components) {
647 if (!matcher.matches(c)) {
648 return false;
649 }
650 }
651 return true;
652 }
653
654 @Override public CharMatcher and(CharMatcher other) {
655 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
656 newComponents.add(checkNotNull(other));
657 return new And(newComponents);
658 }
659
660 @Override public String toString() {
661 StringBuilder builder = new StringBuilder("CharMatcher.and(");
662 Joiner.on(", ").appendTo(builder, components);
663 return builder.append(')').toString();
664 }
665 }
666
667 /**
668 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
669 */
670 public CharMatcher or(CharMatcher other) {
671 return new Or(Arrays.asList(this, checkNotNull(other)));
672 }
673
674 private static class Or extends CharMatcher {
675 List<CharMatcher> components;
676
677 Or(List<CharMatcher> components) {
678 this.components = components; // Skip defensive copy (private)
679 }
680
681 @Override public boolean matches(char c) {
682 for (CharMatcher matcher : components) {
683 if (matcher.matches(c)) {
684 return true;
685 }
686 }
687 return false;
688 }
689
690 @Override public CharMatcher or(CharMatcher other) {
691 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
692 newComponents.add(checkNotNull(other));
693 return new Or(newComponents);
694 }
695
696 @Override void setBits(LookupTable table) {
697 for (CharMatcher matcher : components) {
698 matcher.setBits(table);
699 }
700 }
701
702 @Override public String toString() {
703 StringBuilder builder = new StringBuilder("CharMatcher.or(");
704 Joiner.on(", ").appendTo(builder, components);
705 return builder.append(')').toString();
706 }
707 }
708
709 /**
710 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
711 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
712 * worthwhile only if the precomputed matcher is queried many thousands of times.
713 *
714 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
715 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
716 * worthwhile tradeoff in a browser.
717 */
718 public CharMatcher precomputed() {
719 return Platform.precomputeCharMatcher(this);
720 }
721
722 /**
723 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
724 * on {@link Platform} so that we can have different behavior in GWT.
725 *
726 * <p>The default precomputation is to cache the configuration of the original matcher in an
727 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
728 * than the original.
729 *
730 * <p>The default implementation creates a new bit array and passes it to {@link
731 * #setBits(LookupTable)}.
732 */
733 CharMatcher precomputedInternal() {
734 final LookupTable table = new LookupTable();
735 setBits(table);
736 final CharMatcher outer = this;
737
738 return new CharMatcher() {
739 @Override public boolean matches(char c) {
740 return table.get(c);
741 }
742
743 // TODO(kevinb): make methods like negate() smart?
744
745 @Override public CharMatcher precomputed() {
746 return this;
747 }
748
749 @Override public String toString() {
750 return outer.toString();
751 }
752 };
753 }
754
755 CharMatcher withToString(final String toString) {
756 final CharMatcher delegate = this;
757 return new CharMatcher() {
758 @Override
759 public boolean matches(char c) {
760 return delegate.matches(c);
761 }
762
763 @Override
764 void setBits(LookupTable table) {
765 delegate.setBits(table);
766 }
767
768 @Override
769 public String toString() {
770 return toString;
771 }
772 };
773 }
774
775 /**
776 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
777 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
778 *
779 * <p>The default implementation loops over every possible character value, invoking {@link
780 * #matches} for each one.
781 */
782 void setBits(LookupTable table) {
783 char c = Character.MIN_VALUE;
784 while (true) {
785 if (matches(c)) {
786 table.set(c);
787 }
788 if (c++ == Character.MAX_VALUE) {
789 break;
790 }
791 }
792 }
793
794 /**
795 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
796 *
797 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
798 * simpler java.util.BitSet.
799 */
800 private static final class LookupTable {
801 int[] data = new int[2048];
802
803 void set(char index) {
804 data[index >> 5] |= (1 << index);
805 }
806
807 boolean get(char index) {
808 return (data[index >> 5] & (1 << index)) != 0;
809 }
810 }
811
812 // Text processing routines
813
814 /**
815 * Returns {@code true} if a character sequence contains at least one matching character.
816 * Equivalent to {@code !matchesNoneOf(sequence)}.
817 *
818 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
819 * character, until this returns {@code true} or the end is reached.
820 *
821 * @param sequence the character sequence to examine, possibly empty
822 * @return {@code true} if this matcher matches at least one character in the sequence
823 * @since 8.0
824 */
825 public boolean matchesAnyOf(CharSequence sequence) {
826 return !matchesNoneOf(sequence);
827 }
828
829 /**
830 * Returns {@code true} if a character sequence contains only matching characters.
831 *
832 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
833 * character, until this returns {@code false} or the end is reached.
834 *
835 * @param sequence the character sequence to examine, possibly empty
836 * @return {@code true} if this matcher matches every character in the sequence, including when
837 * the sequence is empty
838 */
839 public boolean matchesAllOf(CharSequence sequence) {
840 for (int i = sequence.length() - 1; i >= 0; i--) {
841 if (!matches(sequence.charAt(i))) {
842 return false;
843 }
844 }
845 return true;
846 }
847
848 /**
849 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
850 * {@code !matchesAnyOf(sequence)}.
851 *
852 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
853 * character, until this returns {@code false} or the end is reached.
854 *
855 * @param sequence the character sequence to examine, possibly empty
856 * @return {@code true} if this matcher matches every character in the sequence, including when
857 * the sequence is empty
858 */
859 public boolean matchesNoneOf(CharSequence sequence) {
860 return indexIn(sequence) == -1;
861 }
862
863 /**
864 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
865 * matching character is present.
866 *
867 * <p>The default implementation iterates over the sequence in forward order calling {@link
868 * #matches} for each character.
869 *
870 * @param sequence the character sequence to examine from the beginning
871 * @return an index, or {@code -1} if no character matches
872 */
873 public int indexIn(CharSequence sequence) {
874 int length = sequence.length();
875 for (int i = 0; i < length; i++) {
876 if (matches(sequence.charAt(i))) {
877 return i;
878 }
879 }
880 return -1;
881 }
882
883 /**
884 * Returns the index of the first matching character in a character sequence, starting from a
885 * given position, or {@code -1} if no character matches after that position.
886 *
887 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
888 * start}, calling {@link #matches} for each character.
889 *
890 * @param sequence the character sequence to examine
891 * @param start the first index to examine; must be nonnegative and no greater than {@code
892 * sequence.length()}
893 * @return the index of the first matching character, guaranteed to be no less than {@code start},
894 * or {@code -1} if no character matches
895 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
896 * sequence.length()}
897 */
898 public int indexIn(CharSequence sequence, int start) {
899 int length = sequence.length();
900 Preconditions.checkPositionIndex(start, length);
901 for (int i = start; i < length; i++) {
902 if (matches(sequence.charAt(i))) {
903 return i;
904 }
905 }
906 return -1;
907 }
908
909 /**
910 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
911 * matching character is present.
912 *
913 * <p>The default implementation iterates over the sequence in reverse order calling {@link
914 * #matches} for each character.
915 *
916 * @param sequence the character sequence to examine from the end
917 * @return an index, or {@code -1} if no character matches
918 */
919 public int lastIndexIn(CharSequence sequence) {
920 for (int i = sequence.length() - 1; i >= 0; i--) {
921 if (matches(sequence.charAt(i))) {
922 return i;
923 }
924 }
925 return -1;
926 }
927
928 /**
929 * Returns the number of matching characters found in a character sequence.
930 */
931 public int countIn(CharSequence sequence) {
932 int count = 0;
933 for (int i = 0; i < sequence.length(); i++) {
934 if (matches(sequence.charAt(i))) {
935 count++;
936 }
937 }
938 return count;
939 }
940
941 /**
942 * Returns a string containing all non-matching characters of a character sequence, in order. For
943 * example: <pre> {@code
944 *
945 * CharMatcher.is('a').removeFrom("bazaar")}</pre>
946 *
947 * ... returns {@code "bzr"}.
948 */
949 @CheckReturnValue
950 public String removeFrom(CharSequence sequence) {
951 String string = sequence.toString();
952 int pos = indexIn(string);
953 if (pos == -1) {
954 return string;
955 }
956
957 char[] chars = string.toCharArray();
958 int spread = 1;
959
960 // This unusual loop comes from extensive benchmarking
961 OUT: while (true) {
962 pos++;
963 while (true) {
964 if (pos == chars.length) {
965 break OUT;
966 }
967 if (matches(chars[pos])) {
968 break;
969 }
970 chars[pos - spread] = chars[pos];
971 pos++;
972 }
973 spread++;
974 }
975 return new String(chars, 0, pos - spread);
976 }
977
978 /**
979 * Returns a string containing all matching characters of a character sequence, in order. For
980 * example: <pre> {@code
981 *
982 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
983 *
984 * ... returns {@code "aaa"}.
985 */
986 @CheckReturnValue
987 public String retainFrom(CharSequence sequence) {
988 return negate().removeFrom(sequence);
989 }
990
991 /**
992 * Returns a string copy of the input character sequence, with each character that matches this
993 * matcher replaced by a given replacement character. For example: <pre> {@code
994 *
995 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
996 *
997 * ... returns {@code "rodor"}.
998 *
999 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1000 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1001 * character.
1002 *
1003 * @param sequence the character sequence to replace matching characters in
1004 * @param replacement the character to append to the result string in place of each matching
1005 * character in {@code sequence}
1006 * @return the new string
1007 */
1008 @CheckReturnValue
1009 public String replaceFrom(CharSequence sequence, char replacement) {
1010 String string = sequence.toString();
1011 int pos = indexIn(string);
1012 if (pos == -1) {
1013 return string;
1014 }
1015 char[] chars = string.toCharArray();
1016 chars[pos] = replacement;
1017 for (int i = pos + 1; i < chars.length; i++) {
1018 if (matches(chars[i])) {
1019 chars[i] = replacement;
1020 }
1021 }
1022 return new String(chars);
1023 }
1024
1025 /**
1026 * Returns a string copy of the input character sequence, with each character that matches this
1027 * matcher replaced by a given replacement sequence. For example: <pre> {@code
1028 *
1029 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1030 *
1031 * ... returns {@code "yoohoo"}.
1032 *
1033 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1034 * off calling {@link #replaceFrom(CharSequence, char)} directly.
1035 *
1036 * @param sequence the character sequence to replace matching characters in
1037 * @param replacement the characters to append to the result string in place of each matching
1038 * character in {@code sequence}
1039 * @return the new string
1040 */
1041 @CheckReturnValue
1042 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1043 int replacementLen = replacement.length();
1044 if (replacementLen == 0) {
1045 return removeFrom(sequence);
1046 }
1047 if (replacementLen == 1) {
1048 return replaceFrom(sequence, replacement.charAt(0));
1049 }
1050
1051 String string = sequence.toString();
1052 int pos = indexIn(string);
1053 if (pos == -1) {
1054 return string;
1055 }
1056
1057 int len = string.length();
1058 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1059
1060 int oldpos = 0;
1061 do {
1062 buf.append(string, oldpos, pos);
1063 buf.append(replacement);
1064 oldpos = pos + 1;
1065 pos = indexIn(string, oldpos);
1066 } while (pos != -1);
1067
1068 buf.append(string, oldpos, len);
1069 return buf.toString();
1070 }
1071
1072 /**
1073 * Returns a substring of the input character sequence that omits all characters this matcher
1074 * matches from the beginning and from the end of the string. For example: <pre> {@code
1075 *
1076 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1077 *
1078 * ... returns {@code "cat"}.
1079 *
1080 * <p>Note that: <pre> {@code
1081 *
1082 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1083 *
1084 * ... is equivalent to {@link String#trim()}.
1085 */
1086 @CheckReturnValue
1087 public String trimFrom(CharSequence sequence) {
1088 int len = sequence.length();
1089 int first;
1090 int last;
1091
1092 for (first = 0; first < len; first++) {
1093 if (!matches(sequence.charAt(first))) {
1094 break;
1095 }
1096 }
1097 for (last = len - 1; last > first; last--) {
1098 if (!matches(sequence.charAt(last))) {
1099 break;
1100 }
1101 }
1102
1103 return sequence.subSequence(first, last + 1).toString();
1104 }
1105
1106 /**
1107 * Returns a substring of the input character sequence that omits all characters this matcher
1108 * matches from the beginning of the string. For example: <pre> {@code
1109 *
1110 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1111 *
1112 * ... returns {@code "catbab"}.
1113 */
1114 @CheckReturnValue
1115 public String trimLeadingFrom(CharSequence sequence) {
1116 int len = sequence.length();
1117 int first;
1118
1119 for (first = 0; first < len; first++) {
1120 if (!matches(sequence.charAt(first))) {
1121 break;
1122 }
1123 }
1124
1125 return sequence.subSequence(first, len).toString();
1126 }
1127
1128 /**
1129 * Returns a substring of the input character sequence that omits all characters this matcher
1130 * matches from the end of the string. For example: <pre> {@code
1131 *
1132 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1133 *
1134 * ... returns {@code "abacat"}.
1135 */
1136 @CheckReturnValue
1137 public String trimTrailingFrom(CharSequence sequence) {
1138 int len = sequence.length();
1139 int last;
1140
1141 for (last = len - 1; last >= 0; last--) {
1142 if (!matches(sequence.charAt(last))) {
1143 break;
1144 }
1145 }
1146
1147 return sequence.subSequence(0, last + 1).toString();
1148 }
1149
1150 /**
1151 * Returns a string copy of the input character sequence, with each group of consecutive
1152 * characters that match this matcher replaced by a single replacement character. For example:
1153 * <pre> {@code
1154 *
1155 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1156 *
1157 * ... returns {@code "b-p-r"}.
1158 *
1159 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1160 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1161 * character.
1162 *
1163 * @param sequence the character sequence to replace matching groups of characters in
1164 * @param replacement the character to append to the result string in place of each group of
1165 * matching characters in {@code sequence}
1166 * @return the new string
1167 */
1168 @CheckReturnValue
1169 public String collapseFrom(CharSequence sequence, char replacement) {
1170 int first = indexIn(sequence);
1171 if (first == -1) {
1172 return sequence.toString();
1173 }
1174
1175 // TODO(kevinb): see if this implementation can be made faster
1176 StringBuilder builder = new StringBuilder(sequence.length())
1177 .append(sequence.subSequence(0, first))
1178 .append(replacement);
1179 boolean in = true;
1180 for (int i = first + 1; i < sequence.length(); i++) {
1181 char c = sequence.charAt(i);
1182 if (matches(c)) {
1183 if (!in) {
1184 builder.append(replacement);
1185 in = true;
1186 }
1187 } else {
1188 builder.append(c);
1189 in = false;
1190 }
1191 }
1192 return builder.toString();
1193 }
1194
1195 /**
1196 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1197 * groups of matching characters at the start or end of the sequence are removed without
1198 * replacement.
1199 */
1200 @CheckReturnValue
1201 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1202 int first = negate().indexIn(sequence);
1203 if (first == -1) {
1204 return ""; // everything matches. nothing's left.
1205 }
1206 StringBuilder builder = new StringBuilder(sequence.length());
1207 boolean inMatchingGroup = false;
1208 for (int i = first; i < sequence.length(); i++) {
1209 char c = sequence.charAt(i);
1210 if (matches(c)) {
1211 inMatchingGroup = true;
1212 } else {
1213 if (inMatchingGroup) {
1214 builder.append(replacement);
1215 inMatchingGroup = false;
1216 }
1217 builder.append(c);
1218 }
1219 }
1220 return builder.toString();
1221 }
1222
1223 // Predicate interface
1224
1225 /**
1226 * Returns {@code true} if this matcher matches the given character.
1227 *
1228 * @throws NullPointerException if {@code character} is null
1229 */
1230 @Override public boolean apply(Character character) {
1231 return matches(character);
1232 }
1233
1234 /**
1235 * Returns a string representation of this {@code CharMatcher}, such as
1236 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1237 */
1238 @Override
1239 public String toString() {
1240 return super.toString();
1241 }
1242
1243 /**
1244 * Determines whether a character is whitespace according to the latest Unicode standard, as
1245 * illustrated
1246 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1247 * This is not the same definition used by other Java APIs. (See a
1248 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1249 * definitions of "whitespace"</a>.)
1250 *
1251 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1252 * to date.
1253 */
1254 public static final CharMatcher WHITESPACE = new CharMatcher() {
1255 /**
1256 * A special-case CharMatcher for Unicode whitespace characters that is extremely
1257 * efficient both in space required and in time to check for matches.
1258 *
1259 * Implementation details.
1260 * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1261 * so we can construct a lookup table of exactly 79 entries, and just check the character code
1262 * mod 79, and see if that character is in the table.
1263 *
1264 * There is a 1 at the beginning of the table so that the null character is not listed
1265 * as whitespace.
1266 *
1267 * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1268 *
1269 * * Binary search into the sorted list of characters, i.e., what
1270 * CharMatcher.anyOf() does</li>
1271 * * Perfect hash function into a table of size 26 (using an offset table and a special
1272 * Jenkins hash function)</li>
1273 * * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1274 * * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1275 *
1276 * --Christopher Swenson, February 2012.
1277 */
1278
1279 // Mod-79 lookup table.
1280 private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1281 8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1282 12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1283 8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1284
1285 @Override public boolean matches(char c) {
1286 return table[c % 79] == c;
1287 }
1288
1289 @Override public CharMatcher precomputed() {
1290 return this;
1291 }
1292
1293 @Override public String toString() {
1294 return "CharMatcher.WHITESPACE";
1295 }
1296 };
1297 }