001 /*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021
022 import com.google.common.annotations.Beta;
023 import com.google.common.annotations.GwtCompatible;
024
025 import java.util.ArrayList;
026 import java.util.Arrays;
027 import java.util.List;
028
029 /**
030 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
031 * for any {@link Object}. Also offers basic text processing methods based on this function.
032 * Implementations are strongly encouraged to be side-effect-free and immutable.
033 *
034 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
035 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
036 *
037 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
038 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
039 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
040 * treats these just as two separate characters.
041 *
042 * @author Kevin Bourrillion
043 * @since 1
044 */
045 @Beta // Possibly change from chars to code points; decide constants vs. methods
046 @GwtCompatible
047 public abstract class CharMatcher implements Predicate<Character> {
048 // Constants
049
050 // Excludes 2000-2000a, which is handled as a range
051 private static final String BREAKING_WHITESPACE_CHARS =
052 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
053
054 // Excludes 2007, which is handled as a gap in a pair of ranges
055 private static final String NON_BREAKING_WHITESPACE_CHARS =
056 "\u00a0\u180e\u202f";
057
058 /**
059 * Determines whether a character is whitespace according to the latest Unicode standard, as
060 * illustrated
061 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
062 * This is not the same definition used by other Java APIs. (See a
063 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
064 * definitions of "whitespace"</a>.)
065 *
066 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
067 * to date.
068 */
069 public static final CharMatcher WHITESPACE =
070 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
071 .or(inRange('\u2000', '\u200a'))
072 .precomputed();
073
074 /**
075 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
076 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
077 * discussion of that term.
078 *
079 * @since 2
080 */
081 public static final CharMatcher BREAKING_WHITESPACE =
082 anyOf(BREAKING_WHITESPACE_CHARS)
083 .or(inRange('\u2000', '\u2006'))
084 .or(inRange('\u2008', '\u200a'))
085 .precomputed();
086
087 /**
088 * Determines whether a character is ASCII, meaning that its code point is less than 128.
089 */
090 public static final CharMatcher ASCII = inRange('\0', '\u007f');
091
092 /**
093 * Determines whether a character is a digit according to
094 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
095 */
096 public static final CharMatcher DIGIT;
097
098 static {
099 CharMatcher digit = inRange('0', '9');
100 String zeroes =
101 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
102 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
103 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
104 for (char base : zeroes.toCharArray()) {
105 digit = digit.or(inRange(base, (char) (base + 9)));
106 }
107 DIGIT = digit.precomputed();
108 }
109
110 /**
111 * Determines whether a character is whitespace according to {@link Character#isWhitespace(char)
112 * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a
113 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
114 * definitions of "whitespace"</a>.)
115 */
116 public static final CharMatcher JAVA_WHITESPACE =
117 inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal
118 .or(inRange('\u001c', '\u0020'))
119 .or(is('\u1680'))
120 .or(is('\u180e'))
121 .or(inRange('\u2000', '\u2006'))
122 .or(inRange('\u2008', '\u200b'))
123 .or(inRange('\u2028', '\u2029'))
124 .or(is('\u205f'))
125 .or(is('\u3000'))
126 .precomputed();
127
128 /**
129 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
130 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
131 */
132 public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
133 @Override public boolean matches(char c) {
134 return Character.isDigit(c);
135 }
136 };
137
138 /**
139 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
140 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
141 * inRange('a', 'z').or(inRange('A', 'Z'))}.
142 */
143 public static final CharMatcher JAVA_LETTER = new CharMatcher() {
144 @Override public boolean matches(char c) {
145 return Character.isLetter(c);
146 }
147 };
148
149 /**
150 * Determines whether a character is a letter or digit according to {@link
151 * Character#isLetterOrDigit(char) Java's definition}.
152 */
153 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
154 @Override public boolean matches(char c) {
155 return Character.isLetterOrDigit(c);
156 }
157 };
158
159 /**
160 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
161 * Java's definition}.
162 */
163 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
164 @Override public boolean matches(char c) {
165 return Character.isUpperCase(c);
166 }
167 };
168
169 /**
170 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
171 * Java's definition}.
172 */
173 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
174 @Override public boolean matches(char c) {
175 return Character.isLowerCase(c);
176 }
177 };
178
179 /**
180 * Determines whether a character is an ISO control character as specified by {@link
181 * Character#isISOControl(char)}.
182 */
183 public static final CharMatcher JAVA_ISO_CONTROL =
184 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'));
185
186 /**
187 * Determines whether a character is invisible; that is, if its Unicode category is any of
188 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
189 * PRIVATE_USE according to ICU4J.
190 */
191 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
192 .or(inRange('\u007f', '\u00a0'))
193 .or(is('\u00ad'))
194 .or(inRange('\u0600', '\u0603'))
195 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
196 .or(inRange('\u2000', '\u200f'))
197 .or(inRange('\u2028', '\u202f'))
198 .or(inRange('\u205f', '\u2064'))
199 .or(inRange('\u206a', '\u206f'))
200 .or(is('\u3000'))
201 .or(inRange('\ud800', '\uf8ff'))
202 .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
203 .precomputed();
204
205 /**
206 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
207 * errs on the side of returning {@code false} (that is, it tends to assume a character is
208 * double-width).
209 *
210 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
211 * date.
212 */
213 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
214 .or(is('\u05be'))
215 .or(inRange('\u05d0', '\u05ea'))
216 .or(is('\u05f3'))
217 .or(is('\u05f4'))
218 .or(inRange('\u0600', '\u06ff'))
219 .or(inRange('\u0750', '\u077f'))
220 .or(inRange('\u0e00', '\u0e7f'))
221 .or(inRange('\u1e00', '\u20af'))
222 .or(inRange('\u2100', '\u213a'))
223 .or(inRange('\ufb50', '\ufdff'))
224 .or(inRange('\ufe70', '\ufeff'))
225 .or(inRange('\uff61', '\uffdc'))
226 .precomputed();
227
228 /** Matches any character. */
229 public static final CharMatcher ANY =
230 new CharMatcher() {
231 @Override public boolean matches(char c) {
232 return true;
233 }
234
235 @Override public int indexIn(CharSequence sequence) {
236 return (sequence.length() == 0) ? -1 : 0;
237 }
238
239 @Override public int indexIn(CharSequence sequence, int start) {
240 int length = sequence.length();
241 Preconditions.checkPositionIndex(start, length);
242 return (start == length) ? -1 : start;
243 }
244
245 @Override public int lastIndexIn(CharSequence sequence) {
246 return sequence.length() - 1;
247 }
248
249 @Override public boolean matchesAllOf(CharSequence sequence) {
250 checkNotNull(sequence);
251 return true;
252 }
253
254 @Override public boolean matchesNoneOf(CharSequence sequence) {
255 return sequence.length() == 0;
256 }
257
258 @Override public String removeFrom(CharSequence sequence) {
259 checkNotNull(sequence);
260 return "";
261 }
262
263 @Override public String replaceFrom(CharSequence sequence, char replacement) {
264 char[] array = new char[sequence.length()];
265 Arrays.fill(array, replacement);
266 return new String(array);
267 }
268
269 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
270 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
271 for (int i = 0; i < sequence.length(); i++) {
272 retval.append(replacement);
273 }
274 return retval.toString();
275 }
276
277 @Override public String collapseFrom(CharSequence sequence, char replacement) {
278 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
279 }
280
281 @Override public String trimFrom(CharSequence sequence) {
282 checkNotNull(sequence);
283 return "";
284 }
285
286 @Override public int countIn(CharSequence sequence) {
287 return sequence.length();
288 }
289
290 @Override public CharMatcher and(CharMatcher other) {
291 return checkNotNull(other);
292 }
293
294 @Override public CharMatcher or(CharMatcher other) {
295 checkNotNull(other);
296 return this;
297 }
298
299 @Override public CharMatcher negate() {
300 return NONE;
301 }
302
303 @Override public CharMatcher precomputed() {
304 return this;
305 }
306 };
307
308 /** Matches no characters. */
309 public static final CharMatcher NONE =
310 new CharMatcher() {
311 @Override public boolean matches(char c) {
312 return false;
313 }
314
315 @Override public int indexIn(CharSequence sequence) {
316 checkNotNull(sequence);
317 return -1;
318 }
319
320 @Override public int indexIn(CharSequence sequence, int start) {
321 int length = sequence.length();
322 Preconditions.checkPositionIndex(start, length);
323 return -1;
324 }
325
326 @Override public int lastIndexIn(CharSequence sequence) {
327 checkNotNull(sequence);
328 return -1;
329 }
330
331 @Override public boolean matchesAllOf(CharSequence sequence) {
332 return sequence.length() == 0;
333 }
334
335 @Override public boolean matchesNoneOf(CharSequence sequence) {
336 checkNotNull(sequence);
337 return true;
338 }
339
340 @Override public String removeFrom(CharSequence sequence) {
341 return sequence.toString();
342 }
343
344 @Override public String replaceFrom(CharSequence sequence, char replacement) {
345 return sequence.toString();
346 }
347
348 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
349 checkNotNull(replacement);
350 return sequence.toString();
351 }
352
353 @Override public String collapseFrom(CharSequence sequence, char replacement) {
354 return sequence.toString();
355 }
356
357 @Override public String trimFrom(CharSequence sequence) {
358 return sequence.toString();
359 }
360
361 @Override public int countIn(CharSequence sequence) {
362 checkNotNull(sequence);
363 return 0;
364 }
365
366 @Override public CharMatcher and(CharMatcher other) {
367 checkNotNull(other);
368 return this;
369 }
370
371 @Override public CharMatcher or(CharMatcher other) {
372 return checkNotNull(other);
373 }
374
375 @Override public CharMatcher negate() {
376 return ANY;
377 }
378
379 @Override void setBits(LookupTable table) {}
380
381 @Override public CharMatcher precomputed() {
382 return this;
383 }
384 };
385
386 // Static factories
387
388 /**
389 * Returns a {@code char} matcher that matches only one specified character.
390 */
391 public static CharMatcher is(final char match) {
392 return new CharMatcher() {
393 @Override public boolean matches(char c) {
394 return c == match;
395 }
396
397 @Override public String replaceFrom(CharSequence sequence, char replacement) {
398 return sequence.toString().replace(match, replacement);
399 }
400
401 @Override public CharMatcher and(CharMatcher other) {
402 return other.matches(match) ? this : NONE;
403 }
404
405 @Override public CharMatcher or(CharMatcher other) {
406 return other.matches(match) ? other : super.or(other);
407 }
408
409 @Override public CharMatcher negate() {
410 return isNot(match);
411 }
412
413 @Override void setBits(LookupTable table) {
414 table.set(match);
415 }
416
417 @Override public CharMatcher precomputed() {
418 return this;
419 }
420 };
421 }
422
423 /**
424 * Returns a {@code char} matcher that matches any character except the one specified.
425 *
426 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
427 */
428 public static CharMatcher isNot(final char match) {
429 return new CharMatcher() {
430 @Override public boolean matches(char c) {
431 return c != match;
432 }
433
434 @Override public CharMatcher and(CharMatcher other) {
435 return other.matches(match) ? super.and(other) : other;
436 }
437
438 @Override public CharMatcher or(CharMatcher other) {
439 return other.matches(match) ? ANY : this;
440 }
441
442 @Override public CharMatcher negate() {
443 return is(match);
444 }
445 };
446 }
447
448 /**
449 * Returns a {@code char} matcher that matches any character present in the given character
450 * sequence.
451 */
452 public static CharMatcher anyOf(final CharSequence sequence) {
453 switch (sequence.length()) {
454 case 0:
455 return NONE;
456 case 1:
457 return is(sequence.charAt(0));
458 case 2:
459 final char match1 = sequence.charAt(0);
460 final char match2 = sequence.charAt(1);
461 return new CharMatcher() {
462 @Override public boolean matches(char c) {
463 return c == match1 || c == match2;
464 }
465
466 @Override void setBits(LookupTable table) {
467 table.set(match1);
468 table.set(match2);
469 }
470
471 @Override public CharMatcher precomputed() {
472 return this;
473 }
474 };
475 }
476
477 final char[] chars = sequence.toString().toCharArray();
478 Arrays.sort(chars); // not worth collapsing duplicates
479
480 return new CharMatcher() {
481 @Override public boolean matches(char c) {
482 return Arrays.binarySearch(chars, c) >= 0;
483 }
484
485 @Override void setBits(LookupTable table) {
486 for (char c : chars) {
487 table.set(c);
488 }
489 }
490 };
491 }
492
493 /**
494 * Returns a {@code char} matcher that matches any character not present in the given character
495 * sequence.
496 */
497 public static CharMatcher noneOf(CharSequence sequence) {
498 return anyOf(sequence).negate();
499 }
500
501 /**
502 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
503 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
504 * CharMatcher.inRange('a', 'z')}.
505 *
506 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
507 */
508 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
509 checkArgument(endInclusive >= startInclusive);
510 return new CharMatcher() {
511 @Override public boolean matches(char c) {
512 return startInclusive <= c && c <= endInclusive;
513 }
514
515 @Override void setBits(LookupTable table) {
516 char c = startInclusive;
517 while (true) {
518 table.set(c);
519 if (c++ == endInclusive) {
520 break;
521 }
522 }
523 }
524
525 @Override public CharMatcher precomputed() {
526 return this;
527 }
528 };
529 }
530
531 /**
532 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
533 * which operates on primitive {@code char} instances instead.
534 */
535 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
536 checkNotNull(predicate);
537 if (predicate instanceof CharMatcher) {
538 return (CharMatcher) predicate;
539 }
540 return new CharMatcher() {
541 @Override public boolean matches(char c) {
542 return predicate.apply(c);
543 }
544
545 @Override public boolean apply(Character character) {
546 return predicate.apply(checkNotNull(character));
547 }
548 };
549 }
550
551 // Abstract methods
552
553 /** Determines a true or false value for the given character. */
554 public abstract boolean matches(char c);
555
556 // Non-static factories
557
558 /**
559 * Returns a matcher that matches any character not matched by this matcher.
560 */
561 public CharMatcher negate() {
562 final CharMatcher original = this;
563 return new CharMatcher() {
564 @Override public boolean matches(char c) {
565 return !original.matches(c);
566 }
567
568 @Override public boolean matchesAllOf(CharSequence sequence) {
569 return original.matchesNoneOf(sequence);
570 }
571
572 @Override public boolean matchesNoneOf(CharSequence sequence) {
573 return original.matchesAllOf(sequence);
574 }
575
576 @Override public int countIn(CharSequence sequence) {
577 return sequence.length() - original.countIn(sequence);
578 }
579
580 @Override public CharMatcher negate() {
581 return original;
582 }
583 };
584 }
585
586 /**
587 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
588 */
589 public CharMatcher and(CharMatcher other) {
590 return new And(Arrays.asList(this, checkNotNull(other)));
591 }
592
593 private static class And extends CharMatcher {
594 List<CharMatcher> components;
595
596 And(List<CharMatcher> components) {
597 this.components = components; // Skip defensive copy (private)
598 }
599
600 @Override public boolean matches(char c) {
601 for (CharMatcher matcher : components) {
602 if (!matcher.matches(c)) {
603 return false;
604 }
605 }
606 return true;
607 }
608
609 @Override public CharMatcher and(CharMatcher other) {
610 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
611 newComponents.add(checkNotNull(other));
612 return new And(newComponents);
613 }
614 }
615
616 /**
617 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
618 */
619 public CharMatcher or(CharMatcher other) {
620 return new Or(Arrays.asList(this, checkNotNull(other)));
621 }
622
623 private static class Or extends CharMatcher {
624 List<CharMatcher> components;
625
626 Or(List<CharMatcher> components) {
627 this.components = components; // Skip defensive copy (private)
628 }
629
630 @Override public boolean matches(char c) {
631 for (CharMatcher matcher : components) {
632 if (matcher.matches(c)) {
633 return true;
634 }
635 }
636 return false;
637 }
638
639 @Override public CharMatcher or(CharMatcher other) {
640 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
641 newComponents.add(checkNotNull(other));
642 return new Or(newComponents);
643 }
644
645 @Override void setBits(LookupTable table) {
646 for (CharMatcher matcher : components) {
647 matcher.setBits(table);
648 }
649 }
650 }
651
652 /**
653 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
654 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
655 * worthwhile only if the precomputed matcher is queried many thousands of times.
656 *
657 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
658 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
659 * worthwhile tradeoff in a browser.
660 */
661 public CharMatcher precomputed() {
662 return Platform.precomputeCharMatcher(this);
663 }
664
665 /**
666 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
667 * on {@link Platform} so that we can have different behavior in GWT.
668 *
669 * <p>The default precomputation is to cache the configuration of the original matcher in an
670 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
671 * than the original.
672 *
673 * <p>The default implementation creates a new bit array and passes it to {@link
674 * #setBits(LookupTable)}.
675 */
676 CharMatcher precomputedInternal() {
677 final LookupTable table = new LookupTable();
678 setBits(table);
679
680 return new CharMatcher() {
681 @Override public boolean matches(char c) {
682 return table.get(c);
683 }
684
685 // TODO(kevinb): make methods like negate() smart?
686
687 @Override public CharMatcher precomputed() {
688 return this;
689 }
690 };
691 }
692
693 /**
694 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
695 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
696 *
697 * <p>The default implementation loops over every possible character value, invoking {@link
698 * #matches} for each one.
699 */
700 void setBits(LookupTable table) {
701 char c = Character.MIN_VALUE;
702 while (true) {
703 if (matches(c)) {
704 table.set(c);
705 }
706 if (c++ == Character.MAX_VALUE) {
707 break;
708 }
709 }
710 }
711
712 /**
713 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
714 *
715 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
716 * simpler java.util.BitSet.
717 */
718 private static final class LookupTable {
719 int[] data = new int[2048];
720
721 void set(char index) {
722 data[index >> 5] |= (1 << index);
723 }
724
725 boolean get(char index) {
726 return (data[index >> 5] & (1 << index)) != 0;
727 }
728 }
729
730 // Text processing routines
731
732 /**
733 * Returns {@code true} if a character sequence contains at least one matching character.
734 * Equivalent to {@code !matchesNoneOf(sequence)}.
735 *
736 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
737 * character, until this returns {@code true} or the end is reached.
738 *
739 * @param sequence the character sequence to examine, possibly empty
740 * @return {@code true} if this matcher matches at least one character in the sequence
741 * @since 8
742 */
743 public boolean matchesAnyOf(CharSequence sequence) {
744 return !matchesNoneOf(sequence);
745 }
746
747 /**
748 * Returns {@code true} if a character sequence contains only matching characters.
749 *
750 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
751 * character, until this returns {@code false} or the end is reached.
752 *
753 * @param sequence the character sequence to examine, possibly empty
754 * @return {@code true} if this matcher matches every character in the sequence, including when
755 * the sequence is empty
756 */
757 public boolean matchesAllOf(CharSequence sequence) {
758 for (int i = sequence.length() - 1; i >= 0; i--) {
759 if (!matches(sequence.charAt(i))) {
760 return false;
761 }
762 }
763 return true;
764 }
765
766 /**
767 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
768 * {@code !matchesAnyOf(sequence)}.
769 *
770 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
771 * character, until this returns {@code false} or the end is reached.
772 *
773 * @param sequence the character sequence to examine, possibly empty
774 * @return {@code true} if this matcher matches every character in the sequence, including when
775 * the sequence is empty
776 */
777 public boolean matchesNoneOf(CharSequence sequence) {
778 return indexIn(sequence) == -1;
779 }
780
781 // TODO(kevinb): add matchesAnyOf()
782
783 /**
784 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
785 * matching character is present.
786 *
787 * <p>The default implementation iterates over the sequence in forward order calling {@link
788 * #matches} for each character.
789 *
790 * @param sequence the character sequence to examine from the beginning
791 * @return an index, or {@code -1} if no character matches
792 */
793 public int indexIn(CharSequence sequence) {
794 int length = sequence.length();
795 for (int i = 0; i < length; i++) {
796 if (matches(sequence.charAt(i))) {
797 return i;
798 }
799 }
800 return -1;
801 }
802
803 /**
804 * Returns the index of the first matching character in a character sequence, starting from a
805 * given position, or {@code -1} if no character matches after that position.
806 *
807 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
808 * start}, calling {@link #matches} for each character.
809 *
810 * @param sequence the character sequence to examine
811 * @param start the first index to examine; must be nonnegative and no greater than {@code
812 * sequence.length()}
813 * @return the index of the first matching character, guaranteed to be no less than {@code start},
814 * or {@code -1} if no character matches
815 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
816 * sequence.length()}
817 */
818 public int indexIn(CharSequence sequence, int start) {
819 int length = sequence.length();
820 Preconditions.checkPositionIndex(start, length);
821 for (int i = start; i < length; i++) {
822 if (matches(sequence.charAt(i))) {
823 return i;
824 }
825 }
826 return -1;
827 }
828
829 /**
830 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
831 * matching character is present.
832 *
833 * <p>The default implementation iterates over the sequence in reverse order calling {@link
834 * #matches} for each character.
835 *
836 * @param sequence the character sequence to examine from the end
837 * @return an index, or {@code -1} if no character matches
838 */
839 public int lastIndexIn(CharSequence sequence) {
840 for (int i = sequence.length() - 1; i >= 0; i--) {
841 if (matches(sequence.charAt(i))) {
842 return i;
843 }
844 }
845 return -1;
846 }
847
848 /**
849 * Returns the number of matching characters found in a character sequence.
850 */
851 public int countIn(CharSequence sequence) {
852 int count = 0;
853 for (int i = 0; i < sequence.length(); i++) {
854 if (matches(sequence.charAt(i))) {
855 count++;
856 }
857 }
858 return count;
859 }
860
861 /**
862 * Returns a string containing all non-matching characters of a character sequence, in order. For
863 * example: <pre> {@code
864 *
865 * CharMatcher.is('a').removeFrom("bazaar")}</pre>
866 *
867 * ... returns {@code "bzr"}.
868 */
869 public String removeFrom(CharSequence sequence) {
870 String string = sequence.toString();
871 int pos = indexIn(string);
872 if (pos == -1) {
873 return string;
874 }
875
876 char[] chars = string.toCharArray();
877 int spread = 1;
878
879 // This unusual loop comes from extensive benchmarking
880 OUT: while (true) {
881 pos++;
882 while (true) {
883 if (pos == chars.length) {
884 break OUT;
885 }
886 if (matches(chars[pos])) {
887 break;
888 }
889 chars[pos - spread] = chars[pos];
890 pos++;
891 }
892 spread++;
893 }
894 return new String(chars, 0, pos - spread);
895 }
896
897 /**
898 * Returns a string containing all matching characters of a character sequence, in order. For
899 * example: <pre> {@code
900 *
901 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
902 *
903 * ... returns {@code "aaa"}.
904 */
905 public String retainFrom(CharSequence sequence) {
906 return negate().removeFrom(sequence);
907 }
908
909 /**
910 * Returns a string copy of the input character sequence, with each character that matches this
911 * matcher replaced by a given replacement character. For example: <pre> {@code
912 *
913 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
914 *
915 * ... returns {@code "rodor"}.
916 *
917 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
918 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
919 * character.
920 *
921 * @param sequence the character sequence to replace matching characters in
922 * @param replacement the character to append to the result string in place of each matching
923 * character in {@code sequence}
924 * @return the new string
925 */
926 public String replaceFrom(CharSequence sequence, char replacement) {
927 String string = sequence.toString();
928 int pos = indexIn(string);
929 if (pos == -1) {
930 return string;
931 }
932 char[] chars = string.toCharArray();
933 chars[pos] = replacement;
934 for (int i = pos + 1; i < chars.length; i++) {
935 if (matches(chars[i])) {
936 chars[i] = replacement;
937 }
938 }
939 return new String(chars);
940 }
941
942 /**
943 * Returns a string copy of the input character sequence, with each character that matches this
944 * matcher replaced by a given replacement sequence. For example: <pre> {@code
945 *
946 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
947 *
948 * ... returns {@code "yoohoo"}.
949 *
950 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
951 * off calling {@link #replaceFrom(CharSequence, char)} directly.
952 *
953 * @param sequence the character sequence to replace matching characters in
954 * @param replacement the characters to append to the result string in place of each matching
955 * character in {@code sequence}
956 * @return the new string
957 */
958 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
959 int replacementLen = replacement.length();
960 if (replacementLen == 0) {
961 return removeFrom(sequence);
962 }
963 if (replacementLen == 1) {
964 return replaceFrom(sequence, replacement.charAt(0));
965 }
966
967 String string = sequence.toString();
968 int pos = indexIn(string);
969 if (pos == -1) {
970 return string;
971 }
972
973 int len = string.length();
974 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
975
976 int oldpos = 0;
977 do {
978 buf.append(string, oldpos, pos);
979 buf.append(replacement);
980 oldpos = pos + 1;
981 pos = indexIn(string, oldpos);
982 } while (pos != -1);
983
984 buf.append(string, oldpos, len);
985 return buf.toString();
986 }
987
988 /**
989 * Returns a substring of the input character sequence that omits all characters this matcher
990 * matches from the beginning and from the end of the string. For example: <pre> {@code
991 *
992 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
993 *
994 * ... returns {@code "cat"}.
995 *
996 * <p>Note that: <pre> {@code
997 *
998 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
999 *
1000 * ... is equivalent to {@link String#trim()}.
1001 */
1002 public String trimFrom(CharSequence sequence) {
1003 int len = sequence.length();
1004 int first;
1005 int last;
1006
1007 for (first = 0; first < len; first++) {
1008 if (!matches(sequence.charAt(first))) {
1009 break;
1010 }
1011 }
1012 for (last = len - 1; last > first; last--) {
1013 if (!matches(sequence.charAt(last))) {
1014 break;
1015 }
1016 }
1017
1018 return sequence.subSequence(first, last + 1).toString();
1019 }
1020
1021 /**
1022 * Returns a substring of the input character sequence that omits all characters this matcher
1023 * matches from the beginning of the string. For example: <pre> {@code
1024 *
1025 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1026 *
1027 * ... returns {@code "catbab"}.
1028 */
1029 public String trimLeadingFrom(CharSequence sequence) {
1030 int len = sequence.length();
1031 int first;
1032
1033 for (first = 0; first < len; first++) {
1034 if (!matches(sequence.charAt(first))) {
1035 break;
1036 }
1037 }
1038
1039 return sequence.subSequence(first, len).toString();
1040 }
1041
1042 /**
1043 * Returns a substring of the input character sequence that omits all characters this matcher
1044 * matches from the end of the string. For example: <pre> {@code
1045 *
1046 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1047 *
1048 * ... returns {@code "abacat"}.
1049 */
1050 public String trimTrailingFrom(CharSequence sequence) {
1051 int len = sequence.length();
1052 int last;
1053
1054 for (last = len - 1; last >= 0; last--) {
1055 if (!matches(sequence.charAt(last))) {
1056 break;
1057 }
1058 }
1059
1060 return sequence.subSequence(0, last + 1).toString();
1061 }
1062
1063 /**
1064 * Returns a string copy of the input character sequence, with each group of consecutive
1065 * characters that match this matcher replaced by a single replacement character. For example:
1066 * <pre> {@code
1067 *
1068 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1069 *
1070 * ... returns {@code "b-p-r"}.
1071 *
1072 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1073 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1074 * character.
1075 *
1076 * @param sequence the character sequence to replace matching groups of characters in
1077 * @param replacement the character to append to the result string in place of each group of
1078 * matching characters in {@code sequence}
1079 * @return the new string
1080 */
1081 public String collapseFrom(CharSequence sequence, char replacement) {
1082 int first = indexIn(sequence);
1083 if (first == -1) {
1084 return sequence.toString();
1085 }
1086
1087 // TODO(kevinb): see if this implementation can be made faster
1088 StringBuilder builder = new StringBuilder(sequence.length())
1089 .append(sequence.subSequence(0, first))
1090 .append(replacement);
1091 boolean in = true;
1092 for (int i = first + 1; i < sequence.length(); i++) {
1093 char c = sequence.charAt(i);
1094 if (apply(c)) {
1095 if (!in) {
1096 builder.append(replacement);
1097 in = true;
1098 }
1099 } else {
1100 builder.append(c);
1101 in = false;
1102 }
1103 }
1104 return builder.toString();
1105 }
1106
1107 /**
1108 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1109 * groups of matching characters at the start or end of the sequence are removed without
1110 * replacement.
1111 */
1112 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1113 int first = negate().indexIn(sequence);
1114 if (first == -1) {
1115 return ""; // everything matches. nothing's left.
1116 }
1117 StringBuilder builder = new StringBuilder(sequence.length());
1118 boolean inMatchingGroup = false;
1119 for (int i = first; i < sequence.length(); i++) {
1120 char c = sequence.charAt(i);
1121 if (apply(c)) {
1122 inMatchingGroup = true;
1123 } else {
1124 if (inMatchingGroup) {
1125 builder.append(replacement);
1126 inMatchingGroup = false;
1127 }
1128 builder.append(c);
1129 }
1130 }
1131 return builder.toString();
1132 }
1133
1134 // Predicate interface
1135
1136 /**
1137 * Returns {@code true} if this matcher matches the given character.
1138 *
1139 * @throws NullPointerException if {@code character} is null
1140 */
1141 @Override public boolean apply(Character character) {
1142 return matches(character);
1143 }
1144 }