001 /*
002 * Copyright (C) 2008 Google Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the
010 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
011 * express or implied. See the License for the specific language governing permissions and
012 * limitations under the License.
013 */
014
015 package com.google.common.base;
016
017 import static com.google.common.base.Preconditions.checkArgument;
018 import static com.google.common.base.Preconditions.checkNotNull;
019
020 import com.google.common.annotations.Beta;
021 import com.google.common.annotations.GwtCompatible;
022
023 import java.util.ArrayList;
024 import java.util.Arrays;
025 import java.util.List;
026
027 /**
028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
029 * for any {@link Object}. Also offers basic text processing methods based on this function.
030 * Implementations are strongly encouraged to be side-effect-free and immutable.
031 *
032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
033 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
034 *
035 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
036 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
037 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
038 * treats these just as two separate characters.
039 *
040 * @author Kevin Bourrillion
041 * @since 1
042 */
043 @Beta // Possibly change from chars to code points; decide constants vs. methods
044 @GwtCompatible
045 public abstract class CharMatcher implements Predicate<Character> {
046 // Constants
047
048 // Excludes 2000-2000a, which is handled as a range
049 private static final String BREAKING_WHITESPACE_CHARS =
050 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";
051
052 // Excludes 2007, which is handled as a gap in a pair of ranges
053 private static final String NON_BREAKING_WHITESPACE_CHARS =
054 "\u00a0\u180e\u202f";
055
056 /**
057 * Determines whether a character is whitespace according to the latest Unicode standard, as
058 * illustrated
059 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
060 * This is not the same definition used by other Java APIs. (See a
061 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
062 * definitions of "whitespace"</a>.)
063 *
064 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
065 * to date.
066 */
067 public static final CharMatcher WHITESPACE =
068 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
069 .or(inRange('\u2000', '\u200a'))
070 .precomputed();
071
072 /**
073 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
074 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
075 * discussion of that term.
076 *
077 * @since 2
078 */
079 public static final CharMatcher BREAKING_WHITESPACE =
080 anyOf(BREAKING_WHITESPACE_CHARS)
081 .or(inRange('\u2000', '\u2006'))
082 .or(inRange('\u2008', '\u200a'))
083 .precomputed();
084
085 /**
086 * Determines whether a character is ASCII, meaning that its code point is less than 128.
087 */
088 public static final CharMatcher ASCII = inRange('\0', '\u007f');
089
/**
 * Matches digits as defined by
 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
 */
public static final CharMatcher DIGIT;

static {
  // First character of each non-ASCII digit block; every block spans ten consecutive characters.
  String zeroes =
      "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
          + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
          + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
  CharMatcher union = inRange('0', '9');
  for (int i = 0; i < zeroes.length(); i++) {
    char zero = zeroes.charAt(i);
    char nine = (char) (zero + 9);
    union = union.or(inRange(zero, nine));
  }
  DIGIT = union.precomputed();
}
107
108 /**
109 * Determines whether a character is whitespace according to {@link Character#isWhitespace(char)
110 * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a
111 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
112 * definitions of "whitespace"</a>.)
113 */
114 public static final CharMatcher JAVA_WHITESPACE =
115 inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal
116 .or(inRange('\u001c', '\u0020'))
117 .or(is('\u1680'))
118 .or(is('\u180e'))
119 .or(inRange('\u2000', '\u2006'))
120 .or(inRange('\u2008', '\u200b'))
121 .or(inRange('\u2028', '\u2029'))
122 .or(is('\u205f'))
123 .or(is('\u3000'))
124 .precomputed();
125
126 /**
127 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
128 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
129 */
130 public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
131 @Override public boolean matches(char c) {
132 return Character.isDigit(c);
133 }
134 };
135
136 /**
137 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
138 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
139 * inRange('a', 'z').or(inRange('A', 'Z'))}.
140 */
141 public static final CharMatcher JAVA_LETTER = new CharMatcher() {
142 @Override public boolean matches(char c) {
143 return Character.isLetter(c);
144 }
145 };
146
147 /**
148 * Determines whether a character is a letter or digit according to {@link
149 * Character#isLetterOrDigit(char) Java's definition}.
150 */
151 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
152 @Override public boolean matches(char c) {
153 return Character.isLetterOrDigit(c);
154 }
155 };
156
157 /**
158 * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
159 * Java's definition}.
160 */
161 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
162 @Override public boolean matches(char c) {
163 return Character.isUpperCase(c);
164 }
165 };
166
167 /**
168 * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
169 * Java's definition}.
170 */
171 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
172 @Override public boolean matches(char c) {
173 return Character.isLowerCase(c);
174 }
175 };
176
177 /**
178 * Determines whether a character is an ISO control character as specified by {@link
179 * Character#isISOControl(char)}.
180 */
181 public static final CharMatcher JAVA_ISO_CONTROL =
182 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'));
183
184 /**
185 * Determines whether a character is invisible; that is, if its Unicode category is any of
186 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
187 * PRIVATE_USE according to ICU4J.
188 */
189 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
190 .or(inRange('\u007f', '\u00a0'))
191 .or(is('\u00ad'))
192 .or(inRange('\u0600', '\u0603'))
193 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
194 .or(inRange('\u2000', '\u200f'))
195 .or(inRange('\u2028', '\u202f'))
196 .or(inRange('\u205f', '\u2064'))
197 .or(inRange('\u206a', '\u206f'))
198 .or(is('\u3000'))
199 .or(inRange('\ud800', '\uf8ff'))
200 .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
201 .precomputed();
202
203 /**
204 * Determines whether a character is single-width (not double-width). When in doubt, this matcher
205 * errs on the side of returning {@code false} (that is, it tends to assume a character is
206 * double-width).
207 *
208 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
209 * date.
210 */
211 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
212 .or(is('\u05be'))
213 .or(inRange('\u05d0', '\u05ea'))
214 .or(is('\u05f3'))
215 .or(is('\u05f4'))
216 .or(inRange('\u0600', '\u06ff'))
217 .or(inRange('\u0750', '\u077f'))
218 .or(inRange('\u0e00', '\u0e7f'))
219 .or(inRange('\u1e00', '\u20af'))
220 .or(inRange('\u2100', '\u213a'))
221 .or(inRange('\ufb50', '\ufdff'))
222 .or(inRange('\ufe70', '\ufeff'))
223 .or(inRange('\uff61', '\uffdc'))
224 .precomputed();
225
226 /** Matches any character. */
227 public static final CharMatcher ANY =
228 new CharMatcher() {
229 @Override public boolean matches(char c) {
230 return true;
231 }
232
233 @Override public int indexIn(CharSequence sequence) {
234 return (sequence.length() == 0) ? -1 : 0;
235 }
236
237 @Override public int indexIn(CharSequence sequence, int start) {
238 int length = sequence.length();
239 Preconditions.checkPositionIndex(start, length);
240 return (start == length) ? -1 : start;
241 }
242
243 @Override public int lastIndexIn(CharSequence sequence) {
244 return sequence.length() - 1;
245 }
246
247 @Override public boolean matchesAllOf(CharSequence sequence) {
248 checkNotNull(sequence);
249 return true;
250 }
251
252 @Override public boolean matchesNoneOf(CharSequence sequence) {
253 return sequence.length() == 0;
254 }
255
256 @Override public String removeFrom(CharSequence sequence) {
257 checkNotNull(sequence);
258 return "";
259 }
260
261 @Override public String replaceFrom(CharSequence sequence, char replacement) {
262 char[] array = new char[sequence.length()];
263 Arrays.fill(array, replacement);
264 return new String(array);
265 }
266
267 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
268 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
269 for (int i = 0; i < sequence.length(); i++) {
270 retval.append(replacement);
271 }
272 return retval.toString();
273 }
274
275 @Override public String collapseFrom(CharSequence sequence, char replacement) {
276 return (sequence.length() == 0) ? "" : String.valueOf(replacement);
277 }
278
279 @Override public String trimFrom(CharSequence sequence) {
280 checkNotNull(sequence);
281 return "";
282 }
283
284 @Override public int countIn(CharSequence sequence) {
285 return sequence.length();
286 }
287
288 @Override public CharMatcher and(CharMatcher other) {
289 return checkNotNull(other);
290 }
291
292 @Override public CharMatcher or(CharMatcher other) {
293 checkNotNull(other);
294 return this;
295 }
296
297 @Override public CharMatcher negate() {
298 return NONE;
299 }
300
301 @Override public CharMatcher precomputed() {
302 return this;
303 }
304 };
305
306 /** Matches no characters. */
307 public static final CharMatcher NONE =
308 new CharMatcher() {
309 @Override public boolean matches(char c) {
310 return false;
311 }
312
313 @Override public int indexIn(CharSequence sequence) {
314 checkNotNull(sequence);
315 return -1;
316 }
317
318 @Override public int indexIn(CharSequence sequence, int start) {
319 int length = sequence.length();
320 Preconditions.checkPositionIndex(start, length);
321 return -1;
322 }
323
324 @Override public int lastIndexIn(CharSequence sequence) {
325 checkNotNull(sequence);
326 return -1;
327 }
328
329 @Override public boolean matchesAllOf(CharSequence sequence) {
330 return sequence.length() == 0;
331 }
332
333 @Override public boolean matchesNoneOf(CharSequence sequence) {
334 checkNotNull(sequence);
335 return true;
336 }
337
338 @Override public String removeFrom(CharSequence sequence) {
339 return sequence.toString();
340 }
341
342 @Override public String replaceFrom(CharSequence sequence, char replacement) {
343 return sequence.toString();
344 }
345
346 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
347 checkNotNull(replacement);
348 return sequence.toString();
349 }
350
351 @Override public String collapseFrom(CharSequence sequence, char replacement) {
352 return sequence.toString();
353 }
354
355 @Override public String trimFrom(CharSequence sequence) {
356 return sequence.toString();
357 }
358
359 @Override public int countIn(CharSequence sequence) {
360 checkNotNull(sequence);
361 return 0;
362 }
363
364 @Override public CharMatcher and(CharMatcher other) {
365 checkNotNull(other);
366 return this;
367 }
368
369 @Override public CharMatcher or(CharMatcher other) {
370 return checkNotNull(other);
371 }
372
373 @Override public CharMatcher negate() {
374 return ANY;
375 }
376
377 @Override void setBits(LookupTable table) {}
378
379 @Override public CharMatcher precomputed() {
380 return this;
381 }
382 };
383
384 // Static factories
385
386 /**
387 * Returns a {@code char} matcher that matches only one specified character.
388 */
389 public static CharMatcher is(final char match) {
390 return new CharMatcher() {
391 @Override public boolean matches(char c) {
392 return c == match;
393 }
394
395 @Override public String replaceFrom(CharSequence sequence, char replacement) {
396 return sequence.toString().replace(match, replacement);
397 }
398
399 @Override public CharMatcher and(CharMatcher other) {
400 return other.matches(match) ? this : NONE;
401 }
402
403 @Override public CharMatcher or(CharMatcher other) {
404 return other.matches(match) ? other : super.or(other);
405 }
406
407 @Override public CharMatcher negate() {
408 return isNot(match);
409 }
410
411 @Override void setBits(LookupTable table) {
412 table.set(match);
413 }
414
415 @Override public CharMatcher precomputed() {
416 return this;
417 }
418 };
419 }
420
421 /**
422 * Returns a {@code char} matcher that matches any character except the one specified.
423 *
424 * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
425 */
426 public static CharMatcher isNot(final char match) {
427 return new CharMatcher() {
428 @Override public boolean matches(char c) {
429 return c != match;
430 }
431
432 @Override public CharMatcher and(CharMatcher other) {
433 return other.matches(match) ? super.and(other) : other;
434 }
435
436 @Override public CharMatcher or(CharMatcher other) {
437 return other.matches(match) ? ANY : this;
438 }
439
440 @Override public CharMatcher negate() {
441 return is(match);
442 }
443 };
444 }
445
446 /**
447 * Returns a {@code char} matcher that matches any character present in the given character
448 * sequence.
449 */
450 public static CharMatcher anyOf(final CharSequence sequence) {
451 switch (sequence.length()) {
452 case 0:
453 return NONE;
454 case 1:
455 return is(sequence.charAt(0));
456 case 2:
457 final char match1 = sequence.charAt(0);
458 final char match2 = sequence.charAt(1);
459 return new CharMatcher() {
460 @Override public boolean matches(char c) {
461 return c == match1 || c == match2;
462 }
463
464 @Override void setBits(LookupTable table) {
465 table.set(match1);
466 table.set(match2);
467 }
468
469 @Override public CharMatcher precomputed() {
470 return this;
471 }
472 };
473 }
474
475 final char[] chars = sequence.toString().toCharArray();
476 Arrays.sort(chars); // not worth collapsing duplicates
477
478 return new CharMatcher() {
479 @Override public boolean matches(char c) {
480 return Arrays.binarySearch(chars, c) >= 0;
481 }
482
483 @Override void setBits(LookupTable table) {
484 for (char c : chars) {
485 table.set(c);
486 }
487 }
488 };
489 }
490
491 /**
492 * Returns a {@code char} matcher that matches any character not present in the given character
493 * sequence.
494 */
495 public static CharMatcher noneOf(CharSequence sequence) {
496 return anyOf(sequence).negate();
497 }
498
499 /**
500 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
501 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
502 * CharMatcher.inRange('a', 'z')}.
503 *
504 * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
505 */
506 public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
507 checkArgument(endInclusive >= startInclusive);
508 return new CharMatcher() {
509 @Override public boolean matches(char c) {
510 return startInclusive <= c && c <= endInclusive;
511 }
512
513 @Override void setBits(LookupTable table) {
514 char c = startInclusive;
515 while (true) {
516 table.set(c);
517 if (c++ == endInclusive) {
518 break;
519 }
520 }
521 }
522
523 @Override public CharMatcher precomputed() {
524 return this;
525 }
526 };
527 }
528
529 /**
530 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
531 * which operates on primitive {@code char} instances instead.
532 */
533 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
534 checkNotNull(predicate);
535 if (predicate instanceof CharMatcher) {
536 return (CharMatcher) predicate;
537 }
538 return new CharMatcher() {
539 @Override public boolean matches(char c) {
540 return predicate.apply(c);
541 }
542
543 @Override public boolean apply(Character character) {
544 return predicate.apply(checkNotNull(character));
545 }
546 };
547 }
548
549 // Abstract methods
550
/**
 * Determines a true or false value for the given character.
 *
 * @param c the character to test
 * @return {@code true} if this matcher matches {@code c}
 */
public abstract boolean matches(char c);
553
554 // Non-static factories
555
556 /**
557 * Returns a matcher that matches any character not matched by this matcher.
558 */
559 public CharMatcher negate() {
560 final CharMatcher original = this;
561 return new CharMatcher() {
562 @Override public boolean matches(char c) {
563 return !original.matches(c);
564 }
565
566 @Override public boolean matchesAllOf(CharSequence sequence) {
567 return original.matchesNoneOf(sequence);
568 }
569
570 @Override public boolean matchesNoneOf(CharSequence sequence) {
571 return original.matchesAllOf(sequence);
572 }
573
574 @Override public int countIn(CharSequence sequence) {
575 return sequence.length() - original.countIn(sequence);
576 }
577
578 @Override public CharMatcher negate() {
579 return original;
580 }
581 };
582 }
583
584 /**
585 * Returns a matcher that matches any character matched by both this matcher and {@code other}.
586 */
587 public CharMatcher and(CharMatcher other) {
588 return new And(Arrays.asList(this, checkNotNull(other)));
589 }
590
591 private static class And extends CharMatcher {
592 List<CharMatcher> components;
593
594 And(List<CharMatcher> components) {
595 this.components = components; // Skip defensive copy (private)
596 }
597
598 @Override public boolean matches(char c) {
599 for (CharMatcher matcher : components) {
600 if (!matcher.matches(c)) {
601 return false;
602 }
603 }
604 return true;
605 }
606
607 @Override public CharMatcher and(CharMatcher other) {
608 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
609 newComponents.add(checkNotNull(other));
610 return new And(newComponents);
611 }
612 }
613
614 /**
615 * Returns a matcher that matches any character matched by either this matcher or {@code other}.
616 */
617 public CharMatcher or(CharMatcher other) {
618 return new Or(Arrays.asList(this, checkNotNull(other)));
619 }
620
621 private static class Or extends CharMatcher {
622 List<CharMatcher> components;
623
624 Or(List<CharMatcher> components) {
625 this.components = components; // Skip defensive copy (private)
626 }
627
628 @Override public boolean matches(char c) {
629 for (CharMatcher matcher : components) {
630 if (matcher.matches(c)) {
631 return true;
632 }
633 }
634 return false;
635 }
636
637 @Override public CharMatcher or(CharMatcher other) {
638 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
639 newComponents.add(checkNotNull(other));
640 return new Or(newComponents);
641 }
642
643 @Override void setBits(LookupTable table) {
644 for (CharMatcher matcher : components) {
645 matcher.setBits(table);
646 }
647 }
648 }
649
650 /**
651 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
652 * query than the original; your mileage may vary. Precomputation takes time and is likely to be
653 * worthwhile only if the precomputed matcher is queried many thousands of times.
654 *
655 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
656 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
657 * worthwhile tradeoff in a browser.
658 */
659 public CharMatcher precomputed() {
660 return Platform.precomputeCharMatcher(this);
661 }
662
663 /**
664 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
665 * on {@link Platform} so that we can have different behavior in GWT.
666 *
667 * <p>The default precomputation is to cache the configuration of the original matcher in an
668 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
669 * than the original.
670 *
671 * <p>The default implementation creates a new bit array and passes it to {@link
672 * #setBits(LookupTable)}.
673 */
674 CharMatcher precomputedInternal() {
675 final LookupTable table = new LookupTable();
676 setBits(table);
677
678 return new CharMatcher() {
679 @Override public boolean matches(char c) {
680 return table.get(c);
681 }
682
683 // TODO(kevinb): make methods like negate() smart?
684
685 @Override public CharMatcher precomputed() {
686 return this;
687 }
688 };
689 }
690
691 /**
692 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
693 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
694 *
695 * <p>The default implementation loops over every possible character value, invoking {@link
696 * #matches} for each one.
697 */
698 void setBits(LookupTable table) {
699 char c = Character.MIN_VALUE;
700 while (true) {
701 if (matches(c)) {
702 table.set(c);
703 }
704 if (c++ == Character.MAX_VALUE) {
705 break;
706 }
707 }
708 }
709
710 /**
711 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
712 *
713 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
714 * simpler java.util.BitSet.
715 */
716 private static final class LookupTable {
717 int[] data = new int[2048];
718
719 void set(char index) {
720 data[index >> 5] |= (1 << index);
721 }
722
723 boolean get(char index) {
724 return (data[index >> 5] & (1 << index)) != 0;
725 }
726 }
727
728 // Text processing routines
729
730 /**
731 * Returns {@code true} if a character sequence contains at least one matching character.
732 * Equivalent to {@code !matchesNoneOf(sequence)}.
733 *
734 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
735 * character, until this returns {@code true} or the end is reached.
736 *
737 * @param sequence the character sequence to examine, possibly empty
738 * @return {@code true} if this matcher matches at least one character in the sequence
739 * @since 8
740 */
741 public boolean matchesAnyOf(CharSequence sequence) {
742 return !matchesNoneOf(sequence);
743 }
744
745 /**
746 * Returns {@code true} if a character sequence contains only matching characters.
747 *
748 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
749 * character, until this returns {@code false} or the end is reached.
750 *
751 * @param sequence the character sequence to examine, possibly empty
752 * @return {@code true} if this matcher matches every character in the sequence, including when
753 * the sequence is empty
754 */
755 public boolean matchesAllOf(CharSequence sequence) {
756 for (int i = sequence.length() - 1; i >= 0; i--) {
757 if (!matches(sequence.charAt(i))) {
758 return false;
759 }
760 }
761 return true;
762 }
763
764 /**
765 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
766 * {@code !matchesAnyOf(sequence)}.
767 *
768 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
769 * character, until this returns {@code false} or the end is reached.
770 *
771 * @param sequence the character sequence to examine, possibly empty
772 * @return {@code true} if this matcher matches every character in the sequence, including when
773 * the sequence is empty
774 */
775 public boolean matchesNoneOf(CharSequence sequence) {
776 return indexIn(sequence) == -1;
777 }
778
// Note: matchesAnyOf() is defined above, next to matchesAllOf() and matchesNoneOf().
780
781 /**
782 * Returns the index of the first matching character in a character sequence, or {@code -1} if no
783 * matching character is present.
784 *
785 * <p>The default implementation iterates over the sequence in forward order calling {@link
786 * #matches} for each character.
787 *
788 * @param sequence the character sequence to examine from the beginning
789 * @return an index, or {@code -1} if no character matches
790 */
791 public int indexIn(CharSequence sequence) {
792 int length = sequence.length();
793 for (int i = 0; i < length; i++) {
794 if (matches(sequence.charAt(i))) {
795 return i;
796 }
797 }
798 return -1;
799 }
800
801 /**
802 * Returns the index of the first matching character in a character sequence, starting from a
803 * given position, or {@code -1} if no character matches after that position.
804 *
805 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
806 * start}, calling {@link #matches} for each character.
807 *
808 * @param sequence the character sequence to examine
809 * @param start the first index to examine; must be nonnegative and no greater than {@code
810 * sequence.length()}
811 * @return the index of the first matching character, guaranteed to be no less than {@code start},
812 * or {@code -1} if no character matches
813 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
814 * sequence.length()}
815 */
816 public int indexIn(CharSequence sequence, int start) {
817 int length = sequence.length();
818 Preconditions.checkPositionIndex(start, length);
819 for (int i = start; i < length; i++) {
820 if (matches(sequence.charAt(i))) {
821 return i;
822 }
823 }
824 return -1;
825 }
826
827 /**
828 * Returns the index of the last matching character in a character sequence, or {@code -1} if no
829 * matching character is present.
830 *
831 * <p>The default implementation iterates over the sequence in reverse order calling {@link
832 * #matches} for each character.
833 *
834 * @param sequence the character sequence to examine from the end
835 * @return an index, or {@code -1} if no character matches
836 */
837 public int lastIndexIn(CharSequence sequence) {
838 for (int i = sequence.length() - 1; i >= 0; i--) {
839 if (matches(sequence.charAt(i))) {
840 return i;
841 }
842 }
843 return -1;
844 }
845
846 /**
847 * Returns the number of matching characters found in a character sequence.
848 */
849 public int countIn(CharSequence sequence) {
850 int count = 0;
851 for (int i = 0; i < sequence.length(); i++) {
852 if (matches(sequence.charAt(i))) {
853 count++;
854 }
855 }
856 return count;
857 }
858
/**
 * Returns a string containing all non-matching characters of a character sequence, in order. For
 * example: <pre>   {@code
 *
 *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
 *
 * ... returns {@code "bzr"}.
 *
 * <p>When no character matches, the result of {@code sequence.toString()} is returned as is.
 *
 * @param sequence the character sequence to remove matching characters from
 * @return the characters of {@code sequence} that this matcher does not match, in order
 */
public String removeFrom(CharSequence sequence) {
  String string = sequence.toString();
  int pos = indexIn(string);
  if (pos == -1) {
    return string;
  }

  char[] chars = string.toCharArray();
  // Number of matching characters found so far, i.e. how far kept characters are shifted left.
  int spread = 1;

  // This unusual loop comes from extensive benchmarking
  OUT: while (true) {
    pos++; // step past the matching character that was just found
    while (true) {
      if (pos == chars.length) {
        break OUT;
      }
      if (matches(chars[pos])) {
        break; // one more character to drop; the shift is widened below
      }
      chars[pos - spread] = chars[pos]; // keep this character, compacting the array in place
      pos++;
    }
    spread++;
  }
  // pos equals chars.length here, so pos - spread is the number of characters that were kept.
  return new String(chars, 0, pos - spread);
}
894
895 /**
896 * Returns a string containing all matching characters of a character sequence, in order. For
897 * example: <pre> {@code
898 *
899 * CharMatcher.is('a').retainFrom("bazaar")}</pre>
900 *
901 * ... returns {@code "aaa"}.
902 */
903 public String retainFrom(CharSequence sequence) {
904 return negate().removeFrom(sequence);
905 }
906
907 /**
908 * Returns a string copy of the input character sequence, with each character that matches this
909 * matcher replaced by a given replacement character. For example: <pre> {@code
910 *
911 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
912 *
913 * ... returns {@code "rodor"}.
914 *
915 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
916 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
917 * character.
918 *
919 * @param sequence the character sequence to replace matching characters in
920 * @param replacement the character to append to the result string in place of each matching
921 * character in {@code sequence}
922 * @return the new string
923 */
924 public String replaceFrom(CharSequence sequence, char replacement) {
925 String string = sequence.toString();
926 int pos = indexIn(string);
927 if (pos == -1) {
928 return string;
929 }
930 char[] chars = string.toCharArray();
931 chars[pos] = replacement;
932 for (int i = pos + 1; i < chars.length; i++) {
933 if (matches(chars[i])) {
934 chars[i] = replacement;
935 }
936 }
937 return new String(chars);
938 }
939
940 /**
941 * Returns a string copy of the input character sequence, with each character that matches this
942 * matcher replaced by a given replacement sequence. For example: <pre> {@code
943 *
944 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
945 *
946 * ... returns {@code "yoohoo"}.
947 *
948 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
949 * off calling {@link #replaceFrom(CharSequence, char)} directly.
950 *
951 * @param sequence the character sequence to replace matching characters in
952 * @param replacement the characters to append to the result string in place of each matching
953 * character in {@code sequence}
954 * @return the new string
955 */
956 public String replaceFrom(CharSequence sequence, CharSequence replacement) {
957 int replacementLen = replacement.length();
958 if (replacementLen == 0) {
959 return removeFrom(sequence);
960 }
961 if (replacementLen == 1) {
962 return replaceFrom(sequence, replacement.charAt(0));
963 }
964
965 String string = sequence.toString();
966 int pos = indexIn(string);
967 if (pos == -1) {
968 return string;
969 }
970
971 int len = string.length();
972 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
973
974 int oldpos = 0;
975 do {
976 buf.append(string, oldpos, pos);
977 buf.append(replacement);
978 oldpos = pos + 1;
979 pos = indexIn(string, oldpos);
980 } while (pos != -1);
981
982 buf.append(string, oldpos, len);
983 return buf.toString();
984 }
985
986 /**
987 * Returns a substring of the input character sequence that omits all characters this matcher
988 * matches from the beginning and from the end of the string. For example: <pre> {@code
989 *
990 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
991 *
992 * ... returns {@code "cat"}.
993 *
994 * <p>Note that: <pre> {@code
995 *
996 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
997 *
998 * ... is equivalent to {@link String#trim()}.
999 */
1000 public String trimFrom(CharSequence sequence) {
1001 int len = sequence.length();
1002 int first;
1003 int last;
1004
1005 for (first = 0; first < len; first++) {
1006 if (!matches(sequence.charAt(first))) {
1007 break;
1008 }
1009 }
1010 for (last = len - 1; last > first; last--) {
1011 if (!matches(sequence.charAt(last))) {
1012 break;
1013 }
1014 }
1015
1016 return sequence.subSequence(first, last + 1).toString();
1017 }
1018
1019 /**
1020 * Returns a substring of the input character sequence that omits all characters this matcher
1021 * matches from the beginning of the string. For example: <pre> {@code
1022 *
1023 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1024 *
1025 * ... returns {@code "catbab"}.
1026 */
1027 public String trimLeadingFrom(CharSequence sequence) {
1028 int len = sequence.length();
1029 int first;
1030
1031 for (first = 0; first < len; first++) {
1032 if (!matches(sequence.charAt(first))) {
1033 break;
1034 }
1035 }
1036
1037 return sequence.subSequence(first, len).toString();
1038 }
1039
1040 /**
1041 * Returns a substring of the input character sequence that omits all characters this matcher
1042 * matches from the end of the string. For example: <pre> {@code
1043 *
1044 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1045 *
1046 * ... returns {@code "abacat"}.
1047 */
1048 public String trimTrailingFrom(CharSequence sequence) {
1049 int len = sequence.length();
1050 int last;
1051
1052 for (last = len - 1; last >= 0; last--) {
1053 if (!matches(sequence.charAt(last))) {
1054 break;
1055 }
1056 }
1057
1058 return sequence.subSequence(0, last + 1).toString();
1059 }
1060
1061 /**
1062 * Returns a string copy of the input character sequence, with each group of consecutive
1063 * characters that match this matcher replaced by a single replacement character. For example:
1064 * <pre> {@code
1065 *
1066 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1067 *
1068 * ... returns {@code "b-p-r"}.
1069 *
1070 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1071 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1072 * character.
1073 *
1074 * @param sequence the character sequence to replace matching groups of characters in
1075 * @param replacement the character to append to the result string in place of each group of
1076 * matching characters in {@code sequence}
1077 * @return the new string
1078 */
1079 public String collapseFrom(CharSequence sequence, char replacement) {
1080 int first = indexIn(sequence);
1081 if (first == -1) {
1082 return sequence.toString();
1083 }
1084
1085 // TODO(kevinb): see if this implementation can be made faster
1086 StringBuilder builder = new StringBuilder(sequence.length())
1087 .append(sequence.subSequence(0, first))
1088 .append(replacement);
1089 boolean in = true;
1090 for (int i = first + 1; i < sequence.length(); i++) {
1091 char c = sequence.charAt(i);
1092 if (apply(c)) {
1093 if (!in) {
1094 builder.append(replacement);
1095 in = true;
1096 }
1097 } else {
1098 builder.append(c);
1099 in = false;
1100 }
1101 }
1102 return builder.toString();
1103 }
1104
1105 /**
1106 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1107 * groups of matching characters at the start or end of the sequence are removed without
1108 * replacement.
1109 */
1110 public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1111 int first = negate().indexIn(sequence);
1112 if (first == -1) {
1113 return ""; // everything matches. nothing's left.
1114 }
1115 StringBuilder builder = new StringBuilder(sequence.length());
1116 boolean inMatchingGroup = false;
1117 for (int i = first; i < sequence.length(); i++) {
1118 char c = sequence.charAt(i);
1119 if (apply(c)) {
1120 inMatchingGroup = true;
1121 } else {
1122 if (inMatchingGroup) {
1123 builder.append(replacement);
1124 inMatchingGroup = false;
1125 }
1126 builder.append(c);
1127 }
1128 }
1129 return builder.toString();
1130 }
1131
1132 // Predicate interface
1133
1134 /**
1135 * Returns {@code true} if this matcher matches the given character.
1136 *
1137 * @throws NullPointerException if {@code character} is null
1138 */
1139 @Override public boolean apply(Character character) {
1140 return matches(character);
1141 }
1142 }