/*
 * Copyright (C) 2008 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
 * for any {@link Object}. Also offers basic text processing methods based on this function.
 * Implementations are strongly encouraged to be side-effect-free and immutable.
 *
 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
 *
 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
 * treats these just as two separate characters.
 *
 * @author Kevin Bourrillion
 * @since 1
 */
@Beta // Possibly change from chars to code points; decide constants vs. methods
@GwtCompatible
public abstract class CharMatcher implements Predicate<Character> {
  // Constants

  // Excludes 2000-200a, which is handled as a range
  private static final String BREAKING_WHITESPACE_CHARS =
      "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";

  // Excludes 2007, which is handled as a gap in a pair of ranges
  private static final String NON_BREAKING_WHITESPACE_CHARS =
      "\u00a0\u180e\u202f";

  /**
   * Determines whether a character is whitespace according to the latest Unicode standard, as
   * illustrated
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
   * definitions of "whitespace"</a>.)
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
   * to date.
   */
  public static final CharMatcher WHITESPACE =
      anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
          .or(inRange('\u2000', '\u200a'))
          .precomputed();

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
   * discussion of that term.
   *
   * @since 2
   */
  public static final CharMatcher BREAKING_WHITESPACE =
      anyOf(BREAKING_WHITESPACE_CHARS)
          .or(inRange('\u2000', '\u2006'))
          .or(inRange('\u2008', '\u200a'))
          .precomputed();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   */
  public static final CharMatcher ASCII = inRange('\0', '\u007f');

  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
   */
  public static final CharMatcher DIGIT;

  static {
    CharMatcher digit = inRange('0', '9');
    // Each entry is the "zero" of a run of ten consecutive digit characters.
    String zeroes =
        "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
            + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
            + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
    for (char base : zeroes.toCharArray()) {
      digit = digit.or(inRange(base, (char) (base + 9)));
    }
    DIGIT = digit.precomputed();
  }

  /**
   * Determines whether a character is whitespace according to {@link Character#isWhitespace(char)
   * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a
   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
   * definitions of "whitespace"</a>.)
   */
  public static final CharMatcher JAVA_WHITESPACE =
      inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal
          .or(inRange('\u001c', '\u0020'))
          .or(is('\u1680'))
          .or(is('\u180e'))
          .or(inRange('\u2000', '\u2006'))
          .or(inRange('\u2008', '\u200b'))
          .or(inRange('\u2028', '\u2029'))
          .or(is('\u205f'))
          .or(is('\u3000'))
          .precomputed();

  /**
   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
   */
  public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isDigit(c);
    }
  };

  /**
   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
   * inRange('a', 'z').or(inRange('A', 'Z'))}.
   */
  public static final CharMatcher JAVA_LETTER = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isLetter(c);
    }
  };

  /**
   * Determines whether a character is a letter or digit according to {@link
   * Character#isLetterOrDigit(char) Java's definition}.
   */
  public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isLetterOrDigit(c);
    }
  };

  /**
   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
   * Java's definition}.
   */
  public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isUpperCase(c);
    }
  };

  /**
   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
   * Java's definition}.
   */
  public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isLowerCase(c);
    }
  };

  /**
   * Determines whether a character is an ISO control character as specified by {@link
   * Character#isISOControl(char)}.
   */
  public static final CharMatcher JAVA_ISO_CONTROL =
      inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'));

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   */
  public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
      .or(inRange('\u007f', '\u00a0'))
      .or(is('\u00ad'))
      .or(inRange('\u0600', '\u0603'))
      .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
      .or(inRange('\u2000', '\u200f'))
      .or(inRange('\u2028', '\u202f'))
      .or(inRange('\u205f', '\u2064'))
      .or(inRange('\u206a', '\u206f'))
      .or(is('\u3000'))
      .or(inRange('\ud800', '\uf8ff'))
      .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
      .precomputed();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
   * date.
   */
  public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
      .or(is('\u05be'))
      .or(inRange('\u05d0', '\u05ea'))
      .or(is('\u05f3'))
      .or(is('\u05f4'))
      .or(inRange('\u0600', '\u06ff'))
      .or(inRange('\u0750', '\u077f'))
      .or(inRange('\u0e00', '\u0e7f'))
      .or(inRange('\u1e00', '\u20af'))
      .or(inRange('\u2100', '\u213a'))
      .or(inRange('\ufb50', '\ufdff'))
      .or(inRange('\ufe70', '\ufeff'))
      .or(inRange('\uff61', '\uffdc'))
      .precomputed();

  /** Matches any character. */
  public static final CharMatcher ANY =
      new CharMatcher() {
        @Override public boolean matches(char c) {
          return true;
        }

        @Override public int indexIn(CharSequence sequence) {
          return (sequence.length() == 0) ? -1 : 0;
        }

        @Override public int indexIn(CharSequence sequence, int start) {
          int length = sequence.length();
          Preconditions.checkPositionIndex(start, length);
          return (start == length) ? -1 : start;
        }

        @Override public int lastIndexIn(CharSequence sequence) {
          // Yields -1 for an empty sequence, which is exactly the "no match" result.
          return sequence.length() - 1;
        }

        @Override public boolean matchesAllOf(CharSequence sequence) {
          checkNotNull(sequence);
          return true;
        }

        @Override public boolean matchesNoneOf(CharSequence sequence) {
          return sequence.length() == 0;
        }

        @Override public String removeFrom(CharSequence sequence) {
          checkNotNull(sequence);
          return "";
        }

        @Override public String replaceFrom(CharSequence sequence, char replacement) {
          char[] array = new char[sequence.length()];
          Arrays.fill(array, replacement);
          return new String(array);
        }

        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
          for (int i = 0; i < sequence.length(); i++) {
            retval.append(replacement);
          }
          return retval.toString();
        }

        @Override public String collapseFrom(CharSequence sequence, char replacement) {
          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
        }

        @Override public String trimFrom(CharSequence sequence) {
          checkNotNull(sequence);
          return "";
        }

        @Override public int countIn(CharSequence sequence) {
          return sequence.length();
        }

        @Override public CharMatcher and(CharMatcher other) {
          return checkNotNull(other);
        }

        @Override public CharMatcher or(CharMatcher other) {
          checkNotNull(other);
          return this;
        }

        @Override public CharMatcher negate() {
          return NONE;
        }

        @Override public CharMatcher precomputed() {
          return this;
        }
      };

  /** Matches no characters. */
  public static final CharMatcher NONE =
      new CharMatcher() {
        @Override public boolean matches(char c) {
          return false;
        }

        @Override public int indexIn(CharSequence sequence) {
          checkNotNull(sequence);
          return -1;
        }

        @Override public int indexIn(CharSequence sequence, int start) {
          int length = sequence.length();
          Preconditions.checkPositionIndex(start, length);
          return -1;
        }

        @Override public int lastIndexIn(CharSequence sequence) {
          checkNotNull(sequence);
          return -1;
        }

        @Override public boolean matchesAllOf(CharSequence sequence) {
          return sequence.length() == 0;
        }

        @Override public boolean matchesNoneOf(CharSequence sequence) {
          checkNotNull(sequence);
          return true;
        }

        @Override public String removeFrom(CharSequence sequence) {
          return sequence.toString();
        }

        @Override public String replaceFrom(CharSequence sequence, char replacement) {
          return sequence.toString();
        }

        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
          checkNotNull(replacement);
          return sequence.toString();
        }

        @Override public String collapseFrom(CharSequence sequence, char replacement) {
          return sequence.toString();
        }

        @Override public String trimFrom(CharSequence sequence) {
          return sequence.toString();
        }

        @Override public int countIn(CharSequence sequence) {
          checkNotNull(sequence);
          return 0;
        }

        @Override public CharMatcher and(CharMatcher other) {
          checkNotNull(other);
          return this;
        }

        @Override public CharMatcher or(CharMatcher other) {
          return checkNotNull(other);
        }

        @Override public CharMatcher negate() {
          return ANY;
        }

        // Nothing matches, so there are no bits to set.
        @Override void setBits(LookupTable table) {}

        @Override public CharMatcher precomputed() {
          return this;
        }
      };

  // Static factories

  /**
   * Returns a {@code char} matcher that matches only one specified character.
   */
  public static CharMatcher is(final char match) {
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return c == match;
      }

      @Override public String replaceFrom(CharSequence sequence, char replacement) {
        return sequence.toString().replace(match, replacement);
      }

      @Override public CharMatcher and(CharMatcher other) {
        return other.matches(match) ? this : NONE;
      }

      @Override public CharMatcher or(CharMatcher other) {
        return other.matches(match) ? other : super.or(other);
      }

      @Override public CharMatcher negate() {
        return isNot(match);
      }

      @Override void setBits(LookupTable table) {
        table.set(match);
      }

      @Override public CharMatcher precomputed() {
        return this;
      }
    };
  }

  /**
   * Returns a {@code char} matcher that matches any character except the one specified.
   *
   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
   */
  public static CharMatcher isNot(final char match) {
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return c != match;
      }

      @Override public CharMatcher and(CharMatcher other) {
        return other.matches(match) ? super.and(other) : other;
      }

      @Override public CharMatcher or(CharMatcher other) {
        return other.matches(match) ? ANY : this;
      }

      @Override public CharMatcher negate() {
        return is(match);
      }
    };
  }

  /**
   * Returns a {@code char} matcher that matches any character present in the given character
   * sequence.
   */
  public static CharMatcher anyOf(final CharSequence sequence) {
    // Sequences of length 0, 1 and 2 get dedicated matchers; longer ones fall out of the switch
    // to the sorted-array implementation below.
    switch (sequence.length()) {
      case 0:
        return NONE;
      case 1:
        return is(sequence.charAt(0));
      case 2:
        final char match1 = sequence.charAt(0);
        final char match2 = sequence.charAt(1);
        return new CharMatcher() {
          @Override public boolean matches(char c) {
            return c == match1 || c == match2;
          }

          @Override void setBits(LookupTable table) {
            table.set(match1);
            table.set(match2);
          }

          @Override public CharMatcher precomputed() {
            return this;
          }
        };
    }

    final char[] chars = sequence.toString().toCharArray();
    Arrays.sort(chars); // not worth collapsing duplicates

    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return Arrays.binarySearch(chars, c) >= 0;
      }

      @Override void setBits(LookupTable table) {
        for (char c : chars) {
          table.set(c);
        }
      }
    };
  }

  /**
   * Returns a {@code char} matcher that matches any character not present in the given character
   * sequence.
   */
  public static CharMatcher noneOf(CharSequence sequence) {
    return anyOf(sequence).negate();
  }

  /**
   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
   * CharMatcher.inRange('a', 'z')}.
   *
   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
   */
  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
    checkArgument(endInclusive >= startInclusive);
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return startInclusive <= c && c <= endInclusive;
      }

      @Override void setBits(LookupTable table) {
        char c = startInclusive;
        // Test-then-increment so the loop also terminates when endInclusive is the maximum char
        // value (a plain "c <= endInclusive" condition would wrap around and never end).
        while (true) {
          table.set(c);
          if (c++ == endInclusive) {
            break;
          }
        }
      }

      @Override public CharMatcher precomputed() {
        return this;
      }
    };
  }

  /**
   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
   * which operates on primitive {@code char} instances instead.
   */
  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
    checkNotNull(predicate);
    if (predicate instanceof CharMatcher) {
      return (CharMatcher) predicate;
    }
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return predicate.apply(c);
      }

      @Override public boolean apply(Character character) {
        return predicate.apply(checkNotNull(character));
      }
    };
  }

  // Abstract methods

  /** Determines a true or false value for the given character. */
  public abstract boolean matches(char c);

  // Non-static factories

  /**
   * Returns a matcher that matches any character not matched by this matcher.
   */
  public CharMatcher negate() {
    final CharMatcher original = this;
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return !original.matches(c);
      }

      @Override public boolean matchesAllOf(CharSequence sequence) {
        return original.matchesNoneOf(sequence);
      }

      @Override public boolean matchesNoneOf(CharSequence sequence) {
        return original.matchesAllOf(sequence);
      }

      @Override public int countIn(CharSequence sequence) {
        return sequence.length() - original.countIn(sequence);
      }

      @Override public CharMatcher negate() {
        return original;
      }
    };
  }

  /**
   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
   */
  public CharMatcher and(CharMatcher other) {
    return new And(Arrays.asList(this, checkNotNull(other)));
  }

  /** Matches a character only if every component matcher matches it. */
  private static class And extends CharMatcher {
    private final List<CharMatcher> components;

    And(List<CharMatcher> components) {
      this.components = components; // Skip defensive copy (private)
    }

    @Override public boolean matches(char c) {
      for (CharMatcher matcher : components) {
        if (!matcher.matches(c)) {
          return false;
        }
      }
      return true;
    }

    @Override public CharMatcher and(CharMatcher other) {
      // Flatten into one component list instead of nesting And instances.
      List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
      newComponents.add(checkNotNull(other));
      return new And(newComponents);
    }
  }

  /**
   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
   */
  public CharMatcher or(CharMatcher other) {
    return new Or(Arrays.asList(this, checkNotNull(other)));
  }

  /** Matches a character if at least one component matcher matches it. */
  private static class Or extends CharMatcher {
    private final List<CharMatcher> components;

    Or(List<CharMatcher> components) {
      this.components = components; // Skip defensive copy (private)
    }

    @Override public boolean matches(char c) {
      for (CharMatcher matcher : components) {
        if (matcher.matches(c)) {
          return true;
        }
      }
      return false;
    }

    @Override public CharMatcher or(CharMatcher other) {
      // Flatten into one component list instead of nesting Or instances.
      List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
      newComponents.add(checkNotNull(other));
      return new Or(newComponents);
    }

    @Override void setBits(LookupTable table) {
      // The union of the components' bits is exactly the set this matcher accepts.
      for (CharMatcher matcher : components) {
        matcher.setBits(table);
      }
    }
  }

  /**
   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
   * worthwhile only if the precomputed matcher is queried many thousands of times.
   *
   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
   * worthwhile tradeoff in a browser.
   */
  public CharMatcher precomputed() {
    return Platform.precomputeCharMatcher(this);
  }

  /**
   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
   * on {@link Platform} so that we can have different behavior in GWT.
   *
   * <p>The default precomputation is to cache the configuration of the original matcher in an
   * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
   * than the original.
   *
   * <p>The default implementation creates a new bit array and passes it to {@link
   * #setBits(LookupTable)}.
   */
  CharMatcher precomputedInternal() {
    final LookupTable table = new LookupTable();
    setBits(table);

    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return table.get(c);
      }

      // TODO(kevinb): make methods like negate() smart?

      @Override public CharMatcher precomputed() {
        return this;
      }
    };
  }

  /**
   * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
   * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
   *
   * <p>The default implementation loops over every possible character value, invoking {@link
   * #matches} for each one.
   */
  void setBits(LookupTable table) {
    char c = Character.MIN_VALUE;
    // Test-then-increment: a "c <= Character.MAX_VALUE" loop condition would always be true.
    while (true) {
      if (matches(c)) {
        table.set(c);
      }
      if (c++ == Character.MAX_VALUE) {
        break;
      }
    }
  }

  /**
   * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
   *
   * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
   * simpler java.util.BitSet.
   */
  private static final class LookupTable {
    // 2048 ints * 32 bits = 65536 bits, one per possible char value.
    private final int[] data = new int[2048];

    void set(char index) {
      // index >> 5 picks the word; the int shift operator uses only the low five bits of its
      // right-hand operand, so (1 << index) picks the bit within that word.
      data[index >> 5] |= (1 << index);
    }

    boolean get(char index) {
      return (data[index >> 5] & (1 << index)) != 0;
    }
  }

  // Text processing routines

  /**
   * Returns {@code true} if a character sequence contains at least one matching character.
   * Equivalent to {@code !matchesNoneOf(sequence)}.
   *
   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
   * character, until this returns {@code true} or the end is reached.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches at least one character in the sequence
   * @since 8
   */
  public boolean matchesAnyOf(CharSequence sequence) {
    return !matchesNoneOf(sequence);
  }

  /**
   * Returns {@code true} if a character sequence contains only matching characters.
   *
   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
   * character, until this returns {@code false} or the end is reached.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches every character in the sequence, including when
   *     the sequence is empty
   */
  public boolean matchesAllOf(CharSequence sequence) {
    for (int i = sequence.length() - 1; i >= 0; i--) {
      if (!matches(sequence.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
   * {@code !matchesAnyOf(sequence)}.
   *
   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
   * character, until this returns {@code true} or the end is reached.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches no characters in the sequence, including when
   *     the sequence is empty
   */
  public boolean matchesNoneOf(CharSequence sequence) {
    return indexIn(sequence) == -1;
  }

  /**
   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
   * matching character is present.
   *
   * <p>The default implementation iterates over the sequence in forward order calling {@link
   * #matches} for each character.
   *
   * @param sequence the character sequence to examine from the beginning
   * @return an index, or {@code -1} if no character matches
   */
  public int indexIn(CharSequence sequence) {
    int length = sequence.length();
    for (int i = 0; i < length; i++) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the index of the first matching character in a character sequence, starting from a
   * given position, or {@code -1} if no character matches after that position.
   *
   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
   * start}, calling {@link #matches} for each character.
   *
   * @param sequence the character sequence to examine
   * @param start the first index to examine; must be nonnegative and no greater than {@code
   *     sequence.length()}
   * @return the index of the first matching character, guaranteed to be no less than {@code start},
   *     or {@code -1} if no character matches
   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
   *     sequence.length()}
   */
  public int indexIn(CharSequence sequence, int start) {
    int length = sequence.length();
    Preconditions.checkPositionIndex(start, length);
    for (int i = start; i < length; i++) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
   * matching character is present.
   *
   * <p>The default implementation iterates over the sequence in reverse order calling {@link
   * #matches} for each character.
   *
   * @param sequence the character sequence to examine from the end
   * @return an index, or {@code -1} if no character matches
   */
  public int lastIndexIn(CharSequence sequence) {
    for (int i = sequence.length() - 1; i >= 0; i--) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the number of matching characters found in a character sequence.
   *
   * @param sequence the character sequence to examine
   * @return how many characters of {@code sequence} this matcher matches
   */
  public int countIn(CharSequence sequence) {
    int count = 0;
    for (int i = 0; i < sequence.length(); i++) {
      if (matches(sequence.charAt(i))) {
        count++;
      }
    }
    return count;
  }

  /**
   * Returns a string containing all non-matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
   *
   * ... returns {@code "bzr"}.
   */
  public String removeFrom(CharSequence sequence) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }

    // Compact the array in place: "spread" is the number of matching characters seen so far, i.e.
    // how far each surviving character has to be shifted left.
    char[] chars = string.toCharArray();
    int spread = 1;

    // This unusual loop comes from extensive benchmarking
    OUT: while (true) {
      pos++;
      while (true) {
        if (pos == chars.length) {
          break OUT;
        }
        if (matches(chars[pos])) {
          break;
        }
        chars[pos - spread] = chars[pos];
        pos++;
      }
      spread++;
    }
    return new String(chars, 0, pos - spread);
  }

  /**
   * Returns a string containing all matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
   *
   * ... returns {@code "aaa"}.
   */
  public String retainFrom(CharSequence sequence) {
    return negate().removeFrom(sequence);
  }

  /**
   * Returns a string copy of the input character sequence, with each character that matches this
   * matcher replaced by a given replacement character. For example: <pre>   {@code
   *
   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
   *
   * ... returns {@code "rodor"}.
   *
   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
   * character.
   *
   * @param sequence the character sequence to replace matching characters in
   * @param replacement the character to append to the result string in place of each matching
   *     character in {@code sequence}
   * @return the new string
   */
  public String replaceFrom(CharSequence sequence, char replacement) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }
    char[] chars = string.toCharArray();
    chars[pos] = replacement;
    for (int i = pos + 1; i < chars.length; i++) {
      if (matches(chars[i])) {
        chars[i] = replacement;
      }
    }
    return new String(chars);
  }

  /**
   * Returns a string copy of the input character sequence, with each character that matches this
   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
   *
   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
   *
   * ... returns {@code "yoohoo"}.
   *
   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
   * off calling {@link #replaceFrom(CharSequence, char)} directly.
   *
   * @param sequence the character sequence to replace matching characters in
   * @param replacement the characters to append to the result string in place of each matching
   *     character in {@code sequence}
   * @return the new string
   */
  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    int replacementLen = replacement.length();
    if (replacementLen == 0) {
      return removeFrom(sequence);
    }
    if (replacementLen == 1) {
      return replaceFrom(sequence, replacement.charAt(0));
    }

    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }

    int len = string.length();
    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);

    int oldpos = 0;
    do {
      buf.append(string, oldpos, pos);
      buf.append(replacement);
      oldpos = pos + 1;
      pos = indexIn(string, oldpos);
    } while (pos != -1);

    buf.append(string, oldpos, len);
    return buf.toString();
  }

  /**
   * Returns a substring of the input character sequence that omits all characters this matcher
   * matches from the beginning and from the end of the string. For example: <pre>   {@code
   *
   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
   *
   * ... returns {@code "cat"}.
   *
   * <p>Note that: <pre>   {@code
   *
   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
   *
   * ... is equivalent to {@link String#trim()}.
   */
  public String trimFrom(CharSequence sequence) {
    int len = sequence.length();
    int first;
    int last;

    for (first = 0; first < len; first++) {
      if (!matches(sequence.charAt(first))) {
        break;
      }
    }
    // When every character matches, first == len and this loop is skipped, so the result below
    // is the empty subsequence [len, len).
    for (last = len - 1; last > first; last--) {
      if (!matches(sequence.charAt(last))) {
        break;
      }
    }

    return sequence.subSequence(first, last + 1).toString();
  }

  /**
   * Returns a substring of the input character sequence that omits all characters this matcher
   * matches from the beginning of the string. For example: <pre>   {@code
   *
   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
   *
   * ... returns {@code "catbab"}.
   */
  public String trimLeadingFrom(CharSequence sequence) {
    int len = sequence.length();
    int first;

    for (first = 0; first < len; first++) {
      if (!matches(sequence.charAt(first))) {
        break;
      }
    }

    return sequence.subSequence(first, len).toString();
  }

  /**
   * Returns a substring of the input character sequence that omits all characters this matcher
   * matches from the end of the string. For example: <pre>   {@code
   *
   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
   *
   * ... returns {@code "abacat"}.
   */
  public String trimTrailingFrom(CharSequence sequence) {
    int len = sequence.length();
    int last;

    for (last = len - 1; last >= 0; last--) {
      if (!matches(sequence.charAt(last))) {
        break;
      }
    }

    return sequence.subSequence(0, last + 1).toString();
  }

  /**
   * Returns a string copy of the input character sequence, with each group of consecutive
   * characters that match this matcher replaced by a single replacement character. For example:
   * <pre>   {@code
   *
   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
   *
   * ... returns {@code "b-p-r"}.
   *
   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
   * character.
   *
   * @param sequence the character sequence to replace matching groups of characters in
   * @param replacement the character to append to the result string in place of each group of
   *     matching characters in {@code sequence}
   * @return the new string
   */
  public String collapseFrom(CharSequence sequence, char replacement) {
    int first = indexIn(sequence);
    if (first == -1) {
      return sequence.toString();
    }

    // TODO(kevinb): see if this implementation can be made faster
    StringBuilder builder = new StringBuilder(sequence.length())
        .append(sequence.subSequence(0, first))
        .append(replacement);
    boolean in = true; // true while inside a run of matching characters
    for (int i = first + 1; i < sequence.length(); i++) {
      char c = sequence.charAt(i);
      // Call matches(char) directly; apply(Character) would box every character.
      if (matches(c)) {
        if (!in) {
          builder.append(replacement);
          in = true;
        }
      } else {
        builder.append(c);
        in = false;
      }
    }
    return builder.toString();
  }

  /**
   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
   * groups of matching characters at the start or end of the sequence are removed without
   * replacement.
   *
   * @param sequence the character sequence to trim and collapse
   * @param replacement the character to append to the result string in place of each interior
   *     group of matching characters in {@code sequence}
   * @return the new string
   */
  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
    int first = negate().indexIn(sequence);
    if (first == -1) {
      return ""; // everything matches. nothing's left.
    }
    StringBuilder builder = new StringBuilder(sequence.length());
    boolean inMatchingGroup = false;
    for (int i = first; i < sequence.length(); i++) {
      char c = sequence.charAt(i);
      // Call matches(char) directly; apply(Character) would box every character.
      if (matches(c)) {
        inMatchingGroup = true;
      } else {
        // The replacement is emitted lazily, only once a non-matching character follows the
        // group, so a trailing group of matching characters is dropped.
        if (inMatchingGroup) {
          builder.append(replacement);
          inMatchingGroup = false;
        }
        builder.append(c);
      }
    }
    return builder.toString();
  }

  // Predicate interface

  /**
   * Returns {@code true} if this matcher matches the given character.
   *
   * @throws NullPointerException if {@code character} is null
   */
  @Override public boolean apply(Character character) {
    return matches(character);
  }
}