/*
 * Copyright (C) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
 * for any {@link Object}. Also offers basic text processing methods based on this function.
 * Implementations are strongly encouraged to be side-effect-free and immutable.
 *
 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
 *
 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
 * treats these just as two separate characters.
 *
 * @author Kevin Bourrillion
 * @since 1
 */
@Beta // Possibly change from chars to code points; decide constants vs. methods
@GwtCompatible
public abstract class CharMatcher implements Predicate<Character> {
  // Constants

  // Excludes 2000-2000a, which is handled as a range
  private static final String BREAKING_WHITESPACE_CHARS =
      "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000";

  // Excludes 2007, which is handled as a gap in a pair of ranges
  private static final String NON_BREAKING_WHITESPACE_CHARS =
      "\u00a0\u180e\u202f";

  /**
   * Determines whether a character is whitespace according to the latest Unicode standard, as
   * illustrated
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
   * definitions of "whitespace"</a>.)
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
   * to date.
   */
  public static final CharMatcher WHITESPACE =
      anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS)
          .or(inRange('\u2000', '\u200a'))
          .precomputed();

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
   * discussion of that term.
   *
   * @since 2
   */
  public static final CharMatcher BREAKING_WHITESPACE =
      anyOf(BREAKING_WHITESPACE_CHARS)
          .or(inRange('\u2000', '\u2006'))
          .or(inRange('\u2008', '\u200a'))
          .precomputed();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   */
  public static final CharMatcher ASCII = inRange('\0', '\u007f');

  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
   */
  public static final CharMatcher DIGIT;

  static {
    CharMatcher digit = inRange('0', '9');
    // Each entry is the zero digit of a decimal digit block; the block spans base..base+9.
    String zeroes =
        "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
            + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
            + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
    for (char base : zeroes.toCharArray()) {
      digit = digit.or(inRange(base, (char) (base + 9)));
    }
    DIGIT = digit.precomputed();
  }

  /**
   * Determines whether a character is whitespace according to {@link Character#isWhitespace(char)
   * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a
   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
   * definitions of "whitespace"</a>.)
   */
  public static final CharMatcher JAVA_WHITESPACE =
      inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal
          .or(inRange('\u001c', '\u0020'))
          .or(is('\u1680'))
          .or(is('\u180e'))
          .or(inRange('\u2000', '\u2006'))
          .or(inRange('\u2008', '\u200b'))
          .or(inRange('\u2028', '\u2029'))
          .or(is('\u205f'))
          .or(is('\u3000'))
          .precomputed();

  /**
   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
   */
  public static final CharMatcher JAVA_DIGIT = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isDigit(c);
    }
  };

  /**
   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
   * inRange('a', 'z').or(inRange('A', 'Z'))}.
   */
  public static final CharMatcher JAVA_LETTER = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isLetter(c);
    }
  };

  /**
   * Determines whether a character is a letter or digit according to {@link
   * Character#isLetterOrDigit(char) Java's definition}.
   */
  public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isLetterOrDigit(c);
    }
  };

  /**
   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
   * Java's definition}.
   */
  public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isUpperCase(c);
    }
  };

  /**
   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
   * Java's definition}.
   */
  public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() {
    @Override public boolean matches(char c) {
      return Character.isLowerCase(c);
    }
  };

  /**
   * Determines whether a character is an ISO control character as specified by {@link
   * Character#isISOControl(char)}.
   */
  public static final CharMatcher JAVA_ISO_CONTROL =
      inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f'));

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   */
  public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
      .or(inRange('\u007f', '\u00a0'))
      .or(is('\u00ad'))
      .or(inRange('\u0600', '\u0603'))
      .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e"))
      .or(inRange('\u2000', '\u200f'))
      .or(inRange('\u2028', '\u202f'))
      .or(inRange('\u205f', '\u2064'))
      .or(inRange('\u206a', '\u206f'))
      .or(is('\u3000'))
      .or(inRange('\ud800', '\uf8ff'))
      .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
      .precomputed();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
   * date.
   */
  public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
      .or(is('\u05be'))
      .or(inRange('\u05d0', '\u05ea'))
      .or(is('\u05f3'))
      .or(is('\u05f4'))
      .or(inRange('\u0600', '\u06ff'))
      .or(inRange('\u0750', '\u077f'))
      .or(inRange('\u0e00', '\u0e7f'))
      .or(inRange('\u1e00', '\u20af'))
      .or(inRange('\u2100', '\u213a'))
      .or(inRange('\ufb50', '\ufdff'))
      .or(inRange('\ufe70', '\ufeff'))
      .or(inRange('\uff61', '\uffdc'))
      .precomputed();

  /** Matches any character. */
  public static final CharMatcher ANY =
      new CharMatcher() {
        @Override public boolean matches(char c) {
          return true;
        }

        @Override public int indexIn(CharSequence sequence) {
          return (sequence.length() == 0) ? -1 : 0;
        }

        @Override public int indexIn(CharSequence sequence, int start) {
          int length = sequence.length();
          Preconditions.checkPositionIndex(start, length);
          return (start == length) ? -1 : start;
        }

        @Override public int lastIndexIn(CharSequence sequence) {
          return sequence.length() - 1;
        }

        @Override public boolean matchesAllOf(CharSequence sequence) {
          checkNotNull(sequence);
          return true;
        }

        @Override public boolean matchesNoneOf(CharSequence sequence) {
          return sequence.length() == 0;
        }

        @Override public String removeFrom(CharSequence sequence) {
          checkNotNull(sequence);
          return "";
        }

        @Override public String replaceFrom(CharSequence sequence, char replacement) {
          char[] array = new char[sequence.length()];
          Arrays.fill(array, replacement);
          return new String(array);
        }

        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
          for (int i = 0; i < sequence.length(); i++) {
            retval.append(replacement);
          }
          return retval.toString();
        }

        @Override public String collapseFrom(CharSequence sequence, char replacement) {
          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
        }

        @Override public String trimFrom(CharSequence sequence) {
          checkNotNull(sequence);
          return "";
        }

        @Override public int countIn(CharSequence sequence) {
          return sequence.length();
        }

        @Override public CharMatcher and(CharMatcher other) {
          return checkNotNull(other);
        }

        @Override public CharMatcher or(CharMatcher other) {
          checkNotNull(other);
          return this;
        }

        @Override public CharMatcher negate() {
          return NONE;
        }

        @Override public CharMatcher precomputed() {
          return this;
        }
      };

  /** Matches no characters. */
  public static final CharMatcher NONE =
      new CharMatcher() {
        @Override public boolean matches(char c) {
          return false;
        }

        @Override public int indexIn(CharSequence sequence) {
          checkNotNull(sequence);
          return -1;
        }

        @Override public int indexIn(CharSequence sequence, int start) {
          int length = sequence.length();
          Preconditions.checkPositionIndex(start, length);
          return -1;
        }

        @Override public int lastIndexIn(CharSequence sequence) {
          checkNotNull(sequence);
          return -1;
        }

        @Override public boolean matchesAllOf(CharSequence sequence) {
          return sequence.length() == 0;
        }

        @Override public boolean matchesNoneOf(CharSequence sequence) {
          checkNotNull(sequence);
          return true;
        }

        @Override public String removeFrom(CharSequence sequence) {
          return sequence.toString();
        }

        @Override public String replaceFrom(CharSequence sequence, char replacement) {
          return sequence.toString();
        }

        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
          checkNotNull(replacement);
          return sequence.toString();
        }

        @Override public String collapseFrom(CharSequence sequence, char replacement) {
          return sequence.toString();
        }

        @Override public String trimFrom(CharSequence sequence) {
          return sequence.toString();
        }

        @Override public int countIn(CharSequence sequence) {
          checkNotNull(sequence);
          return 0;
        }

        @Override public CharMatcher and(CharMatcher other) {
          checkNotNull(other);
          return this;
        }

        @Override public CharMatcher or(CharMatcher other) {
          return checkNotNull(other);
        }

        @Override public CharMatcher negate() {
          return ANY;
        }

        // Nothing matches, so there are no bits to set.
        @Override void setBits(LookupTable table) {}

        @Override public CharMatcher precomputed() {
          return this;
        }
      };

  // Static factories

  /**
   * Returns a {@code char} matcher that matches only one specified character.
   */
  public static CharMatcher is(final char match) {
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return c == match;
      }

      @Override public String replaceFrom(CharSequence sequence, char replacement) {
        return sequence.toString().replace(match, replacement);
      }

      @Override public CharMatcher and(CharMatcher other) {
        return other.matches(match) ? this : NONE;
      }

      @Override public CharMatcher or(CharMatcher other) {
        return other.matches(match) ? other : super.or(other);
      }

      @Override public CharMatcher negate() {
        return isNot(match);
      }

      @Override void setBits(LookupTable table) {
        table.set(match);
      }

      @Override public CharMatcher precomputed() {
        return this;
      }
    };
  }

  /**
   * Returns a {@code char} matcher that matches any character except the one specified.
   *
   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
   */
  public static CharMatcher isNot(final char match) {
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return c != match;
      }

      @Override public CharMatcher and(CharMatcher other) {
        return other.matches(match) ? super.and(other) : other;
      }

      @Override public CharMatcher or(CharMatcher other) {
        return other.matches(match) ? ANY : this;
      }

      @Override public CharMatcher negate() {
        return is(match);
      }
    };
  }

  /**
   * Returns a {@code char} matcher that matches any character present in the given character
   * sequence.
   */
  public static CharMatcher anyOf(final CharSequence sequence) {
    switch (sequence.length()) {
      case 0:
        return NONE;
      case 1:
        return is(sequence.charAt(0));
      case 2:
        final char match1 = sequence.charAt(0);
        final char match2 = sequence.charAt(1);
        return new CharMatcher() {
          @Override public boolean matches(char c) {
            return c == match1 || c == match2;
          }

          @Override void setBits(LookupTable table) {
            table.set(match1);
            table.set(match2);
          }

          @Override public CharMatcher precomputed() {
            return this;
          }
        };
    }

    // Three or more characters: keep a sorted copy so matches() can binary-search it.
    final char[] chars = sequence.toString().toCharArray();
    Arrays.sort(chars); // not worth collapsing duplicates

    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return Arrays.binarySearch(chars, c) >= 0;
      }

      @Override void setBits(LookupTable table) {
        for (char c : chars) {
          table.set(c);
        }
      }
    };
  }

  /**
   * Returns a {@code char} matcher that matches any character not present in the given character
   * sequence.
   */
  public static CharMatcher noneOf(CharSequence sequence) {
    return anyOf(sequence).negate();
  }

  /**
   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
   * CharMatcher.inRange('a', 'z')}.
   *
   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
   */
  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
    checkArgument(endInclusive >= startInclusive);
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return startInclusive <= c && c <= endInclusive;
      }

      @Override void setBits(LookupTable table) {
        char c = startInclusive;
        // Compare before the increment takes effect so that an end of Character.MAX_VALUE
        // terminates instead of wrapping around to zero.
        while (true) {
          table.set(c);
          if (c++ == endInclusive) {
            break;
          }
        }
      }

      @Override public CharMatcher precomputed() {
        return this;
      }
    };
  }

  /**
   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
   * which operates on primitive {@code char} instances instead.
   */
  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
    checkNotNull(predicate);
    if (predicate instanceof CharMatcher) {
      return (CharMatcher) predicate;
    }
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return predicate.apply(c);
      }

      @Override public boolean apply(Character character) {
        return predicate.apply(checkNotNull(character));
      }
    };
  }

  // Abstract methods

  /** Determines a true or false value for the given character. */
  public abstract boolean matches(char c);

  // Non-static factories

  /**
   * Returns a matcher that matches any character not matched by this matcher.
   */
  public CharMatcher negate() {
    final CharMatcher original = this;
    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return !original.matches(c);
      }

      @Override public boolean matchesAllOf(CharSequence sequence) {
        return original.matchesNoneOf(sequence);
      }

      @Override public boolean matchesNoneOf(CharSequence sequence) {
        return original.matchesAllOf(sequence);
      }

      @Override public int countIn(CharSequence sequence) {
        return sequence.length() - original.countIn(sequence);
      }

      @Override public CharMatcher negate() {
        return original;
      }
    };
  }

  /**
   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
   */
  public CharMatcher and(CharMatcher other) {
    return new And(Arrays.asList(this, checkNotNull(other)));
  }

  /** Conjunction of matchers: matches a character only if every component matches it. */
  private static class And extends CharMatcher {
    private final List<CharMatcher> components;

    And(List<CharMatcher> components) {
      this.components = components; // Skip defensive copy (private)
    }

    @Override public boolean matches(char c) {
      for (CharMatcher matcher : components) {
        if (!matcher.matches(c)) {
          return false;
        }
      }
      return true;
    }

    @Override public CharMatcher and(CharMatcher other) {
      // Flatten into a single And rather than nesting one And inside another.
      List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
      newComponents.add(checkNotNull(other));
      return new And(newComponents);
    }
  }

  /**
   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
   */
  public CharMatcher or(CharMatcher other) {
    return new Or(Arrays.asList(this, checkNotNull(other)));
  }

  /** Disjunction of matchers: matches a character if at least one component matches it. */
  private static class Or extends CharMatcher {
    private final List<CharMatcher> components;

    Or(List<CharMatcher> components) {
      this.components = components; // Skip defensive copy (private)
    }

    @Override public boolean matches(char c) {
      for (CharMatcher matcher : components) {
        if (matcher.matches(c)) {
          return true;
        }
      }
      return false;
    }

    @Override public CharMatcher or(CharMatcher other) {
      // Flatten into a single Or rather than nesting one Or inside another.
      List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components);
      newComponents.add(checkNotNull(other));
      return new Or(newComponents);
    }

    @Override void setBits(LookupTable table) {
      for (CharMatcher matcher : components) {
        matcher.setBits(table);
      }
    }
  }

  /**
   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
   * worthwhile only if the precomputed matcher is queried many thousands of times.
   *
   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
   * worthwhile tradeoff in a browser.
   */
  public CharMatcher precomputed() {
    return Platform.precomputeCharMatcher(this);
  }

  /**
   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
   * on {@link Platform} so that we can have different behavior in GWT.
   *
   * <p>The default precomputation is to cache the configuration of the original matcher in an
   * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query
   * than the original.
   *
   * <p>The default implementation creates a new bit array and passes it to {@link
   * #setBits(LookupTable)}.
   */
  CharMatcher precomputedInternal() {
    final LookupTable table = new LookupTable();
    setBits(table);

    return new CharMatcher() {
      @Override public boolean matches(char c) {
        return table.get(c);
      }

      // TODO(kevinb): make methods like negate() smart?

      @Override public CharMatcher precomputed() {
        return this;
      }
    };
  }

  /**
   * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
   * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
   *
   * <p>The default implementation loops over every possible character value, invoking {@link
   * #matches} for each one.
   */
  void setBits(LookupTable table) {
    char c = Character.MIN_VALUE;
    // Compare before the increment takes effect; a plain "c <= MAX_VALUE" loop condition would
    // never become false because char arithmetic wraps around.
    while (true) {
      if (matches(c)) {
        table.set(c);
      }
      if (c++ == Character.MAX_VALUE) {
        break;
      }
    }
  }

  /**
   * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
   *
   * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
   * simpler java.util.BitSet.
   */
  private static final class LookupTable {
    // 2048 ints * 32 bits = 65536 bits, one per char value.
    private final int[] data = new int[2048];

    void set(char index) {
      // index >> 5 selects the word; an int shift uses only the low five bits of its distance,
      // so (1 << index) selects the bit within that word.
      data[index >> 5] |= (1 << index);
    }

    boolean get(char index) {
      return (data[index >> 5] & (1 << index)) != 0;
    }
  }

  // Text processing routines

  /**
   * Returns {@code true} if a character sequence contains at least one matching character.
   * Equivalent to {@code !matchesNoneOf(sequence)}.
   *
   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
   * character, until this returns {@code true} or the end is reached.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches at least one character in the sequence
   * @since 8
   */
  public boolean matchesAnyOf(CharSequence sequence) {
    return !matchesNoneOf(sequence);
  }

  /**
   * Returns {@code true} if a character sequence contains only matching characters.
   *
   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
   * character, until this returns {@code false} or the end is reached.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches every character in the sequence, including when
   *     the sequence is empty
   */
  public boolean matchesAllOf(CharSequence sequence) {
    for (int i = sequence.length() - 1; i >= 0; i--) {
      if (!matches(sequence.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
   * {@code !matchesAnyOf(sequence)}.
   *
   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
   * character, until this returns {@code true} or the end is reached.
   *
   * @param sequence the character sequence to examine, possibly empty
   * @return {@code true} if this matcher matches no characters in the sequence, including when
   *     the sequence is empty
   */
  public boolean matchesNoneOf(CharSequence sequence) {
    return indexIn(sequence) == -1;
  }

  /**
   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
   * matching character is present.
   *
   * <p>The default implementation iterates over the sequence in forward order calling {@link
   * #matches} for each character.
   *
   * @param sequence the character sequence to examine from the beginning
   * @return an index, or {@code -1} if no character matches
   */
  public int indexIn(CharSequence sequence) {
    int length = sequence.length();
    for (int i = 0; i < length; i++) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the index of the first matching character in a character sequence, starting from a
   * given position, or {@code -1} if no character matches after that position.
   *
   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
   * start}, calling {@link #matches} for each character.
   *
   * @param sequence the character sequence to examine
   * @param start the first index to examine; must be nonnegative and no greater than {@code
   *     sequence.length()}
   * @return the index of the first matching character, guaranteed to be no less than {@code start},
   *     or {@code -1} if no character matches
   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
   *     sequence.length()}
   */
  public int indexIn(CharSequence sequence, int start) {
    int length = sequence.length();
    Preconditions.checkPositionIndex(start, length);
    for (int i = start; i < length; i++) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
   * matching character is present.
   *
   * <p>The default implementation iterates over the sequence in reverse order calling {@link
   * #matches} for each character.
   *
   * @param sequence the character sequence to examine from the end
   * @return an index, or {@code -1} if no character matches
   */
  public int lastIndexIn(CharSequence sequence) {
    for (int i = sequence.length() - 1; i >= 0; i--) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the number of matching characters found in a character sequence.
   */
  public int countIn(CharSequence sequence) {
    int count = 0;
    for (int i = 0; i < sequence.length(); i++) {
      if (matches(sequence.charAt(i))) {
        count++;
      }
    }
    return count;
  }

  /**
   * Returns a string containing all non-matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
   *
   * ... returns {@code "bzr"}.
   */
  public String removeFrom(CharSequence sequence) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }

    char[] chars = string.toCharArray();
    // Number of matching characters skipped so far, i.e. how far kept characters shift left.
    int spread = 1;

    // This unusual loop comes from extensive benchmarking
    OUT: while (true) {
      pos++;
      while (true) {
        if (pos == chars.length) {
          break OUT;
        }
        if (matches(chars[pos])) {
          break;
        }
        chars[pos - spread] = chars[pos];
        pos++;
      }
      spread++;
    }
    return new String(chars, 0, pos - spread);
  }

  /**
   * Returns a string containing all matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
   *
   * ... returns {@code "aaa"}.
   */
  public String retainFrom(CharSequence sequence) {
    return negate().removeFrom(sequence);
  }

  /**
   * Returns a string copy of the input character sequence, with each character that matches this
   * matcher replaced by a given replacement character. For example: <pre>   {@code
   *
   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
   *
   * ... returns {@code "rodor"}.
   *
   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
   * character.
   *
   * @param sequence the character sequence to replace matching characters in
   * @param replacement the character to append to the result string in place of each matching
   *     character in {@code sequence}
   * @return the new string
   */
  public String replaceFrom(CharSequence sequence, char replacement) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }
    char[] chars = string.toCharArray();
    chars[pos] = replacement;
    for (int i = pos + 1; i < chars.length; i++) {
      if (matches(chars[i])) {
        chars[i] = replacement;
      }
    }
    return new String(chars);
  }

  /**
   * Returns a string copy of the input character sequence, with each character that matches this
   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
   *
   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
   *
   * ... returns {@code "yoohoo"}.
   *
   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
   * off calling {@link #replaceFrom(CharSequence, char)} directly.
   *
   * @param sequence the character sequence to replace matching characters in
   * @param replacement the characters to append to the result string in place of each matching
   *     character in {@code sequence}
   * @return the new string
   */
  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    int replacementLen = replacement.length();
    if (replacementLen == 0) {
      return removeFrom(sequence);
    }
    if (replacementLen == 1) {
      return replaceFrom(sequence, replacement.charAt(0));
    }

    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }

    int len = string.length();
    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);

    int oldpos = 0;
    do {
      buf.append(string, oldpos, pos);
      buf.append(replacement);
      oldpos = pos + 1;
      pos = indexIn(string, oldpos);
    } while (pos != -1);

    buf.append(string, oldpos, len);
    return buf.toString();
  }

  /**
   * Returns a substring of the input character sequence that omits all characters this matcher
   * matches from the beginning and from the end of the string. For example: <pre>   {@code
   *
   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
   *
   * ... returns {@code "cat"}.
   *
   * <p>Note that: <pre>   {@code
   *
   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
   *
   * ... is equivalent to {@link String#trim()}.
   */
  public String trimFrom(CharSequence sequence) {
    int len = sequence.length();
    int first;
    int last;

    for (first = 0; first < len; first++) {
      if (!matches(sequence.charAt(first))) {
        break;
      }
    }
    for (last = len - 1; last > first; last--) {
      if (!matches(sequence.charAt(last))) {
        break;
      }
    }

    return sequence.subSequence(first, last + 1).toString();
  }

  /**
   * Returns a substring of the input character sequence that omits all characters this matcher
   * matches from the beginning of the string. For example: <pre> {@code
   *
   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
   *
   * ... returns {@code "catbab"}.
   */
  public String trimLeadingFrom(CharSequence sequence) {
    int len = sequence.length();
    int first;

    for (first = 0; first < len; first++) {
      if (!matches(sequence.charAt(first))) {
        break;
      }
    }

    return sequence.subSequence(first, len).toString();
  }

  /**
   * Returns a substring of the input character sequence that omits all characters this matcher
   * matches from the end of the string. For example: <pre> {@code
   *
   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
   *
   * ... returns {@code "abacat"}.
   */
  public String trimTrailingFrom(CharSequence sequence) {
    int len = sequence.length();
    int last;

    for (last = len - 1; last >= 0; last--) {
      if (!matches(sequence.charAt(last))) {
        break;
      }
    }

    return sequence.subSequence(0, last + 1).toString();
  }

  /**
   * Returns a string copy of the input character sequence, with each group of consecutive
   * characters that match this matcher replaced by a single replacement character. For example:
   * <pre>   {@code
   *
   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
   *
   * ... returns {@code "b-p-r"}.
   *
   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
   * character.
   *
   * @param sequence the character sequence to replace matching groups of characters in
   * @param replacement the character to append to the result string in place of each group of
   *     matching characters in {@code sequence}
   * @return the new string
   */
  public String collapseFrom(CharSequence sequence, char replacement) {
    int first = indexIn(sequence);
    if (first == -1) {
      return sequence.toString();
    }

    // TODO(kevinb): see if this implementation can be made faster
    StringBuilder builder = new StringBuilder(sequence.length())
        .append(sequence.subSequence(0, first))
        .append(replacement);
    boolean in = true;
    for (int i = first + 1; i < sequence.length(); i++) {
      char c = sequence.charAt(i);
      // Call matches(char) directly; apply(Character) would box every character.
      if (matches(c)) {
        if (!in) {
          builder.append(replacement);
          in = true;
        }
      } else {
        builder.append(c);
        in = false;
      }
    }
    return builder.toString();
  }

  /**
   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
   * groups of matching characters at the start or end of the sequence are removed without
   * replacement.
   */
  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
    int first = negate().indexIn(sequence);
    if (first == -1) {
      return ""; // everything matches. nothing's left.
    }
    StringBuilder builder = new StringBuilder(sequence.length());
    boolean inMatchingGroup = false;
    for (int i = first; i < sequence.length(); i++) {
      char c = sequence.charAt(i);
      // Call matches(char) directly; apply(Character) would box every character.
      if (matches(c)) {
        inMatchingGroup = true;
      } else {
        // The replacement is emitted lazily, only once a non-matching character follows the
        // group, so a trailing group of matching characters is dropped.
        if (inMatchingGroup) {
          builder.append(replacement);
          inMatchingGroup = false;
        }
        builder.append(c);
      }
    }
    return builder.toString();
  }

  // Predicate interface

  /**
   * Returns {@code true} if this matcher matches the given character.
   *
   * @throws NullPointerException if {@code character} is null
   */
  @Override public boolean apply(Character character) {
    return matches(character);
  }
}