001 /* 002 * Copyright (C) 2008 Google Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the 010 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 011 * express or implied. See the License for the specific language governing permissions and 012 * limitations under the License. 013 */ 014 015 package com.google.common.base; 016 017 import static com.google.common.base.Preconditions.checkArgument; 018 import static com.google.common.base.Preconditions.checkNotNull; 019 020 import com.google.common.annotations.Beta; 021 import com.google.common.annotations.GwtCompatible; 022 023 import java.util.ArrayList; 024 import java.util.Arrays; 025 import java.util.List; 026 027 /** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 036 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 037 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 038 * treats these just as two separate characters. 039 * 040 * @author Kevin Bourrillion 041 * @since 1 042 */ 043 // TODO: release as "stable" after changing from chars to code points, and 044 // deciding whether constants should change to methods 045 @Beta 046 @GwtCompatible 047 public abstract class CharMatcher implements Predicate<Character> { 048 // Constants 049 050 // Excludes 2000-2000a, which is handled as a range 051 private static final String BREAKING_WHITESPACE_CHARS = 052 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000"; 053 054 // Excludes 2007, which is handled as a gap in a pair of ranges 055 private static final String NON_BREAKING_WHITESPACE_CHARS = 056 "\u00a0\u180e\u202f"; 057 058 /** 059 * Determines whether a character is whitespace according to the latest Unicode standard, as 060 * illustrated 061 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 062 * This is not the same definition used by other Java APIs. (See a 063 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 064 * definitions of "whitespace"</a>.) 065 * 066 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 067 * to date. 068 */ 069 public static final CharMatcher WHITESPACE = 070 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS) 071 .or(inRange('\u2000', '\u200a')) 072 .precomputed(); 073 074 /** 075 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 076 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 077 * discussion of that term. 078 * 079 * @since 2 080 */ 081 public static final CharMatcher BREAKING_WHITESPACE = 082 anyOf(BREAKING_WHITESPACE_CHARS) 083 .or(inRange('\u2000', '\u2006')) 084 .or(inRange('\u2008', '\u200a')) 085 .precomputed(); 086 087 /** 088 * Determines whether a character is ASCII, meaning that its code point is less than 128. 089 */ 090 public static final CharMatcher ASCII = inRange('\0', '\u007f'); 091 092 /** 093 * Determines whether a character is a digit according to 094 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 095 */ 096 public static final CharMatcher DIGIT; 097 098 static { 099 CharMatcher digit = inRange('0', '9'); 100 String zeroes = 101 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 102 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 103 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 104 for (char base : zeroes.toCharArray()) { 105 digit = digit.or(inRange(base, (char) (base + 9))); 106 } 107 DIGIT = digit.precomputed(); 108 } 109 110 /** 111 * Determines whether a character is whitespace according to {@link Character#isWhitespace(char) 112 * Java's definition}; it is usually preferable to use {@link #WHITESPACE}. (See a 113 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 114 * definitions of "whitespace"</a>.) 115 */ 116 public static final CharMatcher JAVA_WHITESPACE = 117 inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal 118 .or(inRange('\u001c', '\u0020')) 119 .or(is('\u1680')) 120 .or(is('\u180e')) 121 .or(inRange('\u2000', '\u2006')) 122 .or(inRange('\u2008', '\u200b')) 123 .or(inRange('\u2028', '\u2029')) 124 .or(is('\u205f')) 125 .or(is('\u3000')) 126 .precomputed(); 127 128 /** 129 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 130 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 131 */ 132 public static final CharMatcher JAVA_DIGIT = new CharMatcher() { 133 @Override public boolean matches(char c) { 134 return Character.isDigit(c); 135 } 136 }; 137 138 /** 139 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 140 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 141 * inRange('a', 'z').or(inRange('A', 'Z'))}. 142 */ 143 public static final CharMatcher JAVA_LETTER = new CharMatcher() { 144 @Override public boolean matches(char c) { 145 return Character.isLetter(c); 146 } 147 }; 148 149 /** 150 * Determines whether a character is a letter or digit according to {@link 151 * Character#isLetterOrDigit(char) Java's definition}. 152 */ 153 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() { 154 @Override public boolean matches(char c) { 155 return Character.isLetterOrDigit(c); 156 } 157 }; 158 159 /** 160 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 161 * Java's definition}. 162 */ 163 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() { 164 @Override public boolean matches(char c) { 165 return Character.isUpperCase(c); 166 } 167 }; 168 169 /** 170 * Determines whether a character is lower case according to {@link Character#isLowerCase(char) 171 * Java's definition}. 172 */ 173 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() { 174 @Override public boolean matches(char c) { 175 return Character.isLowerCase(c); 176 } 177 }; 178 179 /** 180 * Determines whether a character is an ISO control character as specified by {@link 181 * Character#isISOControl(char)}. 182 */ 183 public static final CharMatcher JAVA_ISO_CONTROL = 184 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f')); 185 186 /** 187 * Determines whether a character is invisible; that is, if its Unicode category is any of 188 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 189 * PRIVATE_USE according to ICU4J. 190 */ 191 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020') 192 .or(inRange('\u007f', '\u00a0')) 193 .or(is('\u00ad')) 194 .or(inRange('\u0600', '\u0603')) 195 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e")) 196 .or(inRange('\u2000', '\u200f')) 197 .or(inRange('\u2028', '\u202f')) 198 .or(inRange('\u205f', '\u2064')) 199 .or(inRange('\u206a', '\u206f')) 200 .or(is('\u3000')) 201 .or(inRange('\ud800', '\uf8ff')) 202 .or(anyOf("\ufeff\ufff9\ufffa\ufffb")) 203 .precomputed(); 204 205 /** 206 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 207 * errs on the side of returning {@code false} (that is, it tends to assume a character is 208 * double-width). 209 * 210 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 211 * date. 212 */ 213 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 214 .or(is('\u05be')) 215 .or(inRange('\u05d0', '\u05ea')) 216 .or(is('\u05f3')) 217 .or(is('\u05f4')) 218 .or(inRange('\u0600', '\u06ff')) 219 .or(inRange('\u0750', '\u077f')) 220 .or(inRange('\u0e00', '\u0e7f')) 221 .or(inRange('\u1e00', '\u20af')) 222 .or(inRange('\u2100', '\u213a')) 223 .or(inRange('\ufb50', '\ufdff')) 224 .or(inRange('\ufe70', '\ufeff')) 225 .or(inRange('\uff61', '\uffdc')) 226 .precomputed(); 227 228 /** Matches any character. */ 229 public static final CharMatcher ANY = 230 new CharMatcher() { 231 @Override public boolean matches(char c) { 232 return true; 233 } 234 235 @Override public int indexIn(CharSequence sequence) { 236 return (sequence.length() == 0) ? -1 : 0; 237 } 238 239 @Override public int indexIn(CharSequence sequence, int start) { 240 int length = sequence.length(); 241 Preconditions.checkPositionIndex(start, length); 242 return (start == length) ? -1 : start; 243 } 244 245 @Override public int lastIndexIn(CharSequence sequence) { 246 return sequence.length() - 1; 247 } 248 249 @Override public boolean matchesAllOf(CharSequence sequence) { 250 checkNotNull(sequence); 251 return true; 252 } 253 254 @Override public boolean matchesNoneOf(CharSequence sequence) { 255 return sequence.length() == 0; 256 } 257 258 @Override public String removeFrom(CharSequence sequence) { 259 checkNotNull(sequence); 260 return ""; 261 } 262 263 @Override public String replaceFrom(CharSequence sequence, char replacement) { 264 char[] array = new char[sequence.length()]; 265 Arrays.fill(array, replacement); 266 return new String(array); 267 } 268 269 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 270 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 271 for (int i = 0; i < sequence.length(); i++) { 272 retval.append(replacement); 273 } 274 return retval.toString(); 275 } 276 277 @Override public String collapseFrom(CharSequence sequence, char replacement) { 278 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 279 } 280 281 @Override public String trimFrom(CharSequence sequence) { 282 checkNotNull(sequence); 283 return ""; 284 } 285 286 @Override public int countIn(CharSequence sequence) { 287 return sequence.length(); 288 } 289 290 @Override public CharMatcher and(CharMatcher other) { 291 return checkNotNull(other); 292 } 293 294 @Override public CharMatcher or(CharMatcher other) { 295 checkNotNull(other); 296 return this; 297 } 298 299 @Override public CharMatcher negate() { 300 return NONE; 301 } 302 303 @Override public CharMatcher precomputed() { 304 return this; 305 } 306 }; 307 308 /** Matches no characters. */ 309 public static final CharMatcher NONE = 310 new CharMatcher() { 311 @Override public boolean matches(char c) { 312 return false; 313 } 314 315 @Override public int indexIn(CharSequence sequence) { 316 checkNotNull(sequence); 317 return -1; 318 } 319 320 @Override public int indexIn(CharSequence sequence, int start) { 321 int length = sequence.length(); 322 Preconditions.checkPositionIndex(start, length); 323 return -1; 324 } 325 326 @Override public int lastIndexIn(CharSequence sequence) { 327 checkNotNull(sequence); 328 return -1; 329 } 330 331 @Override public boolean matchesAllOf(CharSequence sequence) { 332 return sequence.length() == 0; 333 } 334 335 @Override public boolean matchesNoneOf(CharSequence sequence) { 336 checkNotNull(sequence); 337 return true; 338 } 339 340 @Override public String removeFrom(CharSequence sequence) { 341 return sequence.toString(); 342 } 343 344 @Override public String replaceFrom(CharSequence sequence, char replacement) { 345 return sequence.toString(); 346 } 347 348 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 349 checkNotNull(replacement); 350 return sequence.toString(); 351 } 352 353 @Override public String collapseFrom(CharSequence sequence, char replacement) { 354 return sequence.toString(); 355 } 356 357 @Override public String trimFrom(CharSequence sequence) { 358 return sequence.toString(); 359 } 360 361 @Override public int countIn(CharSequence sequence) { 362 checkNotNull(sequence); 363 return 0; 364 } 365 366 @Override public CharMatcher and(CharMatcher other) { 367 checkNotNull(other); 368 return this; 369 } 370 371 @Override public CharMatcher or(CharMatcher other) { 372 return checkNotNull(other); 373 } 374 375 @Override public CharMatcher negate() { 376 return ANY; 377 } 378 379 @Override void setBits(LookupTable table) {} 380 381 @Override public CharMatcher precomputed() { 382 return this; 383 } 384 }; 385 386 // Static factories 387 388 /** 389 * Returns a {@code char} matcher that matches only one specified character. 390 */ 391 public static CharMatcher is(final char match) { 392 return new CharMatcher() { 393 @Override public boolean matches(char c) { 394 return c == match; 395 } 396 397 @Override public String replaceFrom(CharSequence sequence, char replacement) { 398 return sequence.toString().replace(match, replacement); 399 } 400 401 @Override public CharMatcher and(CharMatcher other) { 402 return other.matches(match) ? this : NONE; 403 } 404 405 @Override public CharMatcher or(CharMatcher other) { 406 return other.matches(match) ? other : super.or(other); 407 } 408 409 @Override public CharMatcher negate() { 410 return isNot(match); 411 } 412 413 @Override void setBits(LookupTable table) { 414 table.set(match); 415 } 416 417 @Override public CharMatcher precomputed() { 418 return this; 419 } 420 }; 421 } 422 423 /** 424 * Returns a {@code char} matcher that matches any character except the one specified. 425 * 426 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 427 */ 428 public static CharMatcher isNot(final char match) { 429 return new CharMatcher() { 430 @Override public boolean matches(char c) { 431 return c != match; 432 } 433 434 @Override public CharMatcher and(CharMatcher other) { 435 return other.matches(match) ? super.and(other) : other; 436 } 437 438 @Override public CharMatcher or(CharMatcher other) { 439 return other.matches(match) ? ANY : this; 440 } 441 442 @Override public CharMatcher negate() { 443 return is(match); 444 } 445 }; 446 } 447 448 /** 449 * Returns a {@code char} matcher that matches any character present in the given character 450 * sequence. 451 */ 452 public static CharMatcher anyOf(final CharSequence sequence) { 453 switch (sequence.length()) { 454 case 0: 455 return NONE; 456 case 1: 457 return is(sequence.charAt(0)); 458 case 2: 459 final char match1 = sequence.charAt(0); 460 final char match2 = sequence.charAt(1); 461 return new CharMatcher() { 462 @Override public boolean matches(char c) { 463 return c == match1 || c == match2; 464 } 465 466 @Override void setBits(LookupTable table) { 467 table.set(match1); 468 table.set(match2); 469 } 470 471 @Override public CharMatcher precomputed() { 472 return this; 473 } 474 }; 475 } 476 477 final char[] chars = sequence.toString().toCharArray(); 478 Arrays.sort(chars); // not worth collapsing duplicates 479 480 return new CharMatcher() { 481 @Override public boolean matches(char c) { 482 return Arrays.binarySearch(chars, c) >= 0; 483 } 484 485 @Override void setBits(LookupTable table) { 486 for (char c : chars) { 487 table.set(c); 488 } 489 } 490 }; 491 } 492 493 /** 494 * Returns a {@code char} matcher that matches any character not present in the given character 495 * sequence. 496 */ 497 public static CharMatcher noneOf(CharSequence sequence) { 498 return anyOf(sequence).negate(); 499 } 500 501 /** 502 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 503 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 504 * CharMatcher.inRange('a', 'z')}. 505 * 506 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 507 */ 508 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 509 checkArgument(endInclusive >= startInclusive); 510 return new CharMatcher() { 511 @Override public boolean matches(char c) { 512 return startInclusive <= c && c <= endInclusive; 513 } 514 515 @Override void setBits(LookupTable table) { 516 char c = startInclusive; 517 while (true) { 518 table.set(c); 519 if (c++ == endInclusive) { 520 break; 521 } 522 } 523 } 524 525 @Override public CharMatcher precomputed() { 526 return this; 527 } 528 }; 529 } 530 531 /** 532 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 533 * which operates on primitive {@code char} instances instead. 534 */ 535 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 536 checkNotNull(predicate); 537 if (predicate instanceof CharMatcher) { 538 return (CharMatcher) predicate; 539 } 540 return new CharMatcher() { 541 @Override public boolean matches(char c) { 542 return predicate.apply(c); 543 } 544 545 @Override public boolean apply(Character character) { 546 return predicate.apply(checkNotNull(character)); 547 } 548 }; 549 } 550 551 // Abstract methods 552 553 /** Determines a true or false value for the given character. */ 554 public abstract boolean matches(char c); 555 556 // Non-static factories 557 558 /** 559 * Returns a matcher that matches any character not matched by this matcher. 560 */ 561 public CharMatcher negate() { 562 final CharMatcher original = this; 563 return new CharMatcher() { 564 @Override public boolean matches(char c) { 565 return !original.matches(c); 566 } 567 568 @Override public boolean matchesAllOf(CharSequence sequence) { 569 return original.matchesNoneOf(sequence); 570 } 571 572 @Override public boolean matchesNoneOf(CharSequence sequence) { 573 return original.matchesAllOf(sequence); 574 } 575 576 @Override public int countIn(CharSequence sequence) { 577 return sequence.length() - original.countIn(sequence); 578 } 579 580 @Override public CharMatcher negate() { 581 return original; 582 } 583 }; 584 } 585 586 /** 587 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 588 */ 589 public CharMatcher and(CharMatcher other) { 590 return new And(Arrays.asList(this, checkNotNull(other))); 591 } 592 593 private static class And extends CharMatcher { 594 List<CharMatcher> components; 595 596 And(List<CharMatcher> components) { 597 this.components = components; // Skip defensive copy (private) 598 } 599 600 @Override public boolean matches(char c) { 601 for (CharMatcher matcher : components) { 602 if (!matcher.matches(c)) { 603 return false; 604 } 605 } 606 return true; 607 } 608 609 @Override public CharMatcher and(CharMatcher other) { 610 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 611 newComponents.add(checkNotNull(other)); 612 return new And(newComponents); 613 } 614 } 615 616 /** 617 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 618 */ 619 public CharMatcher or(CharMatcher other) { 620 return new Or(Arrays.asList(this, checkNotNull(other))); 621 } 622 623 private static class Or extends CharMatcher { 624 List<CharMatcher> components; 625 626 Or(List<CharMatcher> components) { 627 this.components = components; // Skip defensive copy (private) 628 } 629 630 @Override public boolean matches(char c) { 631 for (CharMatcher matcher : components) { 632 if (matcher.matches(c)) { 633 return true; 634 } 635 } 636 return false; 637 } 638 639 @Override public CharMatcher or(CharMatcher other) { 640 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 641 newComponents.add(checkNotNull(other)); 642 return new Or(newComponents); 643 } 644 645 @Override void setBits(LookupTable table) { 646 for (CharMatcher matcher : components) { 647 matcher.setBits(table); 648 } 649 } 650 } 651 652 /** 653 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 654 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 655 * worthwhile only if the precomputed matcher is queried many thousands of times. 656 * 657 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 658 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 659 * worthwhile tradeoff in a browser. 660 */ 661 public CharMatcher precomputed() { 662 return Platform.precomputeCharMatcher(this); 663 } 664 665 /** 666 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 667 * on {@link Platform} so that we can have different behavior in GWT. 668 * 669 * <p>The default precomputation is to cache the configuration of the original matcher in an 670 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query 671 * than the original. 672 * 673 * <p>The default implementation creates a new bit array and passes it to {@link 674 * #setBits(LookupTable)}. 675 */ 676 CharMatcher precomputedInternal() { 677 final LookupTable table = new LookupTable(); 678 setBits(table); 679 680 return new CharMatcher() { 681 @Override public boolean matches(char c) { 682 return table.get(c); 683 } 684 685 // TODO: make methods like negate() smart 686 687 @Override public CharMatcher precomputed() { 688 return this; 689 } 690 }; 691 } 692 693 /** 694 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal 695 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched. 696 * 697 * <p>The default implementation loops over every possible character value, invoking {@link 698 * #matches} for each one. 699 */ 700 void setBits(LookupTable table) { 701 char c = Character.MIN_VALUE; 702 while (true) { 703 if (matches(c)) { 704 table.set(c); 705 } 706 if (c++ == Character.MAX_VALUE) { 707 break; 708 } 709 } 710 } 711 712 /** 713 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}. 714 * 715 * <p>TODO: possibly share a common BitArray class with BloomFilter and others... a simpler 716 * java.util.BitSet. 717 */ 718 private static final class LookupTable { 719 int[] data = new int[2048]; 720 721 void set(char index) { 722 data[index >> 5] |= (1 << index); 723 } 724 725 boolean get(char index) { 726 return (data[index >> 5] & (1 << index)) != 0; 727 } 728 } 729 730 // Text processing routines 731 732 /** 733 * Returns {@code true} if a character sequence contains only matching characters. 734 * 735 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 736 * character, until this returns {@code false} or the end is reached. 737 * 738 * @param sequence the character sequence to examine, possibly empty 739 * @return {@code true} if this matcher matches every character in the sequence, including when 740 * the sequence is empty 741 */ 742 public boolean matchesAllOf(CharSequence sequence) { 743 for (int i = sequence.length() - 1; i >= 0; i--) { 744 if (!matches(sequence.charAt(i))) { 745 return false; 746 } 747 } 748 return true; 749 } 750 751 /** 752 * Returns {@code true} if a character sequence contains no matching characters. 753 * 754 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 755 * character, until this returns {@code false} or the end is reached. 756 * 757 * @param sequence the character sequence to examine, possibly empty 758 * @return {@code true} if this matcher matches every character in the sequence, including when 759 * the sequence is empty 760 */ 761 public boolean matchesNoneOf(CharSequence sequence) { 762 return indexIn(sequence) == -1; 763 } 764 765 // TODO: perhaps add matchesAnyOf() 766 767 /** 768 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 769 * matching character is present. 770 * 771 * <p>The default implementation iterates over the sequence in forward order calling {@link 772 * #matches} for each character. 773 * 774 * @param sequence the character sequence to examine from the beginning 775 * @return an index, or {@code -1} if no character matches 776 */ 777 public int indexIn(CharSequence sequence) { 778 int length = sequence.length(); 779 for (int i = 0; i < length; i++) { 780 if (matches(sequence.charAt(i))) { 781 return i; 782 } 783 } 784 return -1; 785 } 786 787 /** 788 * Returns the index of the first matching character in a character sequence, starting from a 789 * given position, or {@code -1} if no character matches after that position. 790 * 791 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 792 * start}, calling {@link #matches} for each character. 793 * 794 * @param sequence the character sequence to examine 795 * @param start the first index to examine; must be nonnegative and no greater than {@code 796 * sequence.length()} 797 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 798 * or {@code -1} if no character matches 799 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 800 * sequence.length()} 801 */ 802 public int indexIn(CharSequence sequence, int start) { 803 int length = sequence.length(); 804 Preconditions.checkPositionIndex(start, length); 805 for (int i = start; i < length; i++) { 806 if (matches(sequence.charAt(i))) { 807 return i; 808 } 809 } 810 return -1; 811 } 812 813 /** 814 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 815 * matching character is present. 816 * 817 * <p>The default implementation iterates over the sequence in reverse order calling {@link 818 * #matches} for each character. 819 * 820 * @param sequence the character sequence to examine from the end 821 * @return an index, or {@code -1} if no character matches 822 */ 823 public int lastIndexIn(CharSequence sequence) { 824 for (int i = sequence.length() - 1; i >= 0; i--) { 825 if (matches(sequence.charAt(i))) { 826 return i; 827 } 828 } 829 return -1; 830 } 831 832 /** 833 * Returns the number of matching characters found in a character sequence. 834 */ 835 public int countIn(CharSequence sequence) { 836 int count = 0; 837 for (int i = 0; i < sequence.length(); i++) { 838 if (matches(sequence.charAt(i))) { 839 count++; 840 } 841 } 842 return count; 843 } 844 845 /** 846 * Returns a string containing all non-matching characters of a character sequence, in order. For 847 * example: <pre> {@code 848 * 849 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 850 * 851 * ... returns {@code "bzr"}. 852 */ 853 public String removeFrom(CharSequence sequence) { 854 String string = sequence.toString(); 855 int pos = indexIn(string); 856 if (pos == -1) { 857 return string; 858 } 859 860 char[] chars = string.toCharArray(); 861 int spread = 1; 862 863 // This unusual loop comes from extensive benchmarking 864 OUT: while (true) { 865 pos++; 866 while (true) { 867 if (pos == chars.length) { 868 break OUT; 869 } 870 if (matches(chars[pos])) { 871 break; 872 } 873 chars[pos - spread] = chars[pos]; 874 pos++; 875 } 876 spread++; 877 } 878 return new String(chars, 0, pos - spread); 879 } 880 881 /** 882 * Returns a string containing all matching characters of a character sequence, in order. For 883 * example: <pre> {@code 884 * 885 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 886 * 887 * ... returns {@code "aaa"}. 888 */ 889 public String retainFrom(CharSequence sequence) { 890 return negate().removeFrom(sequence); 891 } 892 893 /** 894 * Returns a string copy of the input character sequence, with each character that matches this 895 * matcher replaced by a given replacement character. For example: <pre> {@code 896 * 897 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 898 * 899 * ... returns {@code "rodor"}. 900 * 901 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 902 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 903 * character. 904 * 905 * @param sequence the character sequence to replace matching characters in 906 * @param replacement the character to append to the result string in place of each matching 907 * character in {@code sequence} 908 * @return the new string 909 */ 910 public String replaceFrom(CharSequence sequence, char replacement) { 911 String string = sequence.toString(); 912 int pos = indexIn(string); 913 if (pos == -1) { 914 return string; 915 } 916 char[] chars = string.toCharArray(); 917 chars[pos] = replacement; 918 for (int i = pos + 1; i < chars.length; i++) { 919 if (matches(chars[i])) { 920 chars[i] = replacement; 921 } 922 } 923 return new String(chars); 924 } 925 926 /** 927 * Returns a string copy of the input character sequence, with each character that matches this 928 * matcher replaced by a given replacement sequence. For example: <pre> {@code 929 * 930 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 931 * 932 * ... returns {@code "yoohoo"}. 933 * 934 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 935 * off calling {@link #replaceFrom(CharSequence, char)} directly. 936 * 937 * @param sequence the character sequence to replace matching characters in 938 * @param replacement the characters to append to the result string in place of each matching 939 * character in {@code sequence} 940 * @return the new string 941 */ 942 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 943 int replacementLen = replacement.length(); 944 if (replacementLen == 0) { 945 return removeFrom(sequence); 946 } 947 if (replacementLen == 1) { 948 return replaceFrom(sequence, replacement.charAt(0)); 949 } 950 951 String string = sequence.toString(); 952 int pos = indexIn(string); 953 if (pos == -1) { 954 return string; 955 } 956 957 int len = string.length(); 958 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 959 960 int oldpos = 0; 961 do { 962 buf.append(string, oldpos, pos); 963 buf.append(replacement); 964 oldpos = pos + 1; 965 pos = indexIn(string, oldpos); 966 } while (pos != -1); 967 968 buf.append(string, oldpos, len); 969 return buf.toString(); 970 } 971 972 /** 973 * Returns a substring of the input character sequence that omits all characters this matcher 974 * matches from the beginning and from the end of the string. For example: <pre> {@code 975 * 976 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 977 * 978 * ... returns {@code "cat"}. 979 * 980 * <p>Note that: <pre> {@code 981 * 982 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 983 * 984 * ... is equivalent to {@link String#trim()}. 985 */ 986 public String trimFrom(CharSequence sequence) { 987 int len = sequence.length(); 988 int first; 989 int last; 990 991 for (first = 0; first < len; first++) { 992 if (!matches(sequence.charAt(first))) { 993 break; 994 } 995 } 996 for (last = len - 1; last > first; last--) { 997 if (!matches(sequence.charAt(last))) { 998 break; 999 } 1000 } 1001 1002 return sequence.subSequence(first, last + 1).toString(); 1003 } 1004 1005 /** 1006 * Returns a substring of the input character sequence that omits all characters this matcher 1007 * matches from the beginning of the string. For example: <pre> {@code 1008 * 1009 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1010 * 1011 * ... returns {@code "catbab"}. 1012 */ 1013 public String trimLeadingFrom(CharSequence sequence) { 1014 int len = sequence.length(); 1015 int first; 1016 1017 for (first = 0; first < len; first++) { 1018 if (!matches(sequence.charAt(first))) { 1019 break; 1020 } 1021 } 1022 1023 return sequence.subSequence(first, len).toString(); 1024 } 1025 1026 /** 1027 * Returns a substring of the input character sequence that omits all characters this matcher 1028 * matches from the end of the string. For example: <pre> {@code 1029 * 1030 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1031 * 1032 * ... returns {@code "abacat"}. 1033 */ 1034 public String trimTrailingFrom(CharSequence sequence) { 1035 int len = sequence.length(); 1036 int last; 1037 1038 for (last = len - 1; last >= 0; last--) { 1039 if (!matches(sequence.charAt(last))) { 1040 break; 1041 } 1042 } 1043 1044 return sequence.subSequence(0, last + 1).toString(); 1045 } 1046 1047 /** 1048 * Returns a string copy of the input character sequence, with each group of consecutive 1049 * characters that match this matcher replaced by a single replacement character. For example: 1050 * <pre> {@code 1051 * 1052 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1053 * 1054 * ... returns {@code "b-p-r"}. 1055 * 1056 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1057 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1058 * character. 1059 * 1060 * @param sequence the character sequence to replace matching groups of characters in 1061 * @param replacement the character to append to the result string in place of each group of 1062 * matching characters in {@code sequence} 1063 * @return the new string 1064 */ 1065 public String collapseFrom(CharSequence sequence, char replacement) { 1066 int first = indexIn(sequence); 1067 if (first == -1) { 1068 return sequence.toString(); 1069 } 1070 1071 // TODO: this implementation can probably be made faster. 1072 1073 StringBuilder builder = new StringBuilder(sequence.length()) 1074 .append(sequence.subSequence(0, first)) 1075 .append(replacement); 1076 boolean in = true; 1077 for (int i = first + 1; i < sequence.length(); i++) { 1078 char c = sequence.charAt(i); 1079 if (apply(c)) { 1080 if (!in) { 1081 builder.append(replacement); 1082 in = true; 1083 } 1084 } else { 1085 builder.append(c); 1086 in = false; 1087 } 1088 } 1089 return builder.toString(); 1090 } 1091 1092 /** 1093 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1094 * groups of matching characters at the start or end of the sequence are removed without 1095 * replacement. 1096 */ 1097 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1098 int first = negate().indexIn(sequence); 1099 if (first == -1) { 1100 return ""; // everything matches. nothing's left. 1101 } 1102 StringBuilder builder = new StringBuilder(sequence.length()); 1103 boolean inMatchingGroup = false; 1104 for (int i = first; i < sequence.length(); i++) { 1105 char c = sequence.charAt(i); 1106 if (apply(c)) { 1107 inMatchingGroup = true; 1108 } else { 1109 if (inMatchingGroup) { 1110 builder.append(replacement); 1111 inMatchingGroup = false; 1112 } 1113 builder.append(c); 1114 } 1115 } 1116 return builder.toString(); 1117 } 1118 1119 // Predicate interface 1120 1121 /** 1122 * Returns {@code true} if this matcher matches the given character. 1123 * 1124 * @throws NullPointerException if {@code character} is null 1125 */ 1126 @Override public boolean apply(Character character) { 1127 return matches(character); 1128 } 1129 }