001 /* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 022 import com.google.common.annotations.Beta; 023 import com.google.common.annotations.GwtCompatible; 024 025 import java.util.ArrayList; 026 import java.util.Arrays; 027 import java.util.List; 028 029 import javax.annotation.CheckReturnValue; 030 031 /** 032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 033 * for any {@link Object}. Also offers basic text processing methods based on this function. 034 * Implementations are strongly encouraged to be side-effect-free and immutable. 035 * 036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 038 * 039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 042 * treats these just as two separate characters. 043 * 044 * <p>Example usages: <pre> 045 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput); 046 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 047 * 048 * <p>See the Guava User Guide article on <a href= 049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher"> 050 * {@code CharMatcher}</a>. 051 * 052 * @author Kevin Bourrillion 053 * @since 1.0 054 */ 055 @Beta // Possibly change from chars to code points; decide constants vs. methods 056 @GwtCompatible 057 public abstract class CharMatcher implements Predicate<Character> { 058 // Constants 059 /** 060 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 061 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 062 * discussion of that term. 063 * 064 * @since 2.0 065 */ 066 public static final CharMatcher BREAKING_WHITESPACE = 067 anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000") 068 .or(inRange('\u2000', '\u2006')) 069 .or(inRange('\u2008', '\u200a')) 070 .withToString("CharMatcher.BREAKING_WHITESPACE") 071 .precomputed(); 072 073 /** 074 * Determines whether a character is ASCII, meaning that its code point is less than 128. 075 */ 076 public static final CharMatcher ASCII = inRange('\0', '\u007f') 077 .withToString("CharMatcher.ASCII"); 078 079 /** 080 * Determines whether a character is a digit according to 081 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 082 */ 083 public static final CharMatcher DIGIT; 084 085 static { 086 CharMatcher digit = inRange('0', '9'); 087 String zeroes = 088 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 089 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 090 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 091 for (char base : zeroes.toCharArray()) { 092 digit = digit.or(inRange(base, (char) (base + 9))); 093 } 094 DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed(); 095 } 096 097 /** 098 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 099 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 100 */ 101 public static final CharMatcher JAVA_DIGIT = new CharMatcher() { 102 @Override public boolean matches(char c) { 103 return Character.isDigit(c); 104 } 105 106 @Override public String toString() { 107 return "CharMatcher.JAVA_DIGIT"; 108 } 109 }; 110 111 /** 112 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 113 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 114 * inRange('a', 'z').or(inRange('A', 'Z'))}. 115 */ 116 public static final CharMatcher JAVA_LETTER = new CharMatcher() { 117 @Override public boolean matches(char c) { 118 return Character.isLetter(c); 119 } 120 121 @Override public String toString() { 122 return "CharMatcher.JAVA_LETTER"; 123 } 124 }; 125 126 /** 127 * Determines whether a character is a letter or digit according to {@link 128 * Character#isLetterOrDigit(char) Java's definition}. 129 */ 130 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() { 131 @Override public boolean matches(char c) { 132 return Character.isLetterOrDigit(c); 133 } 134 135 @Override public String toString() { 136 return "CharMatcher.JAVA_LETTER_OR_DIGIT"; 137 } 138 }; 139 140 /** 141 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 142 * Java's definition}. 143 */ 144 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() { 145 @Override public boolean matches(char c) { 146 return Character.isUpperCase(c); 147 } 148 149 @Override public String toString() { 150 return "CharMatcher.JAVA_UPPER_CASE"; 151 } 152 }; 153 154 /** 155 * Determines whether a character is lower case according to {@link Character#isLowerCase(char) 156 * Java's definition}. 157 */ 158 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() { 159 @Override public boolean matches(char c) { 160 return Character.isLowerCase(c); 161 } 162 163 @Override public String toString() { 164 return "CharMatcher.JAVA_LOWER_CASE"; 165 } 166 }; 167 168 /** 169 * Determines whether a character is an ISO control character as specified by {@link 170 * Character#isISOControl(char)}. 171 */ 172 public static final CharMatcher JAVA_ISO_CONTROL = 173 inRange('\u0000', '\u001f').or(inRange('\u007f', '\u009f')) 174 .withToString("CharMatcher.JAVA_ISO_CONTROL"); 175 176 /** 177 * Determines whether a character is invisible; that is, if its Unicode category is any of 178 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 179 * PRIVATE_USE according to ICU4J. 180 */ 181 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020') 182 .or(inRange('\u007f', '\u00a0')) 183 .or(is('\u00ad')) 184 .or(inRange('\u0600', '\u0603')) 185 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e")) 186 .or(inRange('\u2000', '\u200f')) 187 .or(inRange('\u2028', '\u202f')) 188 .or(inRange('\u205f', '\u2064')) 189 .or(inRange('\u206a', '\u206f')) 190 .or(is('\u3000')) 191 .or(inRange('\ud800', '\uf8ff')) 192 .or(anyOf("\ufeff\ufff9\ufffa\ufffb")) 193 .withToString("CharMatcher.INVISIBLE") 194 .precomputed(); 195 196 /** 197 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 198 * errs on the side of returning {@code false} (that is, it tends to assume a character is 199 * double-width). 200 * 201 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 202 * date. 203 */ 204 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 205 .or(is('\u05be')) 206 .or(inRange('\u05d0', '\u05ea')) 207 .or(is('\u05f3')) 208 .or(is('\u05f4')) 209 .or(inRange('\u0600', '\u06ff')) 210 .or(inRange('\u0750', '\u077f')) 211 .or(inRange('\u0e00', '\u0e7f')) 212 .or(inRange('\u1e00', '\u20af')) 213 .or(inRange('\u2100', '\u213a')) 214 .or(inRange('\ufb50', '\ufdff')) 215 .or(inRange('\ufe70', '\ufeff')) 216 .or(inRange('\uff61', '\uffdc')) 217 .withToString("CharMatcher.SINGLE_WIDTH") 218 .precomputed(); 219 220 /** Matches any character. */ 221 public static final CharMatcher ANY = 222 new CharMatcher() { 223 @Override public boolean matches(char c) { 224 return true; 225 } 226 227 @Override public int indexIn(CharSequence sequence) { 228 return (sequence.length() == 0) ? -1 : 0; 229 } 230 231 @Override public int indexIn(CharSequence sequence, int start) { 232 int length = sequence.length(); 233 Preconditions.checkPositionIndex(start, length); 234 return (start == length) ? -1 : start; 235 } 236 237 @Override public int lastIndexIn(CharSequence sequence) { 238 return sequence.length() - 1; 239 } 240 241 @Override public boolean matchesAllOf(CharSequence sequence) { 242 checkNotNull(sequence); 243 return true; 244 } 245 246 @Override public boolean matchesNoneOf(CharSequence sequence) { 247 return sequence.length() == 0; 248 } 249 250 @Override public String removeFrom(CharSequence sequence) { 251 checkNotNull(sequence); 252 return ""; 253 } 254 255 @Override public String replaceFrom(CharSequence sequence, char replacement) { 256 char[] array = new char[sequence.length()]; 257 Arrays.fill(array, replacement); 258 return new String(array); 259 } 260 261 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 262 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 263 for (int i = 0; i < sequence.length(); i++) { 264 retval.append(replacement); 265 } 266 return retval.toString(); 267 } 268 269 @Override public String collapseFrom(CharSequence sequence, char replacement) { 270 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 271 } 272 273 @Override public String trimFrom(CharSequence sequence) { 274 checkNotNull(sequence); 275 return ""; 276 } 277 278 @Override public int countIn(CharSequence sequence) { 279 return sequence.length(); 280 } 281 282 @Override public CharMatcher and(CharMatcher other) { 283 return checkNotNull(other); 284 } 285 286 @Override public CharMatcher or(CharMatcher other) { 287 checkNotNull(other); 288 return this; 289 } 290 291 @Override public CharMatcher negate() { 292 return NONE; 293 } 294 295 @Override public CharMatcher precomputed() { 296 return this; 297 } 298 299 @Override public String toString() { 300 return "CharMatcher.ANY"; 301 } 302 }; 303 304 /** Matches no characters. */ 305 public static final CharMatcher NONE = 306 new CharMatcher() { 307 @Override public boolean matches(char c) { 308 return false; 309 } 310 311 @Override public int indexIn(CharSequence sequence) { 312 checkNotNull(sequence); 313 return -1; 314 } 315 316 @Override public int indexIn(CharSequence sequence, int start) { 317 int length = sequence.length(); 318 Preconditions.checkPositionIndex(start, length); 319 return -1; 320 } 321 322 @Override public int lastIndexIn(CharSequence sequence) { 323 checkNotNull(sequence); 324 return -1; 325 } 326 327 @Override public boolean matchesAllOf(CharSequence sequence) { 328 return sequence.length() == 0; 329 } 330 331 @Override public boolean matchesNoneOf(CharSequence sequence) { 332 checkNotNull(sequence); 333 return true; 334 } 335 336 @Override public String removeFrom(CharSequence sequence) { 337 return sequence.toString(); 338 } 339 340 @Override public String replaceFrom(CharSequence sequence, char replacement) { 341 return sequence.toString(); 342 } 343 344 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 345 checkNotNull(replacement); 346 return sequence.toString(); 347 } 348 349 @Override public String collapseFrom(CharSequence sequence, char replacement) { 350 return sequence.toString(); 351 } 352 353 @Override public String trimFrom(CharSequence sequence) { 354 return sequence.toString(); 355 } 356 357 @Override public int countIn(CharSequence sequence) { 358 checkNotNull(sequence); 359 return 0; 360 } 361 362 @Override public CharMatcher and(CharMatcher other) { 363 checkNotNull(other); 364 return this; 365 } 366 367 @Override public CharMatcher or(CharMatcher other) { 368 return checkNotNull(other); 369 } 370 371 @Override public CharMatcher negate() { 372 return ANY; 373 } 374 375 @Override void setBits(LookupTable table) {} 376 377 @Override public CharMatcher precomputed() { 378 return this; 379 } 380 381 @Override public String toString() { 382 return "CharMatcher.NONE"; 383 } 384 }; 385 386 // Static factories 387 388 /** 389 * Returns a {@code char} matcher that matches only one specified character. 390 */ 391 public static CharMatcher is(final char match) { 392 return new CharMatcher() { 393 @Override public boolean matches(char c) { 394 return c == match; 395 } 396 397 @Override public String replaceFrom(CharSequence sequence, char replacement) { 398 return sequence.toString().replace(match, replacement); 399 } 400 401 @Override public CharMatcher and(CharMatcher other) { 402 return other.matches(match) ? this : NONE; 403 } 404 405 @Override public CharMatcher or(CharMatcher other) { 406 return other.matches(match) ? other : super.or(other); 407 } 408 409 @Override public CharMatcher negate() { 410 return isNot(match); 411 } 412 413 @Override void setBits(LookupTable table) { 414 table.set(match); 415 } 416 417 @Override public CharMatcher precomputed() { 418 return this; 419 } 420 421 @Override public String toString() { 422 return new StringBuilder("CharMatcher.is(") 423 .append(Integer.toHexString(match)) 424 .append(")") 425 .toString(); 426 } 427 }; 428 } 429 430 /** 431 * Returns a {@code char} matcher that matches any character except the one specified. 432 * 433 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 434 */ 435 public static CharMatcher isNot(final char match) { 436 return new CharMatcher() { 437 @Override public boolean matches(char c) { 438 return c != match; 439 } 440 441 @Override public CharMatcher and(CharMatcher other) { 442 return other.matches(match) ? super.and(other) : other; 443 } 444 445 @Override public CharMatcher or(CharMatcher other) { 446 return other.matches(match) ? ANY : this; 447 } 448 449 @Override public CharMatcher negate() { 450 return is(match); 451 } 452 453 @Override public String toString() { 454 return new StringBuilder("CharMatcher.isNot(") 455 .append(Integer.toHexString(match)) 456 .append(")") 457 .toString(); 458 } 459 }; 460 } 461 462 /** 463 * Returns a {@code char} matcher that matches any character present in the given character 464 * sequence. 465 */ 466 public static CharMatcher anyOf(final CharSequence sequence) { 467 switch (sequence.length()) { 468 case 0: 469 return NONE; 470 case 1: 471 return is(sequence.charAt(0)); 472 case 2: 473 final char match1 = sequence.charAt(0); 474 final char match2 = sequence.charAt(1); 475 return new CharMatcher() { 476 @Override public boolean matches(char c) { 477 return c == match1 || c == match2; 478 } 479 480 @Override void setBits(LookupTable table) { 481 table.set(match1); 482 table.set(match2); 483 } 484 485 @Override public CharMatcher precomputed() { 486 return this; 487 } 488 }; 489 } 490 491 final char[] chars = sequence.toString().toCharArray(); 492 Arrays.sort(chars); // not worth collapsing duplicates 493 494 return new CharMatcher() { 495 @Override public boolean matches(char c) { 496 return Arrays.binarySearch(chars, c) >= 0; 497 } 498 499 @Override void setBits(LookupTable table) { 500 for (char c : chars) { 501 table.set(c); 502 } 503 } 504 505 @Override public String toString() { 506 return new StringBuilder("CharMatcher.anyOf(\"").append(chars).append("\")").toString(); 507 } 508 }; 509 } 510 511 /** 512 * Returns a {@code char} matcher that matches any character not present in the given character 513 * sequence. 514 */ 515 public static CharMatcher noneOf(CharSequence sequence) { 516 return anyOf(sequence).negate(); 517 } 518 519 /** 520 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 521 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 522 * CharMatcher.inRange('a', 'z')}. 523 * 524 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 525 */ 526 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 527 checkArgument(endInclusive >= startInclusive); 528 return new CharMatcher() { 529 @Override public boolean matches(char c) { 530 return startInclusive <= c && c <= endInclusive; 531 } 532 533 @Override void setBits(LookupTable table) { 534 char c = startInclusive; 535 while (true) { 536 table.set(c); 537 if (c++ == endInclusive) { 538 break; 539 } 540 } 541 } 542 543 @Override public CharMatcher precomputed() { 544 return this; 545 } 546 547 @Override public String toString() { 548 return new StringBuilder("CharMatcher.inRange(") 549 .append(Integer.toHexString(startInclusive)) 550 .append(", ") 551 .append(Integer.toHexString(endInclusive)) 552 .append(")") 553 .toString(); 554 } 555 }; 556 } 557 558 /** 559 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 560 * which operates on primitive {@code char} instances instead. 561 */ 562 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 563 checkNotNull(predicate); 564 if (predicate instanceof CharMatcher) { 565 return (CharMatcher) predicate; 566 } 567 return new CharMatcher() { 568 @Override public boolean matches(char c) { 569 return predicate.apply(c); 570 } 571 572 @Override public boolean apply(Character character) { 573 return predicate.apply(checkNotNull(character)); 574 } 575 576 @Override public String toString() { 577 return new StringBuilder("CharMatcher.forPredicate(") 578 .append(predicate) 579 .append(')') 580 .toString(); 581 } 582 }; 583 } 584 585 // Constructors 586 587 /** 588 * Constructor for use by subclasses. 589 */ 590 protected CharMatcher() {} 591 592 // Abstract methods 593 594 /** Determines a true or false value for the given character. */ 595 public abstract boolean matches(char c); 596 597 // Non-static factories 598 599 /** 600 * Returns a matcher that matches any character not matched by this matcher. 601 */ 602 public CharMatcher negate() { 603 final CharMatcher original = this; 604 return new CharMatcher() { 605 @Override public boolean matches(char c) { 606 return !original.matches(c); 607 } 608 609 @Override public boolean matchesAllOf(CharSequence sequence) { 610 return original.matchesNoneOf(sequence); 611 } 612 613 @Override public boolean matchesNoneOf(CharSequence sequence) { 614 return original.matchesAllOf(sequence); 615 } 616 617 @Override public int countIn(CharSequence sequence) { 618 return sequence.length() - original.countIn(sequence); 619 } 620 621 @Override public CharMatcher negate() { 622 return original; 623 } 624 625 @Override public String toString() { 626 return original + ".negate()"; 627 } 628 }; 629 } 630 631 /** 632 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 633 */ 634 public CharMatcher and(CharMatcher other) { 635 return new And(Arrays.asList(this, checkNotNull(other))); 636 } 637 638 private static class And extends CharMatcher { 639 List<CharMatcher> components; 640 641 And(List<CharMatcher> components) { 642 this.components = components; // Skip defensive copy (private) 643 } 644 645 @Override public boolean matches(char c) { 646 for (CharMatcher matcher : components) { 647 if (!matcher.matches(c)) { 648 return false; 649 } 650 } 651 return true; 652 } 653 654 @Override public CharMatcher and(CharMatcher other) { 655 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 656 newComponents.add(checkNotNull(other)); 657 return new And(newComponents); 658 } 659 660 @Override public String toString() { 661 StringBuilder builder = new StringBuilder("CharMatcher.and("); 662 Joiner.on(", ").appendTo(builder, components); 663 return builder.append(')').toString(); 664 } 665 } 666 667 /** 668 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 669 */ 670 public CharMatcher or(CharMatcher other) { 671 return new Or(Arrays.asList(this, checkNotNull(other))); 672 } 673 674 private static class Or extends CharMatcher { 675 List<CharMatcher> components; 676 677 Or(List<CharMatcher> components) { 678 this.components = components; // Skip defensive copy (private) 679 } 680 681 @Override public boolean matches(char c) { 682 for (CharMatcher matcher : components) { 683 if (matcher.matches(c)) { 684 return true; 685 } 686 } 687 return false; 688 } 689 690 @Override public CharMatcher or(CharMatcher other) { 691 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 692 newComponents.add(checkNotNull(other)); 693 return new Or(newComponents); 694 } 695 696 @Override void setBits(LookupTable table) { 697 for (CharMatcher matcher : components) { 698 matcher.setBits(table); 699 } 700 } 701 702 @Override public String toString() { 703 StringBuilder builder = new StringBuilder("CharMatcher.or("); 704 Joiner.on(", ").appendTo(builder, components); 705 return builder.append(')').toString(); 706 } 707 } 708 709 /** 710 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 711 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 712 * worthwhile only if the precomputed matcher is queried many thousands of times. 713 * 714 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 715 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 716 * worthwhile tradeoff in a browser. 717 */ 718 public CharMatcher precomputed() { 719 return Platform.precomputeCharMatcher(this); 720 } 721 722 /** 723 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 724 * on {@link Platform} so that we can have different behavior in GWT. 725 * 726 * <p>The default precomputation is to cache the configuration of the original matcher in an 727 * eight-kilobyte bit array. In some situations this produces a matcher which is faster to query 728 * than the original. 729 * 730 * <p>The default implementation creates a new bit array and passes it to {@link 731 * #setBits(LookupTable)}. 732 */ 733 CharMatcher precomputedInternal() { 734 final LookupTable table = new LookupTable(); 735 setBits(table); 736 final CharMatcher outer = this; 737 738 return new CharMatcher() { 739 @Override public boolean matches(char c) { 740 return table.get(c); 741 } 742 743 // TODO(kevinb): make methods like negate() smart? 744 745 @Override public CharMatcher precomputed() { 746 return this; 747 } 748 749 @Override public String toString() { 750 return outer.toString(); 751 } 752 }; 753 } 754 755 CharMatcher withToString(final String toString) { 756 final CharMatcher delegate = this; 757 return new CharMatcher() { 758 @Override 759 public boolean matches(char c) { 760 return delegate.matches(c); 761 } 762 763 @Override 764 void setBits(LookupTable table) { 765 delegate.setBits(table); 766 } 767 768 @Override 769 public String toString() { 770 return toString; 771 } 772 }; 773 } 774 775 /** 776 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal 777 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched. 778 * 779 * <p>The default implementation loops over every possible character value, invoking {@link 780 * #matches} for each one. 781 */ 782 void setBits(LookupTable table) { 783 char c = Character.MIN_VALUE; 784 while (true) { 785 if (matches(c)) { 786 table.set(c); 787 } 788 if (c++ == Character.MAX_VALUE) { 789 break; 790 } 791 } 792 } 793 794 /** 795 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}. 796 * 797 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a 798 * simpler java.util.BitSet. 799 */ 800 private static final class LookupTable { 801 int[] data = new int[2048]; 802 803 void set(char index) { 804 data[index >> 5] |= (1 << index); 805 } 806 807 boolean get(char index) { 808 return (data[index >> 5] & (1 << index)) != 0; 809 } 810 } 811 812 // Text processing routines 813 814 /** 815 * Returns {@code true} if a character sequence contains at least one matching character. 816 * Equivalent to {@code !matchesNoneOf(sequence)}. 817 * 818 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 819 * character, until this returns {@code true} or the end is reached. 820 * 821 * @param sequence the character sequence to examine, possibly empty 822 * @return {@code true} if this matcher matches at least one character in the sequence 823 * @since 8.0 824 */ 825 public boolean matchesAnyOf(CharSequence sequence) { 826 return !matchesNoneOf(sequence); 827 } 828 829 /** 830 * Returns {@code true} if a character sequence contains only matching characters. 831 * 832 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 833 * character, until this returns {@code false} or the end is reached. 834 * 835 * @param sequence the character sequence to examine, possibly empty 836 * @return {@code true} if this matcher matches every character in the sequence, including when 837 * the sequence is empty 838 */ 839 public boolean matchesAllOf(CharSequence sequence) { 840 for (int i = sequence.length() - 1; i >= 0; i--) { 841 if (!matches(sequence.charAt(i))) { 842 return false; 843 } 844 } 845 return true; 846 } 847 848 /** 849 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 850 * {@code !matchesAnyOf(sequence)}. 851 * 852 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 853 * character, until this returns {@code false} or the end is reached. 854 * 855 * @param sequence the character sequence to examine, possibly empty 856 * @return {@code true} if this matcher matches every character in the sequence, including when 857 * the sequence is empty 858 */ 859 public boolean matchesNoneOf(CharSequence sequence) { 860 return indexIn(sequence) == -1; 861 } 862 863 /** 864 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 865 * matching character is present. 866 * 867 * <p>The default implementation iterates over the sequence in forward order calling {@link 868 * #matches} for each character. 869 * 870 * @param sequence the character sequence to examine from the beginning 871 * @return an index, or {@code -1} if no character matches 872 */ 873 public int indexIn(CharSequence sequence) { 874 int length = sequence.length(); 875 for (int i = 0; i < length; i++) { 876 if (matches(sequence.charAt(i))) { 877 return i; 878 } 879 } 880 return -1; 881 } 882 883 /** 884 * Returns the index of the first matching character in a character sequence, starting from a 885 * given position, or {@code -1} if no character matches after that position. 886 * 887 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 888 * start}, calling {@link #matches} for each character. 889 * 890 * @param sequence the character sequence to examine 891 * @param start the first index to examine; must be nonnegative and no greater than {@code 892 * sequence.length()} 893 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 894 * or {@code -1} if no character matches 895 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 896 * sequence.length()} 897 */ 898 public int indexIn(CharSequence sequence, int start) { 899 int length = sequence.length(); 900 Preconditions.checkPositionIndex(start, length); 901 for (int i = start; i < length; i++) { 902 if (matches(sequence.charAt(i))) { 903 return i; 904 } 905 } 906 return -1; 907 } 908 909 /** 910 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 911 * matching character is present. 912 * 913 * <p>The default implementation iterates over the sequence in reverse order calling {@link 914 * #matches} for each character. 915 * 916 * @param sequence the character sequence to examine from the end 917 * @return an index, or {@code -1} if no character matches 918 */ 919 public int lastIndexIn(CharSequence sequence) { 920 for (int i = sequence.length() - 1; i >= 0; i--) { 921 if (matches(sequence.charAt(i))) { 922 return i; 923 } 924 } 925 return -1; 926 } 927 928 /** 929 * Returns the number of matching characters found in a character sequence. 930 */ 931 public int countIn(CharSequence sequence) { 932 int count = 0; 933 for (int i = 0; i < sequence.length(); i++) { 934 if (matches(sequence.charAt(i))) { 935 count++; 936 } 937 } 938 return count; 939 } 940 941 /** 942 * Returns a string containing all non-matching characters of a character sequence, in order. For 943 * example: <pre> {@code 944 * 945 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 946 * 947 * ... returns {@code "bzr"}. 948 */ 949 @CheckReturnValue 950 public String removeFrom(CharSequence sequence) { 951 String string = sequence.toString(); 952 int pos = indexIn(string); 953 if (pos == -1) { 954 return string; 955 } 956 957 char[] chars = string.toCharArray(); 958 int spread = 1; 959 960 // This unusual loop comes from extensive benchmarking 961 OUT: while (true) { 962 pos++; 963 while (true) { 964 if (pos == chars.length) { 965 break OUT; 966 } 967 if (matches(chars[pos])) { 968 break; 969 } 970 chars[pos - spread] = chars[pos]; 971 pos++; 972 } 973 spread++; 974 } 975 return new String(chars, 0, pos - spread); 976 } 977 978 /** 979 * Returns a string containing all matching characters of a character sequence, in order. For 980 * example: <pre> {@code 981 * 982 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 983 * 984 * ... returns {@code "aaa"}. 985 */ 986 @CheckReturnValue 987 public String retainFrom(CharSequence sequence) { 988 return negate().removeFrom(sequence); 989 } 990 991 /** 992 * Returns a string copy of the input character sequence, with each character that matches this 993 * matcher replaced by a given replacement character. For example: <pre> {@code 994 * 995 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 996 * 997 * ... returns {@code "rodor"}. 998 * 999 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1000 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1001 * character. 1002 * 1003 * @param sequence the character sequence to replace matching characters in 1004 * @param replacement the character to append to the result string in place of each matching 1005 * character in {@code sequence} 1006 * @return the new string 1007 */ 1008 @CheckReturnValue 1009 public String replaceFrom(CharSequence sequence, char replacement) { 1010 String string = sequence.toString(); 1011 int pos = indexIn(string); 1012 if (pos == -1) { 1013 return string; 1014 } 1015 char[] chars = string.toCharArray(); 1016 chars[pos] = replacement; 1017 for (int i = pos + 1; i < chars.length; i++) { 1018 if (matches(chars[i])) { 1019 chars[i] = replacement; 1020 } 1021 } 1022 return new String(chars); 1023 } 1024 1025 /** 1026 * Returns a string copy of the input character sequence, with each character that matches this 1027 * matcher replaced by a given replacement sequence. For example: <pre> {@code 1028 * 1029 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 1030 * 1031 * ... returns {@code "yoohoo"}. 1032 * 1033 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 1034 * off calling {@link #replaceFrom(CharSequence, char)} directly. 1035 * 1036 * @param sequence the character sequence to replace matching characters in 1037 * @param replacement the characters to append to the result string in place of each matching 1038 * character in {@code sequence} 1039 * @return the new string 1040 */ 1041 @CheckReturnValue 1042 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1043 int replacementLen = replacement.length(); 1044 if (replacementLen == 0) { 1045 return removeFrom(sequence); 1046 } 1047 if (replacementLen == 1) { 1048 return replaceFrom(sequence, replacement.charAt(0)); 1049 } 1050 1051 String string = sequence.toString(); 1052 int pos = indexIn(string); 1053 if (pos == -1) { 1054 return string; 1055 } 1056 1057 int len = string.length(); 1058 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 1059 1060 int oldpos = 0; 1061 do { 1062 buf.append(string, oldpos, pos); 1063 buf.append(replacement); 1064 oldpos = pos + 1; 1065 pos = indexIn(string, oldpos); 1066 } while (pos != -1); 1067 1068 buf.append(string, oldpos, len); 1069 return buf.toString(); 1070 } 1071 1072 /** 1073 * Returns a substring of the input character sequence that omits all characters this matcher 1074 * matches from the beginning and from the end of the string. For example: <pre> {@code 1075 * 1076 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 1077 * 1078 * ... returns {@code "cat"}. 1079 * 1080 * <p>Note that: <pre> {@code 1081 * 1082 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1083 * 1084 * ... is equivalent to {@link String#trim()}. 1085 */ 1086 @CheckReturnValue 1087 public String trimFrom(CharSequence sequence) { 1088 int len = sequence.length(); 1089 int first; 1090 int last; 1091 1092 for (first = 0; first < len; first++) { 1093 if (!matches(sequence.charAt(first))) { 1094 break; 1095 } 1096 } 1097 for (last = len - 1; last > first; last--) { 1098 if (!matches(sequence.charAt(last))) { 1099 break; 1100 } 1101 } 1102 1103 return sequence.subSequence(first, last + 1).toString(); 1104 } 1105 1106 /** 1107 * Returns a substring of the input character sequence that omits all characters this matcher 1108 * matches from the beginning of the string. For example: <pre> {@code 1109 * 1110 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1111 * 1112 * ... returns {@code "catbab"}. 1113 */ 1114 @CheckReturnValue 1115 public String trimLeadingFrom(CharSequence sequence) { 1116 int len = sequence.length(); 1117 int first; 1118 1119 for (first = 0; first < len; first++) { 1120 if (!matches(sequence.charAt(first))) { 1121 break; 1122 } 1123 } 1124 1125 return sequence.subSequence(first, len).toString(); 1126 } 1127 1128 /** 1129 * Returns a substring of the input character sequence that omits all characters this matcher 1130 * matches from the end of the string. For example: <pre> {@code 1131 * 1132 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1133 * 1134 * ... returns {@code "abacat"}. 1135 */ 1136 @CheckReturnValue 1137 public String trimTrailingFrom(CharSequence sequence) { 1138 int len = sequence.length(); 1139 int last; 1140 1141 for (last = len - 1; last >= 0; last--) { 1142 if (!matches(sequence.charAt(last))) { 1143 break; 1144 } 1145 } 1146 1147 return sequence.subSequence(0, last + 1).toString(); 1148 } 1149 1150 /** 1151 * Returns a string copy of the input character sequence, with each group of consecutive 1152 * characters that match this matcher replaced by a single replacement character. For example: 1153 * <pre> {@code 1154 * 1155 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1156 * 1157 * ... returns {@code "b-p-r"}. 1158 * 1159 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1160 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1161 * character. 1162 * 1163 * @param sequence the character sequence to replace matching groups of characters in 1164 * @param replacement the character to append to the result string in place of each group of 1165 * matching characters in {@code sequence} 1166 * @return the new string 1167 */ 1168 @CheckReturnValue 1169 public String collapseFrom(CharSequence sequence, char replacement) { 1170 int first = indexIn(sequence); 1171 if (first == -1) { 1172 return sequence.toString(); 1173 } 1174 1175 // TODO(kevinb): see if this implementation can be made faster 1176 StringBuilder builder = new StringBuilder(sequence.length()) 1177 .append(sequence.subSequence(0, first)) 1178 .append(replacement); 1179 boolean in = true; 1180 for (int i = first + 1; i < sequence.length(); i++) { 1181 char c = sequence.charAt(i); 1182 if (matches(c)) { 1183 if (!in) { 1184 builder.append(replacement); 1185 in = true; 1186 } 1187 } else { 1188 builder.append(c); 1189 in = false; 1190 } 1191 } 1192 return builder.toString(); 1193 } 1194 1195 /** 1196 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1197 * groups of matching characters at the start or end of the sequence are removed without 1198 * replacement. 1199 */ 1200 @CheckReturnValue 1201 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1202 int first = negate().indexIn(sequence); 1203 if (first == -1) { 1204 return ""; // everything matches. nothing's left. 1205 } 1206 StringBuilder builder = new StringBuilder(sequence.length()); 1207 boolean inMatchingGroup = false; 1208 for (int i = first; i < sequence.length(); i++) { 1209 char c = sequence.charAt(i); 1210 if (matches(c)) { 1211 inMatchingGroup = true; 1212 } else { 1213 if (inMatchingGroup) { 1214 builder.append(replacement); 1215 inMatchingGroup = false; 1216 } 1217 builder.append(c); 1218 } 1219 } 1220 return builder.toString(); 1221 } 1222 1223 // Predicate interface 1224 1225 /** 1226 * Returns {@code true} if this matcher matches the given character. 1227 * 1228 * @throws NullPointerException if {@code character} is null 1229 */ 1230 @Override public boolean apply(Character character) { 1231 return matches(character); 1232 } 1233 1234 /** 1235 * Returns a string representation of this {@code CharMatcher}, such as 1236 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1237 */ 1238 @Override 1239 public String toString() { 1240 return super.toString(); 1241 } 1242 1243 /** 1244 * Determines whether a character is whitespace according to the latest Unicode standard, as 1245 * illustrated 1246 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 1247 * This is not the same definition used by other Java APIs. (See a 1248 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 1249 * definitions of "whitespace"</a>.) 1250 * 1251 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 1252 * to date. 1253 */ 1254 public static final CharMatcher WHITESPACE = new CharMatcher() { 1255 /** 1256 * A special-case CharMatcher for Unicode whitespace characters that is extremely 1257 * efficient both in space required and in time to check for matches. 1258 * 1259 * Implementation details. 1260 * It turns out that all current (early 2012) Unicode characters are unique modulo 79: 1261 * so we can construct a lookup table of exactly 79 entries, and just check the character code 1262 * mod 79, and see if that character is in the table. 1263 * 1264 * There is a 1 at the beginning of the table so that the null character is not listed 1265 * as whitespace. 1266 * 1267 * Other things we tried that did not prove to be beneficial, mostly due to speed concerns: 1268 * 1269 * * Binary search into the sorted list of characters, i.e., what 1270 * CharMatcher.anyOf() does</li> 1271 * * Perfect hash function into a table of size 26 (using an offset table and a special 1272 * Jenkins hash function)</li> 1273 * * Perfect-ish hash function that required two lookups into a single table of size 26.</li> 1274 * * Using a power-of-2 sized hash table (size 64) with linear probing.</li> 1275 * 1276 * --Christopher Swenson, February 2012. 1277 */ 1278 1279 // Mod-79 lookup table. 1280 private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0, 1281 8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1282 12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 1283 8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0}; 1284 1285 @Override public boolean matches(char c) { 1286 return table[c % 79] == c; 1287 } 1288 1289 @Override public CharMatcher precomputed() { 1290 return this; 1291 } 1292 1293 @Override public String toString() { 1294 return "CharMatcher.WHITESPACE"; 1295 } 1296 }; 1297 }