001 /* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 022 import com.google.common.annotations.Beta; 023 import com.google.common.annotations.GwtCompatible; 024 025 import java.util.Arrays; 026 import javax.annotation.CheckReturnValue; 027 028 /** 029 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 030 * for any {@link Object}. Also offers basic text processing methods based on this function. 031 * Implementations are strongly encouraged to be side-effect-free and immutable. 032 * 033 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 034 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 035 * 036 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 037 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 038 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 039 * treats these just as two separate characters. 
*
 * <p>Example usages: <pre>
 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
 *
 * <p>See the Guava User Guide article on <a href=
 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
 * {@code CharMatcher}</a>.
 *
 * @author Kevin Bourrillion
 * @since 1.0
 */
@Beta // Possibly change from chars to code points; decide constants vs. methods
@GwtCompatible
public abstract class CharMatcher implements Predicate<Character> {
  // Constants
  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
   * discussion of that term.
   *
   * <p>The matcher is the union of an explicit character list and the ranges U+2000..U+2006 and
   * U+2008..U+200A; U+2007 falls between the two ranges and is therefore not matched. The result
   * is renamed via {@code withToString} and then precomputed.
   *
   * @since 2.0
   */
  public static final CharMatcher BREAKING_WHITESPACE =
      anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
          .or(inRange('\u2000', '\u2006'))
          .or(inRange('\u2008', '\u200a'))
          .withToString("CharMatcher.BREAKING_WHITESPACE")
          .precomputed();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   */
  // Uses the package-private inRange overload that takes an explicit description, so the
  // matcher is created directly with this constant's name as its description.
  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");

  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
078 */ 079 public static final CharMatcher DIGIT; 080 081 static { 082 CharMatcher digit = inRange('0', '9'); 083 String zeroes = 084 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 085 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 086 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 087 for (char base : zeroes.toCharArray()) { 088 digit = digit.or(inRange(base, (char) (base + 9))); 089 } 090 DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed(); 091 } 092 093 /** 094 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 095 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 096 */ 097 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") { 098 @Override public boolean matches(char c) { 099 return Character.isDigit(c); 100 } 101 }; 102 103 /** 104 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 105 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 106 * inRange('a', 'z').or(inRange('A', 'Z'))}. 107 */ 108 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") { 109 @Override public boolean matches(char c) { 110 return Character.isLetter(c); 111 } 112 113 @Override public CharMatcher precomputed() { 114 return this; 115 } 116 }; 117 118 /** 119 * Determines whether a character is a letter or digit according to {@link 120 * Character#isLetterOrDigit(char) Java's definition}. 121 */ 122 public static final CharMatcher JAVA_LETTER_OR_DIGIT = 123 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") { 124 @Override public boolean matches(char c) { 125 return Character.isLetterOrDigit(c); 126 } 127 }; 128 129 /** 130 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 131 * Java's definition}. 
*/
  public static final CharMatcher JAVA_UPPER_CASE =
      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
        // Delegates straight to the JDK's per-char classification.
        @Override public boolean matches(char c) {
          return Character.isUpperCase(c);
        }
      };

  /**
   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
   * Java's definition}.
   */
  public static final CharMatcher JAVA_LOWER_CASE =
      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
        // Delegates straight to the JDK's per-char classification.
        @Override public boolean matches(char c) {
          return Character.isLowerCase(c);
        }
      };

  /**
   * Determines whether a character is an ISO control character as specified by {@link
   * Character#isISOControl(char)}.
   */
  // Written out as the two ranges U+0000..U+001F and U+007F..U+009F rather than delegating to
  // the JDK method; the combined matcher is only renamed, not precomputed.
  public static final CharMatcher JAVA_ISO_CONTROL =
      inRange('\u0000', '\u001f')
          .or(inRange('\u007f', '\u009f'))
          .withToString("CharMatcher.JAVA_ISO_CONTROL");

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   */
  // Hard-coded union of ranges and individual characters; renamed and then precomputed once
  // while the class is being initialized.
  public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
      .or(inRange('\u007f', '\u00a0'))
      .or(is('\u00ad'))
      .or(inRange('\u0600', '\u0604'))
      .or(anyOf("\u06dd\u070f\u1680\u180e"))
      .or(inRange('\u2000', '\u200f'))
      .or(inRange('\u2028', '\u202f'))
      .or(inRange('\u205f', '\u2064'))
      .or(inRange('\u206a', '\u206f'))
      .or(is('\u3000'))
      .or(inRange('\ud800', '\uf8ff'))
      .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
      .withToString("CharMatcher.INVISIBLE")
      .precomputed();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
   * date.
187 */ 188 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 189 .or(is('\u05be')) 190 .or(inRange('\u05d0', '\u05ea')) 191 .or(is('\u05f3')) 192 .or(is('\u05f4')) 193 .or(inRange('\u0600', '\u06ff')) 194 .or(inRange('\u0750', '\u077f')) 195 .or(inRange('\u0e00', '\u0e7f')) 196 .or(inRange('\u1e00', '\u20af')) 197 .or(inRange('\u2100', '\u213a')) 198 .or(inRange('\ufb50', '\ufdff')) 199 .or(inRange('\ufe70', '\ufeff')) 200 .or(inRange('\uff61', '\uffdc')) 201 .withToString("CharMatcher.SINGLE_WIDTH") 202 .precomputed(); 203 204 /** Matches any character. */ 205 public static final CharMatcher ANY = 206 new CharMatcher("CharMatcher.ANY") { 207 @Override public boolean matches(char c) { 208 return true; 209 } 210 211 @Override public int indexIn(CharSequence sequence) { 212 return (sequence.length() == 0) ? -1 : 0; 213 } 214 215 @Override public int indexIn(CharSequence sequence, int start) { 216 int length = sequence.length(); 217 Preconditions.checkPositionIndex(start, length); 218 return (start == length) ? 
-1 : start; 219 } 220 221 @Override public int lastIndexIn(CharSequence sequence) { 222 return sequence.length() - 1; 223 } 224 225 @Override public boolean matchesAllOf(CharSequence sequence) { 226 checkNotNull(sequence); 227 return true; 228 } 229 230 @Override public boolean matchesNoneOf(CharSequence sequence) { 231 return sequence.length() == 0; 232 } 233 234 @Override public String removeFrom(CharSequence sequence) { 235 checkNotNull(sequence); 236 return ""; 237 } 238 239 @Override public String replaceFrom(CharSequence sequence, char replacement) { 240 char[] array = new char[sequence.length()]; 241 Arrays.fill(array, replacement); 242 return new String(array); 243 } 244 245 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 246 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 247 for (int i = 0; i < sequence.length(); i++) { 248 retval.append(replacement); 249 } 250 return retval.toString(); 251 } 252 253 @Override public String collapseFrom(CharSequence sequence, char replacement) { 254 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 255 } 256 257 @Override public String trimFrom(CharSequence sequence) { 258 checkNotNull(sequence); 259 return ""; 260 } 261 262 @Override public int countIn(CharSequence sequence) { 263 return sequence.length(); 264 } 265 266 @Override public CharMatcher and(CharMatcher other) { 267 return checkNotNull(other); 268 } 269 270 @Override public CharMatcher or(CharMatcher other) { 271 checkNotNull(other); 272 return this; 273 } 274 275 @Override public CharMatcher negate() { 276 return NONE; 277 } 278 279 @Override public CharMatcher precomputed() { 280 return this; 281 } 282 }; 283 284 /** Matches no characters. 
*/ 285 public static final CharMatcher NONE = 286 new CharMatcher("CharMatcher.NONE") { 287 @Override public boolean matches(char c) { 288 return false; 289 } 290 291 @Override public int indexIn(CharSequence sequence) { 292 checkNotNull(sequence); 293 return -1; 294 } 295 296 @Override public int indexIn(CharSequence sequence, int start) { 297 int length = sequence.length(); 298 Preconditions.checkPositionIndex(start, length); 299 return -1; 300 } 301 302 @Override public int lastIndexIn(CharSequence sequence) { 303 checkNotNull(sequence); 304 return -1; 305 } 306 307 @Override public boolean matchesAllOf(CharSequence sequence) { 308 return sequence.length() == 0; 309 } 310 311 @Override public boolean matchesNoneOf(CharSequence sequence) { 312 checkNotNull(sequence); 313 return true; 314 } 315 316 @Override public String removeFrom(CharSequence sequence) { 317 return sequence.toString(); 318 } 319 320 @Override public String replaceFrom(CharSequence sequence, char replacement) { 321 return sequence.toString(); 322 } 323 324 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 325 checkNotNull(replacement); 326 return sequence.toString(); 327 } 328 329 @Override public String collapseFrom(CharSequence sequence, char replacement) { 330 return sequence.toString(); 331 } 332 333 @Override public String trimFrom(CharSequence sequence) { 334 return sequence.toString(); 335 } 336 337 @Override public int countIn(CharSequence sequence) { 338 checkNotNull(sequence); 339 return 0; 340 } 341 342 @Override public CharMatcher and(CharMatcher other) { 343 checkNotNull(other); 344 return this; 345 } 346 347 @Override public CharMatcher or(CharMatcher other) { 348 return checkNotNull(other); 349 } 350 351 @Override public CharMatcher negate() { 352 return ANY; 353 } 354 355 @Override void setBits(LookupTable table) {} 356 357 @Override public CharMatcher precomputed() { 358 return this; 359 } 360 }; 361 362 // Static factories 363 364 /** 365 * 
Returns a {@code char} matcher that matches only one specified character. 366 */ 367 public static CharMatcher is(final char match) { 368 String description = new StringBuilder("CharMatcher.is(") 369 .append(Integer.toHexString(match)) 370 .append(")") 371 .toString(); 372 return new CharMatcher(description) { 373 @Override public boolean matches(char c) { 374 return c == match; 375 } 376 377 @Override public String replaceFrom(CharSequence sequence, char replacement) { 378 return sequence.toString().replace(match, replacement); 379 } 380 381 @Override public CharMatcher and(CharMatcher other) { 382 return other.matches(match) ? this : NONE; 383 } 384 385 @Override public CharMatcher or(CharMatcher other) { 386 return other.matches(match) ? other : super.or(other); 387 } 388 389 @Override public CharMatcher negate() { 390 return isNot(match); 391 } 392 393 @Override void setBits(LookupTable table) { 394 table.set(match); 395 } 396 397 @Override public CharMatcher precomputed() { 398 return this; 399 } 400 }; 401 } 402 403 /** 404 * Returns a {@code char} matcher that matches any character except the one specified. 405 * 406 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 407 */ 408 public static CharMatcher isNot(final char match) { 409 String description = new StringBuilder("CharMatcher.isNot(") 410 .append(Integer.toHexString(match)) 411 .append(")") 412 .toString(); 413 return new CharMatcher(description) { 414 @Override public boolean matches(char c) { 415 return c != match; 416 } 417 418 @Override public CharMatcher and(CharMatcher other) { 419 return other.matches(match) ? super.and(other) : other; 420 } 421 422 @Override public CharMatcher or(CharMatcher other) { 423 return other.matches(match) ? ANY : this; 424 } 425 426 @Override public CharMatcher negate() { 427 return is(match); 428 } 429 }; 430 } 431 432 /** 433 * Returns a {@code char} matcher that matches any character present in the given character 434 * sequence. 
435 */ 436 public static CharMatcher anyOf(final CharSequence sequence) { 437 switch (sequence.length()) { 438 case 0: 439 return NONE; 440 case 1: 441 return is(sequence.charAt(0)); 442 case 2: 443 final char match1 = sequence.charAt(0); 444 final char match2 = sequence.charAt(1); 445 return new CharMatcher( 446 new StringBuilder("CharMatcher.anyOf(\"").append(sequence).append("\")").toString()) { 447 @Override public boolean matches(char c) { 448 return c == match1 || c == match2; 449 } 450 451 @Override void setBits(LookupTable table) { 452 table.set(match1); 453 table.set(match2); 454 } 455 456 @Override public CharMatcher precomputed() { 457 return this; 458 } 459 }; 460 } 461 final char[] chars = sequence.toString().toCharArray(); 462 Arrays.sort(chars); 463 464 return new CharMatcher(new StringBuilder("CharMatcher.anyOf(\"").append(chars) 465 .append("\")").toString()) { 466 @Override public boolean matches(char c) { 467 return Arrays.binarySearch(chars, c) >= 0; 468 } 469 }; 470 } 471 472 /** 473 * Returns a {@code char} matcher that matches any character not present in the given character 474 * sequence. 475 */ 476 public static CharMatcher noneOf(CharSequence sequence) { 477 return anyOf(sequence).negate(); 478 } 479 480 /** 481 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 482 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 483 * CharMatcher.inRange('a', 'z')}. 
484 * 485 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 486 */ 487 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 488 checkArgument(endInclusive >= startInclusive); 489 String description = new StringBuilder("CharMatcher.inRange(") 490 .append(Integer.toHexString(startInclusive)) 491 .append(", ") 492 .append(Integer.toHexString(endInclusive)) 493 .append(")") 494 .toString(); 495 return inRange(startInclusive, endInclusive, description); 496 } 497 498 static CharMatcher inRange(final char startInclusive, final char endInclusive, 499 String description) { 500 return new CharMatcher(description) { 501 @Override public boolean matches(char c) { 502 return startInclusive <= c && c <= endInclusive; 503 } 504 505 @Override void setBits(LookupTable table) { 506 char c = startInclusive; 507 while (true) { 508 table.set(c); 509 if (c++ == endInclusive) { 510 break; 511 } 512 } 513 } 514 515 @Override public CharMatcher precomputed() { 516 return this; 517 } 518 }; 519 } 520 521 /** 522 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 523 * which operates on primitive {@code char} instances instead. 524 */ 525 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 526 checkNotNull(predicate); 527 if (predicate instanceof CharMatcher) { 528 return (CharMatcher) predicate; 529 } 530 String description = new StringBuilder("CharMatcher.forPredicate(") 531 .append(predicate) 532 .append(')') 533 .toString(); 534 return new CharMatcher(description) { 535 @Override public boolean matches(char c) { 536 return predicate.apply(c); 537 } 538 539 @Override public boolean apply(Character character) { 540 return predicate.apply(checkNotNull(character)); 541 } 542 }; 543 } 544 545 // State 546 final String description; 547 548 // Constructors 549 550 /** 551 * Sets the {@code toString()} from the given description. 
552 */ 553 CharMatcher(String description) { 554 this.description = description; 555 } 556 557 /** 558 * Constructor for use by subclasses. When subclassing, you may want to override 559 * {@code toString()} to provide a useful description. 560 */ 561 protected CharMatcher() { 562 description = "UnknownCharMatcher"; 563 } 564 565 // Abstract methods 566 567 /** Determines a true or false value for the given character. */ 568 public abstract boolean matches(char c); 569 570 // Non-static factories 571 572 /** 573 * Returns a matcher that matches any character not matched by this matcher. 574 */ 575 public CharMatcher negate() { 576 final CharMatcher original = this; 577 return new CharMatcher(original + ".negate()") { 578 @Override public boolean matches(char c) { 579 return !original.matches(c); 580 } 581 582 @Override public boolean matchesAllOf(CharSequence sequence) { 583 return original.matchesNoneOf(sequence); 584 } 585 586 @Override public boolean matchesNoneOf(CharSequence sequence) { 587 return original.matchesAllOf(sequence); 588 } 589 590 @Override public int countIn(CharSequence sequence) { 591 return sequence.length() - original.countIn(sequence); 592 } 593 594 @Override public CharMatcher negate() { 595 return original; 596 } 597 }; 598 } 599 600 /** 601 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 
602 */ 603 public CharMatcher and(CharMatcher other) { 604 return new And(this, checkNotNull(other)); 605 } 606 607 private static class And extends CharMatcher { 608 final CharMatcher first; 609 final CharMatcher second; 610 611 And(CharMatcher a, CharMatcher b) { 612 this(a, b, "CharMatcher.and(" + a + ", " + b + ")"); 613 } 614 615 And(CharMatcher a, CharMatcher b, String description) { 616 super(description); 617 first = checkNotNull(a); 618 second = checkNotNull(b); 619 } 620 621 @Override 622 public CharMatcher and(CharMatcher other) { 623 return new And(this, other); 624 } 625 626 @Override 627 public boolean matches(char c) { 628 return first.matches(c) && second.matches(c); 629 } 630 631 @Override 632 CharMatcher withToString(String description) { 633 return new And(first, second, description); 634 } 635 } 636 637 /** 638 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 639 */ 640 public CharMatcher or(CharMatcher other) { 641 return new Or(this, checkNotNull(other)); 642 } 643 644 private static class Or extends CharMatcher { 645 final CharMatcher first; 646 final CharMatcher second; 647 648 Or(CharMatcher a, CharMatcher b, String description) { 649 super(description); 650 first = checkNotNull(a); 651 second = checkNotNull(b); 652 } 653 654 Or(CharMatcher a, CharMatcher b) { 655 this(a, b, "CharMatcher.or(" + a + ", " + b + ")"); 656 } 657 658 @Override 659 public CharMatcher or(CharMatcher other) { 660 return new Or(this, checkNotNull(other)); 661 } 662 663 @Override 664 public boolean matches(char c) { 665 return first.matches(c) || second.matches(c); 666 } 667 668 @Override 669 CharMatcher withToString(String description) { 670 return new Or(first, second, description); 671 } 672 } 673 674 /** 675 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 676 * query than the original; your mileage may vary. 
Precomputation takes time and is likely to be 677 * worthwhile only if the precomputed matcher is queried many thousands of times. 678 * 679 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 680 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 681 * worthwhile tradeoff in a browser. 682 */ 683 public CharMatcher precomputed() { 684 return Platform.precomputeCharMatcher(this); 685 } 686 687 /** 688 * Construct an array of all possible chars in the slowest way possible. 689 */ 690 char[] slowGetChars() { 691 char[] allChars = new char[65536]; 692 int size = 0; 693 for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) { 694 if (matches((char) c)) { 695 allChars[size++] = (char) c; 696 } 697 } 698 char[] retValue = new char[size]; 699 System.arraycopy(allChars, 0, retValue, 0, size); 700 return retValue; 701 } 702 703 /** 704 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 705 * on {@link Platform} so that we can have different behavior in GWT. 706 * 707 * <p>If the number of matched characters is small enough, we try to build a small hash 708 * table to contain all of the characters. Otherwise, we record the characters in eight-kilobyte 709 * bit array. In many situations this produces a matcher which is faster to query 710 * than the original. 711 */ 712 CharMatcher precomputedInternal() { 713 final char[] chars = slowGetChars(); 714 int totalCharacters = chars.length; 715 if (totalCharacters == 0) { 716 return NONE; 717 } else if (totalCharacters == 1) { 718 return is(chars[0]); 719 } else if (totalCharacters < SmallCharMatcher.MAX_SIZE) { 720 return SmallCharMatcher.from(chars, toString()); 721 } else if (totalCharacters < MediumCharMatcher.MAX_SIZE) { 722 return MediumCharMatcher.from(chars, toString()); 723 } 724 // Otherwise, make the full lookup table. 
725 final LookupTable table = new LookupTable(); 726 setBits(table); 727 final CharMatcher outer = this; 728 729 return new CharMatcher(outer.toString()) { 730 @Override public boolean matches(char c) { 731 return table.get(c); 732 } 733 734 // TODO(kevinb): make methods like negate() smart? 735 736 @Override public CharMatcher precomputed() { 737 return this; 738 } 739 }; 740 } 741 742 /** 743 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this}, 744 * but with their {@code toString} method overridden with the new description. 745 * 746 * <p>This is unsupported by default. 747 */ 748 CharMatcher withToString(String description) { 749 throw new UnsupportedOperationException(); 750 751 } 752 753 /** 754 * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal 755 * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched. 756 * 757 * <p>The default implementation loops over every possible character value, invoking {@link 758 * #matches} for each one. 759 */ 760 void setBits(LookupTable table) { 761 char c = Character.MIN_VALUE; 762 while (true) { 763 if (matches(c)) { 764 table.set(c); 765 } 766 if (c++ == Character.MAX_VALUE) { 767 break; 768 } 769 } 770 } 771 772 /** 773 * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}. 774 * 775 * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a 776 * simpler java.util.BitSet. 777 */ 778 private static final class LookupTable { 779 int[] data = new int[2048]; 780 781 void set(char index) { 782 data[index >> 5] |= (1 << index); 783 } 784 785 boolean get(char index) { 786 return (data[index >> 5] & (1 << index)) != 0; 787 } 788 } 789 790 // Text processing routines 791 792 /** 793 * Returns {@code true} if a character sequence contains at least one matching character. 794 * Equivalent to {@code !matchesNoneOf(sequence)}. 
795 * 796 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 797 * character, until this returns {@code true} or the end is reached. 798 * 799 * @param sequence the character sequence to examine, possibly empty 800 * @return {@code true} if this matcher matches at least one character in the sequence 801 * @since 8.0 802 */ 803 public boolean matchesAnyOf(CharSequence sequence) { 804 return !matchesNoneOf(sequence); 805 } 806 807 /** 808 * Returns {@code true} if a character sequence contains only matching characters. 809 * 810 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 811 * character, until this returns {@code false} or the end is reached. 812 * 813 * @param sequence the character sequence to examine, possibly empty 814 * @return {@code true} if this matcher matches every character in the sequence, including when 815 * the sequence is empty 816 */ 817 public boolean matchesAllOf(CharSequence sequence) { 818 for (int i = sequence.length() - 1; i >= 0; i--) { 819 if (!matches(sequence.charAt(i))) { 820 return false; 821 } 822 } 823 return true; 824 } 825 826 /** 827 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 828 * {@code !matchesAnyOf(sequence)}. 829 * 830 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 831 * character, until this returns {@code false} or the end is reached. 832 * 833 * @param sequence the character sequence to examine, possibly empty 834 * @return {@code true} if this matcher matches every character in the sequence, including when 835 * the sequence is empty 836 */ 837 public boolean matchesNoneOf(CharSequence sequence) { 838 return indexIn(sequence) == -1; 839 } 840 841 /** 842 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 843 * matching character is present. 
844 * 845 * <p>The default implementation iterates over the sequence in forward order calling {@link 846 * #matches} for each character. 847 * 848 * @param sequence the character sequence to examine from the beginning 849 * @return an index, or {@code -1} if no character matches 850 */ 851 public int indexIn(CharSequence sequence) { 852 int length = sequence.length(); 853 for (int i = 0; i < length; i++) { 854 if (matches(sequence.charAt(i))) { 855 return i; 856 } 857 } 858 return -1; 859 } 860 861 /** 862 * Returns the index of the first matching character in a character sequence, starting from a 863 * given position, or {@code -1} if no character matches after that position. 864 * 865 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 866 * start}, calling {@link #matches} for each character. 867 * 868 * @param sequence the character sequence to examine 869 * @param start the first index to examine; must be nonnegative and no greater than {@code 870 * sequence.length()} 871 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 872 * or {@code -1} if no character matches 873 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 874 * sequence.length()} 875 */ 876 public int indexIn(CharSequence sequence, int start) { 877 int length = sequence.length(); 878 Preconditions.checkPositionIndex(start, length); 879 for (int i = start; i < length; i++) { 880 if (matches(sequence.charAt(i))) { 881 return i; 882 } 883 } 884 return -1; 885 } 886 887 /** 888 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 889 * matching character is present. 890 * 891 * <p>The default implementation iterates over the sequence in reverse order calling {@link 892 * #matches} for each character. 
*
   * @param sequence the character sequence to examine from the end
   * @return an index, or {@code -1} if no character matches
   */
  public int lastIndexIn(CharSequence sequence) {
    for (int i = sequence.length() - 1; i >= 0; i--) {
      if (matches(sequence.charAt(i))) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Returns the number of matching characters found in a character sequence.
   *
   * <p>The default implementation calls {@link #matches} once for each character, in forward
   * order.
   *
   * @param sequence the character sequence to count matching characters in
   * @return the number of characters in {@code sequence} that this matcher matches
   */
  public int countIn(CharSequence sequence) {
    int count = 0;
    for (int i = 0; i < sequence.length(); i++) {
      if (matches(sequence.charAt(i))) {
        count++;
      }
    }
    return count;
  }

  /**
   * Returns a string containing all non-matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
   *
   * ... returns {@code "bzr"}.
   *
   * <p>If no character matches, the result is simply {@code sequence.toString()}.
   *
   * @param sequence the character sequence to remove matching characters from
   * @return the characters of {@code sequence} that this matcher does not match, in order
   */
  @CheckReturnValue
  public String removeFrom(CharSequence sequence) {
    String string = sequence.toString();
    // Fast path: with no match at all, no copy of the characters is needed.
    int pos = indexIn(string);
    if (pos == -1) {
      return string;
    }

    // Compact the array in place. pos is the read cursor; spread is the number of matching
    // characters seen so far, which is how far each kept character has to move to the left.
    char[] chars = string.toCharArray();
    int spread = 1;

    // This unusual loop comes from extensive benchmarking
    OUT: while (true) {
      // Step over the matching character that ended the previous inner loop (or the first match).
      pos++;
      while (true) {
        if (pos == chars.length) {
          break OUT;
        }
        if (matches(chars[pos])) {
          break;
        }
        chars[pos - spread] = chars[pos];
        pos++;
      }
      spread++;
    }
    // pos equals chars.length here, so the kept prefix has length minus match-count characters.
    return new String(chars, 0, pos - spread);
  }

  /**
   * Returns a string containing all matching characters of a character sequence, in order. For
   * example: <pre>   {@code
   *
   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
   *
   * ... returns {@code "aaa"}.
963 */ 964 @CheckReturnValue 965 public String retainFrom(CharSequence sequence) { 966 return negate().removeFrom(sequence); 967 } 968 969 /** 970 * Returns a string copy of the input character sequence, with each character that matches this 971 * matcher replaced by a given replacement character. For example: <pre> {@code 972 * 973 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 974 * 975 * ... returns {@code "rodor"}. 976 * 977 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 978 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 979 * character. 980 * 981 * @param sequence the character sequence to replace matching characters in 982 * @param replacement the character to append to the result string in place of each matching 983 * character in {@code sequence} 984 * @return the new string 985 */ 986 @CheckReturnValue 987 public String replaceFrom(CharSequence sequence, char replacement) { 988 String string = sequence.toString(); 989 int pos = indexIn(string); 990 if (pos == -1) { 991 return string; 992 } 993 char[] chars = string.toCharArray(); 994 chars[pos] = replacement; 995 for (int i = pos + 1; i < chars.length; i++) { 996 if (matches(chars[i])) { 997 chars[i] = replacement; 998 } 999 } 1000 return new String(chars); 1001 } 1002 1003 /** 1004 * Returns a string copy of the input character sequence, with each character that matches this 1005 * matcher replaced by a given replacement sequence. For example: <pre> {@code 1006 * 1007 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 1008 * 1009 * ... returns {@code "yoohoo"}. 1010 * 1011 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 1012 * off calling {@link #replaceFrom(CharSequence, char)} directly. 
1013 * 1014 * @param sequence the character sequence to replace matching characters in 1015 * @param replacement the characters to append to the result string in place of each matching 1016 * character in {@code sequence} 1017 * @return the new string 1018 */ 1019 @CheckReturnValue 1020 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1021 int replacementLen = replacement.length(); 1022 if (replacementLen == 0) { 1023 return removeFrom(sequence); 1024 } 1025 if (replacementLen == 1) { 1026 return replaceFrom(sequence, replacement.charAt(0)); 1027 } 1028 1029 String string = sequence.toString(); 1030 int pos = indexIn(string); 1031 if (pos == -1) { 1032 return string; 1033 } 1034 1035 int len = string.length(); 1036 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 1037 1038 int oldpos = 0; 1039 do { 1040 buf.append(string, oldpos, pos); 1041 buf.append(replacement); 1042 oldpos = pos + 1; 1043 pos = indexIn(string, oldpos); 1044 } while (pos != -1); 1045 1046 buf.append(string, oldpos, len); 1047 return buf.toString(); 1048 } 1049 1050 /** 1051 * Returns a substring of the input character sequence that omits all characters this matcher 1052 * matches from the beginning and from the end of the string. For example: <pre> {@code 1053 * 1054 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 1055 * 1056 * ... returns {@code "cat"}. 1057 * 1058 * <p>Note that: <pre> {@code 1059 * 1060 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1061 * 1062 * ... is equivalent to {@link String#trim()}. 
1063 */ 1064 @CheckReturnValue 1065 public String trimFrom(CharSequence sequence) { 1066 int len = sequence.length(); 1067 int first; 1068 int last; 1069 1070 for (first = 0; first < len; first++) { 1071 if (!matches(sequence.charAt(first))) { 1072 break; 1073 } 1074 } 1075 for (last = len - 1; last > first; last--) { 1076 if (!matches(sequence.charAt(last))) { 1077 break; 1078 } 1079 } 1080 1081 return sequence.subSequence(first, last + 1).toString(); 1082 } 1083 1084 /** 1085 * Returns a substring of the input character sequence that omits all characters this matcher 1086 * matches from the beginning of the string. For example: <pre> {@code 1087 * 1088 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1089 * 1090 * ... returns {@code "catbab"}. 1091 */ 1092 @CheckReturnValue 1093 public String trimLeadingFrom(CharSequence sequence) { 1094 int len = sequence.length(); 1095 int first; 1096 1097 for (first = 0; first < len; first++) { 1098 if (!matches(sequence.charAt(first))) { 1099 break; 1100 } 1101 } 1102 1103 return sequence.subSequence(first, len).toString(); 1104 } 1105 1106 /** 1107 * Returns a substring of the input character sequence that omits all characters this matcher 1108 * matches from the end of the string. For example: <pre> {@code 1109 * 1110 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1111 * 1112 * ... returns {@code "abacat"}. 1113 */ 1114 @CheckReturnValue 1115 public String trimTrailingFrom(CharSequence sequence) { 1116 int len = sequence.length(); 1117 int last; 1118 1119 for (last = len - 1; last >= 0; last--) { 1120 if (!matches(sequence.charAt(last))) { 1121 break; 1122 } 1123 } 1124 1125 return sequence.subSequence(0, last + 1).toString(); 1126 } 1127 1128 /** 1129 * Returns a string copy of the input character sequence, with each group of consecutive 1130 * characters that match this matcher replaced by a single replacement character. 
For example: 1131 * <pre> {@code 1132 * 1133 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1134 * 1135 * ... returns {@code "b-p-r"}. 1136 * 1137 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1138 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1139 * character. 1140 * 1141 * @param sequence the character sequence to replace matching groups of characters in 1142 * @param replacement the character to append to the result string in place of each group of 1143 * matching characters in {@code sequence} 1144 * @return the new string 1145 */ 1146 @CheckReturnValue 1147 public String collapseFrom(CharSequence sequence, char replacement) { 1148 int first = indexIn(sequence); 1149 if (first == -1) { 1150 return sequence.toString(); 1151 } 1152 1153 // TODO(kevinb): see if this implementation can be made faster 1154 StringBuilder builder = new StringBuilder(sequence.length()) 1155 .append(sequence.subSequence(0, first)) 1156 .append(replacement); 1157 boolean in = true; 1158 for (int i = first + 1; i < sequence.length(); i++) { 1159 char c = sequence.charAt(i); 1160 if (matches(c)) { 1161 if (!in) { 1162 builder.append(replacement); 1163 in = true; 1164 } 1165 } else { 1166 builder.append(c); 1167 in = false; 1168 } 1169 } 1170 return builder.toString(); 1171 } 1172 1173 /** 1174 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1175 * groups of matching characters at the start or end of the sequence are removed without 1176 * replacement. 1177 */ 1178 @CheckReturnValue 1179 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1180 int first = negate().indexIn(sequence); 1181 if (first == -1) { 1182 return ""; // everything matches. nothing's left. 
1183 } 1184 StringBuilder builder = new StringBuilder(sequence.length()); 1185 boolean inMatchingGroup = false; 1186 for (int i = first; i < sequence.length(); i++) { 1187 char c = sequence.charAt(i); 1188 if (matches(c)) { 1189 inMatchingGroup = true; 1190 } else { 1191 if (inMatchingGroup) { 1192 builder.append(replacement); 1193 inMatchingGroup = false; 1194 } 1195 builder.append(c); 1196 } 1197 } 1198 return builder.toString(); 1199 } 1200 1201 // Predicate interface 1202 1203 /** 1204 * Returns {@code true} if this matcher matches the given character. 1205 * 1206 * @throws NullPointerException if {@code character} is null 1207 */ 1208 @Override public boolean apply(Character character) { 1209 return matches(character); 1210 } 1211 1212 /** 1213 * Returns a string representation of this {@code CharMatcher}, such as 1214 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1215 */ 1216 @Override 1217 public String toString() { 1218 return description; 1219 } 1220 1221 /** 1222 * Determines whether a character is whitespace according to the latest Unicode standard, as 1223 * illustrated 1224 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 1225 * This is not the same definition used by other Java APIs. (See a 1226 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 1227 * definitions of "whitespace"</a>.) 1228 * 1229 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 1230 * to date. 1231 */ 1232 public static final CharMatcher WHITESPACE = new CharMatcher("CharMatcher.WHITESPACE") { 1233 /** 1234 * A special-case CharMatcher for Unicode whitespace characters that is extremely 1235 * efficient both in space required and in time to check for matches. 1236 * 1237 * Implementation details. 
1238 * It turns out that all current (early 2012) Unicode characters are unique modulo 79: 1239 * so we can construct a lookup table of exactly 79 entries, and just check the character code 1240 * mod 79, and see if that character is in the table. 1241 * 1242 * There is a 1 at the beginning of the table so that the null character is not listed 1243 * as whitespace. 1244 * 1245 * Other things we tried that did not prove to be beneficial, mostly due to speed concerns: 1246 * 1247 * * Binary search into the sorted list of characters, i.e., what 1248 * CharMatcher.anyOf() does</li> 1249 * * Perfect hash function into a table of size 26 (using an offset table and a special 1250 * Jenkins hash function)</li> 1251 * * Perfect-ish hash function that required two lookups into a single table of size 26.</li> 1252 * * Using a power-of-2 sized hash table (size 64) with linear probing.</li> 1253 * 1254 * --Christopher Swenson, February 2012. 1255 */ 1256 1257 // Mod-79 lookup table. 1258 private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0, 1259 8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1260 12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 1261 8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0}; 1262 1263 @Override public boolean matches(char c) { 1264 return table[c % 79] == c; 1265 } 1266 1267 @Override public CharMatcher precomputed() { 1268 return this; 1269 } 1270 }; 1271 }