001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.base; 018 019import static com.google.common.base.Preconditions.checkArgument; 020import static com.google.common.base.Preconditions.checkNotNull; 021 022import com.google.common.annotations.Beta; 023import com.google.common.annotations.GwtCompatible; 024import com.google.common.annotations.GwtIncompatible; 025 026import java.util.Arrays; 027import java.util.BitSet; 028 029import javax.annotation.CheckReturnValue; 030 031/** 032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 033 * for any {@link Object}. Also offers basic text processing methods based on this function. 034 * Implementations are strongly encouraged to be side-effect-free and immutable. 035 * 036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 038 * 039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 042 * treats these just as two separate characters. 043 * 044 * <p>Example usages: <pre> 045 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput); 046 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 047 * 048 * <p>See the Guava User Guide article on <a href= 049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher"> 050 * {@code CharMatcher}</a>. 051 * 052 * @author Kevin Bourrillion 053 * @since 1.0 054 */ 055@Beta // Possibly change from chars to code points; decide constants vs. methods 056@GwtCompatible(emulated = true) 057public abstract class CharMatcher implements Predicate<Character> { 058 // Constants 059 /** 060 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 061 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 062 * discussion of that term. 063 * 064 * @since 2.0 065 */ 066 public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() { 067 @Override 068 public boolean matches(char c) { 069 switch (c) { 070 case '\t': 071 case '\n': 072 case '\013': 073 case '\f': 074 case '\r': 075 case ' ': 076 case '\u0085': 077 case '\u1680': 078 case '\u2028': 079 case '\u2029': 080 case '\u205f': 081 case '\u3000': 082 return true; 083 case '\u2007': 084 return false; 085 default: 086 return c >= '\u2000' && c <= '\u200a'; 087 } 088 } 089 090 @Override 091 public String toString() { 092 return "CharMatcher.BREAKING_WHITESPACE"; 093 } 094 }; 095 096 /** 097 * Determines whether a character is ASCII, meaning that its code point is less than 128. 098 */ 099 public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII"); 100 101 private static class RangesMatcher extends CharMatcher { 102 private final char[] rangeStarts; 103 private final char[] rangeEnds; 104 105 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 106 super(description); 107 this.rangeStarts = rangeStarts; 108 this.rangeEnds = rangeEnds; 109 checkArgument(rangeStarts.length == rangeEnds.length); 110 for (int i = 0; i < rangeStarts.length; i++) { 111 checkArgument(rangeStarts[i] <= rangeEnds[i]); 112 if (i + 1 < rangeStarts.length) { 113 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 114 } 115 } 116 } 117 118 @Override 119 public boolean matches(char c) { 120 int index = Arrays.binarySearch(rangeStarts, c); 121 if (index >= 0) { 122 return true; 123 } else { 124 index = ~index - 1; 125 return index >= 0 && c <= rangeEnds[index]; 126 } 127 } 128 } 129 130 // Must be in ascending order. 131 private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6" 132 + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0" 133 + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 134 135 private static final String NINES; 136 static { 137 StringBuilder builder = new StringBuilder(ZEROES.length()); 138 for (int i = 0; i < ZEROES.length(); i++) { 139 builder.append((char) (ZEROES.charAt(i) + 9)); 140 } 141 NINES = builder.toString(); 142 } 143 144 /** 145 * Determines whether a character is a digit according to 146 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 147 */ 148 public static final CharMatcher DIGIT = new RangesMatcher( 149 "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray()); 150 151 /** 152 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 153 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 154 */ 155 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") { 156 @Override public boolean matches(char c) { 157 return Character.isDigit(c); 158 } 159 }; 160 161 /** 162 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 163 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 164 * inRange('a', 'z').or(inRange('A', 'Z'))}. 165 */ 166 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") { 167 @Override public boolean matches(char c) { 168 return Character.isLetter(c); 169 } 170 }; 171 172 /** 173 * Determines whether a character is a letter or digit according to {@link 174 * Character#isLetterOrDigit(char) Java's definition}. 175 */ 176 public static final CharMatcher JAVA_LETTER_OR_DIGIT = 177 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") { 178 @Override public boolean matches(char c) { 179 return Character.isLetterOrDigit(c); 180 } 181 }; 182 183 /** 184 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 185 * Java's definition}. 186 */ 187 public static final CharMatcher JAVA_UPPER_CASE = 188 new CharMatcher("CharMatcher.JAVA_UPPER_CASE") { 189 @Override public boolean matches(char c) { 190 return Character.isUpperCase(c); 191 } 192 }; 193 194 /** 195 * Determines whether a character is lower case according to {@link Character#isLowerCase(char) 196 * Java's definition}. 197 */ 198 public static final CharMatcher JAVA_LOWER_CASE = 199 new CharMatcher("CharMatcher.JAVA_LOWER_CASE") { 200 @Override public boolean matches(char c) { 201 return Character.isLowerCase(c); 202 } 203 }; 204 205 /** 206 * Determines whether a character is an ISO control character as specified by {@link 207 * Character#isISOControl(char)}. 208 */ 209 public static final CharMatcher JAVA_ISO_CONTROL = 210 inRange('\u0000', '\u001f') 211 .or(inRange('\u007f', '\u009f')) 212 .withToString("CharMatcher.JAVA_ISO_CONTROL"); 213 214 /** 215 * Determines whether a character is invisible; that is, if its Unicode category is any of 216 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 217 * PRIVATE_USE according to ICU4J. 218 */ 219 public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", ( 220 "\u0000\u007f\u00ad\u0600\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u206a\u3000\ud800\ufeff" 221 + "\ufff9\ufffa").toCharArray(), ( 222 "\u0020\u00a0\u00ad\u0604\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u206f\u3000\uf8ff\ufeff" 223 + "\ufff9\ufffb").toCharArray()); 224 225 private static String showCharacter(char c) { 226 String hex = "0123456789ABCDEF"; 227 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 228 for (int i = 0; i < 4; i++) { 229 tmp[5 - i] = hex.charAt(c & 0xF); 230 c >>= 4; 231 } 232 return String.copyValueOf(tmp); 233 234 } 235 236 /** 237 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 238 * errs on the side of returning {@code false} (that is, it tends to assume a character is 239 * double-width). 240 * 241 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 242 * date. 243 */ 244 public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH", 245 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 246 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 247 248 /** Matches any character. */ 249 public static final CharMatcher ANY = 250 new FastMatcher("CharMatcher.ANY") { 251 @Override public boolean matches(char c) { 252 return true; 253 } 254 255 @Override public int indexIn(CharSequence sequence) { 256 return (sequence.length() == 0) ? -1 : 0; 257 } 258 259 @Override public int indexIn(CharSequence sequence, int start) { 260 int length = sequence.length(); 261 Preconditions.checkPositionIndex(start, length); 262 return (start == length) ? -1 : start; 263 } 264 265 @Override public int lastIndexIn(CharSequence sequence) { 266 return sequence.length() - 1; 267 } 268 269 @Override public boolean matchesAllOf(CharSequence sequence) { 270 checkNotNull(sequence); 271 return true; 272 } 273 274 @Override public boolean matchesNoneOf(CharSequence sequence) { 275 return sequence.length() == 0; 276 } 277 278 @Override public String removeFrom(CharSequence sequence) { 279 checkNotNull(sequence); 280 return ""; 281 } 282 283 @Override public String replaceFrom(CharSequence sequence, char replacement) { 284 char[] array = new char[sequence.length()]; 285 Arrays.fill(array, replacement); 286 return new String(array); 287 } 288 289 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 290 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 291 for (int i = 0; i < sequence.length(); i++) { 292 retval.append(replacement); 293 } 294 return retval.toString(); 295 } 296 297 @Override public String collapseFrom(CharSequence sequence, char replacement) { 298 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 299 } 300 301 @Override public String trimFrom(CharSequence sequence) { 302 checkNotNull(sequence); 303 return ""; 304 } 305 306 @Override public int countIn(CharSequence sequence) { 307 return sequence.length(); 308 } 309 310 @Override public CharMatcher and(CharMatcher other) { 311 return checkNotNull(other); 312 } 313 314 @Override public CharMatcher or(CharMatcher other) { 315 checkNotNull(other); 316 return this; 317 } 318 319 @Override public CharMatcher negate() { 320 return NONE; 321 } 322 }; 323 324 /** Matches no characters. */ 325 public static final CharMatcher NONE = 326 new FastMatcher("CharMatcher.NONE") { 327 @Override public boolean matches(char c) { 328 return false; 329 } 330 331 @Override public int indexIn(CharSequence sequence) { 332 checkNotNull(sequence); 333 return -1; 334 } 335 336 @Override public int indexIn(CharSequence sequence, int start) { 337 int length = sequence.length(); 338 Preconditions.checkPositionIndex(start, length); 339 return -1; 340 } 341 342 @Override public int lastIndexIn(CharSequence sequence) { 343 checkNotNull(sequence); 344 return -1; 345 } 346 347 @Override public boolean matchesAllOf(CharSequence sequence) { 348 return sequence.length() == 0; 349 } 350 351 @Override public boolean matchesNoneOf(CharSequence sequence) { 352 checkNotNull(sequence); 353 return true; 354 } 355 356 @Override public String removeFrom(CharSequence sequence) { 357 return sequence.toString(); 358 } 359 360 @Override public String replaceFrom(CharSequence sequence, char replacement) { 361 return sequence.toString(); 362 } 363 364 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 365 checkNotNull(replacement); 366 return sequence.toString(); 367 } 368 369 @Override public String collapseFrom(CharSequence sequence, char replacement) { 370 return sequence.toString(); 371 } 372 373 @Override public String trimFrom(CharSequence sequence) { 374 return sequence.toString(); 375 } 376 377 @Override 378 public String trimLeadingFrom(CharSequence sequence) { 379 return sequence.toString(); 380 } 381 382 @Override 383 public String trimTrailingFrom(CharSequence sequence) { 384 return sequence.toString(); 385 } 386 387 @Override public int countIn(CharSequence sequence) { 388 checkNotNull(sequence); 389 return 0; 390 } 391 392 @Override public CharMatcher and(CharMatcher other) { 393 checkNotNull(other); 394 return this; 395 } 396 397 @Override public CharMatcher or(CharMatcher other) { 398 return checkNotNull(other); 399 } 400 401 @Override public CharMatcher negate() { 402 return ANY; 403 } 404 }; 405 406 // Static factories 407 408 /** 409 * Returns a {@code char} matcher that matches only one specified character. 410 */ 411 public static CharMatcher is(final char match) { 412 String description = "CharMatcher.is('" + showCharacter(match) + "')"; 413 return new FastMatcher(description) { 414 @Override public boolean matches(char c) { 415 return c == match; 416 } 417 418 @Override public String replaceFrom(CharSequence sequence, char replacement) { 419 return sequence.toString().replace(match, replacement); 420 } 421 422 @Override public CharMatcher and(CharMatcher other) { 423 return other.matches(match) ? this : NONE; 424 } 425 426 @Override public CharMatcher or(CharMatcher other) { 427 return other.matches(match) ? other : super.or(other); 428 } 429 430 @Override public CharMatcher negate() { 431 return isNot(match); 432 } 433 434 @GwtIncompatible("java.util.BitSet") 435 @Override 436 void setBits(BitSet table) { 437 table.set(match); 438 } 439 }; 440 } 441 442 /** 443 * Returns a {@code char} matcher that matches any character except the one specified. 444 * 445 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 446 */ 447 public static CharMatcher isNot(final char match) { 448 String description = "CharMatcher.isNot('" + showCharacter(match) + "')"; 449 return new FastMatcher(description) { 450 @Override public boolean matches(char c) { 451 return c != match; 452 } 453 454 @Override public CharMatcher and(CharMatcher other) { 455 return other.matches(match) ? super.and(other) : other; 456 } 457 458 @Override public CharMatcher or(CharMatcher other) { 459 return other.matches(match) ? ANY : this; 460 } 461 462 @GwtIncompatible("java.util.BitSet") 463 @Override 464 void setBits(BitSet table) { 465 table.set(0, match); 466 table.set(match + 1, Character.MAX_VALUE + 1); 467 } 468 469 @Override public CharMatcher negate() { 470 return is(match); 471 } 472 }; 473 } 474 475 /** 476 * Returns a {@code char} matcher that matches any character present in the given character 477 * sequence. 478 */ 479 public static CharMatcher anyOf(final CharSequence sequence) { 480 switch (sequence.length()) { 481 case 0: 482 return NONE; 483 case 1: 484 return is(sequence.charAt(0)); 485 case 2: 486 return isEither(sequence.charAt(0), sequence.charAt(1)); 487 default: 488 // continue below to handle the general case 489 } 490 // TODO(user): is it potentially worth just going ahead and building a precomputed matcher? 491 final char[] chars = sequence.toString().toCharArray(); 492 Arrays.sort(chars); 493 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 494 for (char c : chars) { 495 description.append(showCharacter(c)); 496 } 497 description.append("\")"); 498 return new CharMatcher(description.toString()) { 499 @Override public boolean matches(char c) { 500 return Arrays.binarySearch(chars, c) >= 0; 501 } 502 503 @Override 504 @GwtIncompatible("java.util.BitSet") 505 void setBits(BitSet table) { 506 for (char c : chars) { 507 table.set(c); 508 } 509 } 510 }; 511 } 512 513 private static CharMatcher isEither( 514 final char match1, 515 final char match2) { 516 String description = "CharMatcher.anyOf(\"" + 517 showCharacter(match1) + showCharacter(match2) + "\")"; 518 return new FastMatcher(description) { 519 @Override public boolean matches(char c) { 520 return c == match1 || c == match2; 521 } 522 523 @GwtIncompatible("java.util.BitSet") 524 @Override void setBits(BitSet table) { 525 table.set(match1); 526 table.set(match2); 527 } 528 }; 529 } 530 531 /** 532 * Returns a {@code char} matcher that matches any character not present in the given character 533 * sequence. 534 */ 535 public static CharMatcher noneOf(CharSequence sequence) { 536 return anyOf(sequence).negate(); 537 } 538 539 /** 540 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 541 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 542 * CharMatcher.inRange('a', 'z')}. 543 * 544 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 545 */ 546 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 547 checkArgument(endInclusive >= startInclusive); 548 String description = "CharMatcher.inRange('" + 549 showCharacter(startInclusive) + "', '" + 550 showCharacter(endInclusive) + "')"; 551 return inRange(startInclusive, endInclusive, description); 552 } 553 554 static CharMatcher inRange(final char startInclusive, final char endInclusive, 555 String description) { 556 return new FastMatcher(description) { 557 @Override public boolean matches(char c) { 558 return startInclusive <= c && c <= endInclusive; 559 } 560 561 @GwtIncompatible("java.util.BitSet") 562 @Override void setBits(BitSet table) { 563 table.set(startInclusive, endInclusive + 1); 564 } 565 }; 566 } 567 568 /** 569 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 570 * which operates on primitive {@code char} instances instead. 571 */ 572 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 573 checkNotNull(predicate); 574 if (predicate instanceof CharMatcher) { 575 return (CharMatcher) predicate; 576 } 577 String description = "CharMatcher.forPredicate(" + predicate + ")"; 578 return new CharMatcher(description) { 579 @Override public boolean matches(char c) { 580 return predicate.apply(c); 581 } 582 583 @Override public boolean apply(Character character) { 584 return predicate.apply(checkNotNull(character)); 585 } 586 }; 587 } 588 589 // State 590 final String description; 591 592 // Constructors 593 594 /** 595 * Sets the {@code toString()} from the given description. 596 */ 597 CharMatcher(String description) { 598 this.description = description; 599 } 600 601 /** 602 * Constructor for use by subclasses. When subclassing, you may want to override 603 * {@code toString()} to provide a useful description. 604 */ 605 protected CharMatcher() { 606 description = super.toString(); 607 } 608 609 // Abstract methods 610 611 /** Determines a true or false value for the given character. */ 612 public abstract boolean matches(char c); 613 614 // Non-static factories 615 616 /** 617 * Returns a matcher that matches any character not matched by this matcher. 618 */ 619 public CharMatcher negate() { 620 return new NegatedMatcher(this); 621 } 622 623 private static class NegatedMatcher extends CharMatcher { 624 final CharMatcher original; 625 626 NegatedMatcher(String toString, CharMatcher original) { 627 super(toString); 628 this.original = original; 629 } 630 631 NegatedMatcher(CharMatcher original) { 632 this(original + ".negate()", original); 633 } 634 635 @Override public boolean matches(char c) { 636 return !original.matches(c); 637 } 638 639 @Override public boolean matchesAllOf(CharSequence sequence) { 640 return original.matchesNoneOf(sequence); 641 } 642 643 @Override public boolean matchesNoneOf(CharSequence sequence) { 644 return original.matchesAllOf(sequence); 645 } 646 647 @Override public int countIn(CharSequence sequence) { 648 return sequence.length() - original.countIn(sequence); 649 } 650 651 @GwtIncompatible("java.util.BitSet") 652 @Override 653 void setBits(BitSet table) { 654 BitSet tmp = new BitSet(); 655 original.setBits(tmp); 656 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 657 table.or(tmp); 658 } 659 660 @Override public CharMatcher negate() { 661 return original; 662 } 663 664 @Override 665 CharMatcher withToString(String description) { 666 return new NegatedMatcher(description, original); 667 } 668 } 669 670 /** 671 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 672 */ 673 public CharMatcher and(CharMatcher other) { 674 return new And(this, checkNotNull(other)); 675 } 676 677 private static class And extends CharMatcher { 678 final CharMatcher first; 679 final CharMatcher second; 680 681 And(CharMatcher a, CharMatcher b) { 682 this(a, b, "CharMatcher.and(" + a + ", " + b + ")"); 683 } 684 685 And(CharMatcher a, CharMatcher b, String description) { 686 super(description); 687 first = checkNotNull(a); 688 second = checkNotNull(b); 689 } 690 691 @Override 692 public boolean matches(char c) { 693 return first.matches(c) && second.matches(c); 694 } 695 696 @GwtIncompatible("java.util.BitSet") 697 @Override 698 void setBits(BitSet table) { 699 BitSet tmp1 = new BitSet(); 700 first.setBits(tmp1); 701 BitSet tmp2 = new BitSet(); 702 second.setBits(tmp2); 703 tmp1.and(tmp2); 704 table.or(tmp1); 705 } 706 707 @Override 708 CharMatcher withToString(String description) { 709 return new And(first, second, description); 710 } 711 } 712 713 /** 714 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 715 */ 716 public CharMatcher or(CharMatcher other) { 717 return new Or(this, checkNotNull(other)); 718 } 719 720 private static class Or extends CharMatcher { 721 final CharMatcher first; 722 final CharMatcher second; 723 724 Or(CharMatcher a, CharMatcher b, String description) { 725 super(description); 726 first = checkNotNull(a); 727 second = checkNotNull(b); 728 } 729 730 Or(CharMatcher a, CharMatcher b) { 731 this(a, b, "CharMatcher.or(" + a + ", " + b + ")"); 732 } 733 734 @GwtIncompatible("java.util.BitSet") 735 @Override 736 void setBits(BitSet table) { 737 first.setBits(table); 738 second.setBits(table); 739 } 740 741 @Override 742 public boolean matches(char c) { 743 return first.matches(c) || second.matches(c); 744 } 745 746 @Override 747 CharMatcher withToString(String description) { 748 return new Or(first, second, description); 749 } 750 } 751 752 /** 753 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 754 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 755 * worthwhile only if the precomputed matcher is queried many thousands of times. 756 * 757 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 758 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 759 * worthwhile tradeoff in a browser. 760 */ 761 public CharMatcher precomputed() { 762 return Platform.precomputeCharMatcher(this); 763 } 764 765 /** 766 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this}, 767 * but with their {@code toString} method overridden with the new description. 768 * 769 * <p>This is unsupported by default. 770 */ 771 CharMatcher withToString(String description) { 772 throw new UnsupportedOperationException(); 773 } 774 775 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 776 777 /** 778 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a 779 * method on {@link Platform} so that we can have different behavior in GWT. 780 * 781 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where 782 * the negation is cheaper to precompute than the matcher itself; it tries to build small 783 * hash tables for matchers that only match a few characters, and so on. In the worst-case 784 * scenario, it constructs an eight-kilobyte bit array and queries that. 785 * In many situations this produces a matcher which is faster to query than the original. 786 */ 787 @GwtIncompatible("java.util.BitSet") 788 CharMatcher precomputedInternal() { 789 final BitSet table = new BitSet(); 790 setBits(table); 791 int totalCharacters = table.cardinality(); 792 if (totalCharacters * 2 <= DISTINCT_CHARS) { 793 return precomputedPositive(totalCharacters, table, description); 794 } else { 795 // TODO(user): is it worth it to worry about the last character of large matchers? 796 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 797 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 798 String suffix = ".negate()"; 799 String negatedDescription = description.endsWith(suffix) 800 ? description.substring(0, description.length() - suffix.length()) 801 : description + suffix; 802 return new NegatedFastMatcher(toString(), 803 precomputedPositive(negatedCharacters, table, negatedDescription)); 804 } 805 } 806 807 /** 808 * A matcher for which precomputation will not yield any significant benefit. 809 */ 810 abstract static class FastMatcher extends CharMatcher { 811 FastMatcher() { 812 super(); 813 } 814 815 FastMatcher(String description) { 816 super(description); 817 } 818 819 @Override 820 public final CharMatcher precomputed() { 821 return this; 822 } 823 824 @Override 825 public CharMatcher negate() { 826 return new NegatedFastMatcher(this); 827 } 828 } 829 830 static final class NegatedFastMatcher extends NegatedMatcher { 831 NegatedFastMatcher(CharMatcher original) { 832 super(original); 833 } 834 835 NegatedFastMatcher(String toString, CharMatcher original) { 836 super(toString, original); 837 } 838 839 @Override 840 public final CharMatcher precomputed() { 841 return this; 842 } 843 844 @Override 845 CharMatcher withToString(String description) { 846 return new NegatedFastMatcher(description, original); 847 } 848 } 849 850 /** 851 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 852 */ 853 @GwtIncompatible("java.util.BitSet") 854 private static CharMatcher precomputedPositive( 855 int totalCharacters, 856 BitSet table, 857 String description) { 858 switch (totalCharacters) { 859 case 0: 860 return NONE; 861 case 1: 862 return is((char) table.nextSetBit(0)); 863 case 2: 864 char c1 = (char) table.nextSetBit(0); 865 char c2 = (char) table.nextSetBit(c1 + 1); 866 return isEither(c1, c2); 867 default: 868 return isSmall(totalCharacters, table.length()) 869 ? SmallCharMatcher.from(table, description) 870 : new BitSetMatcher(table, description); 871 } 872 } 873 874 private static boolean isSmall(int totalCharacters, int tableLength) { 875 return totalCharacters <= SmallCharMatcher.MAX_SIZE 876 && tableLength > (totalCharacters * 4 * Character.SIZE); 877 // err on the side of BitSetMatcher 878 } 879 880 @GwtIncompatible("java.util.BitSet") 881 private static class BitSetMatcher extends FastMatcher { 882 private final BitSet table; 883 884 private BitSetMatcher(BitSet table, String description) { 885 super(description); 886 if (table.length() + Long.SIZE < table.size()) { 887 table = (BitSet) table.clone(); 888 // If only we could actually call BitSet.trimToSize() ourselves... 889 } 890 this.table = table; 891 } 892 893 @Override public boolean matches(char c) { 894 return table.get(c); 895 } 896 897 @Override 898 void setBits(BitSet bitSet) { 899 bitSet.or(table); 900 } 901 } 902 903 /** 904 * Sets bits in {@code table} matched by this matcher. 905 */ 906 @GwtIncompatible("java.util.BitSet") 907 void setBits(BitSet table) { 908 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 909 if (matches((char) c)) { 910 table.set(c); 911 } 912 } 913 } 914 915 // Text processing routines 916 917 /** 918 * Returns {@code true} if a character sequence contains at least one matching character. 919 * Equivalent to {@code !matchesNoneOf(sequence)}. 920 * 921 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 922 * character, until this returns {@code true} or the end is reached. 923 * 924 * @param sequence the character sequence to examine, possibly empty 925 * @return {@code true} if this matcher matches at least one character in the sequence 926 * @since 8.0 927 */ 928 public boolean matchesAnyOf(CharSequence sequence) { 929 return !matchesNoneOf(sequence); 930 } 931 932 /** 933 * Returns {@code true} if a character sequence contains only matching characters. 934 * 935 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 936 * character, until this returns {@code false} or the end is reached. 937 * 938 * @param sequence the character sequence to examine, possibly empty 939 * @return {@code true} if this matcher matches every character in the sequence, including when 940 * the sequence is empty 941 */ 942 public boolean matchesAllOf(CharSequence sequence) { 943 for (int i = sequence.length() - 1; i >= 0; i--) { 944 if (!matches(sequence.charAt(i))) { 945 return false; 946 } 947 } 948 return true; 949 } 950 951 /** 952 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 953 * {@code !matchesAnyOf(sequence)}. 954 * 955 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 956 * character, until this returns {@code false} or the end is reached. 957 * 958 * @param sequence the character sequence to examine, possibly empty 959 * @return {@code true} if this matcher matches every character in the sequence, including when 960 * the sequence is empty 961 */ 962 public boolean matchesNoneOf(CharSequence sequence) { 963 return indexIn(sequence) == -1; 964 } 965 966 /** 967 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 968 * matching character is present. 969 * 970 * <p>The default implementation iterates over the sequence in forward order calling {@link 971 * #matches} for each character. 972 * 973 * @param sequence the character sequence to examine from the beginning 974 * @return an index, or {@code -1} if no character matches 975 */ 976 public int indexIn(CharSequence sequence) { 977 int length = sequence.length(); 978 for (int i = 0; i < length; i++) { 979 if (matches(sequence.charAt(i))) { 980 return i; 981 } 982 } 983 return -1; 984 } 985 986 /** 987 * Returns the index of the first matching character in a character sequence, starting from a 988 * given position, or {@code -1} if no character matches after that position. 989 * 990 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 991 * start}, calling {@link #matches} for each character. 992 * 993 * @param sequence the character sequence to examine 994 * @param start the first index to examine; must be nonnegative and no greater than {@code 995 * sequence.length()} 996 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 997 * or {@code -1} if no character matches 998 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 999 * sequence.length()} 1000 */ 1001 public int indexIn(CharSequence sequence, int start) { 1002 int length = sequence.length(); 1003 Preconditions.checkPositionIndex(start, length); 1004 for (int i = start; i < length; i++) { 1005 if (matches(sequence.charAt(i))) { 1006 return i; 1007 } 1008 } 1009 return -1; 1010 } 1011 1012 /** 1013 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 1014 * matching character is present. 1015 * 1016 * <p>The default implementation iterates over the sequence in reverse order calling {@link 1017 * #matches} for each character. 1018 * 1019 * @param sequence the character sequence to examine from the end 1020 * @return an index, or {@code -1} if no character matches 1021 */ 1022 public int lastIndexIn(CharSequence sequence) { 1023 for (int i = sequence.length() - 1; i >= 0; i--) { 1024 if (matches(sequence.charAt(i))) { 1025 return i; 1026 } 1027 } 1028 return -1; 1029 } 1030 1031 /** 1032 * Returns the number of matching characters found in a character sequence. 1033 */ 1034 public int countIn(CharSequence sequence) { 1035 int count = 0; 1036 for (int i = 0; i < sequence.length(); i++) { 1037 if (matches(sequence.charAt(i))) { 1038 count++; 1039 } 1040 } 1041 return count; 1042 } 1043 1044 /** 1045 * Returns a string containing all non-matching characters of a character sequence, in order. For 1046 * example: <pre> {@code 1047 * 1048 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 1049 * 1050 * ... returns {@code "bzr"}. 1051 */ 1052 @CheckReturnValue 1053 public String removeFrom(CharSequence sequence) { 1054 String string = sequence.toString(); 1055 int pos = indexIn(string); 1056 if (pos == -1) { 1057 return string; 1058 } 1059 1060 char[] chars = string.toCharArray(); 1061 int spread = 1; 1062 1063 // This unusual loop comes from extensive benchmarking 1064 OUT: while (true) { 1065 pos++; 1066 while (true) { 1067 if (pos == chars.length) { 1068 break OUT; 1069 } 1070 if (matches(chars[pos])) { 1071 break; 1072 } 1073 chars[pos - spread] = chars[pos]; 1074 pos++; 1075 } 1076 spread++; 1077 } 1078 return new String(chars, 0, pos - spread); 1079 } 1080 1081 /** 1082 * Returns a string containing all matching characters of a character sequence, in order. For 1083 * example: <pre> {@code 1084 * 1085 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 1086 * 1087 * ... returns {@code "aaa"}. 1088 */ 1089 @CheckReturnValue 1090 public String retainFrom(CharSequence sequence) { 1091 return negate().removeFrom(sequence); 1092 } 1093 1094 /** 1095 * Returns a string copy of the input character sequence, with each character that matches this 1096 * matcher replaced by a given replacement character. For example: <pre> {@code 1097 * 1098 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 1099 * 1100 * ... returns {@code "rodor"}. 1101 * 1102 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1103 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1104 * character. 1105 * 1106 * @param sequence the character sequence to replace matching characters in 1107 * @param replacement the character to append to the result string in place of each matching 1108 * character in {@code sequence} 1109 * @return the new string 1110 */ 1111 @CheckReturnValue 1112 public String replaceFrom(CharSequence sequence, char replacement) { 1113 String string = sequence.toString(); 1114 int pos = indexIn(string); 1115 if (pos == -1) { 1116 return string; 1117 } 1118 char[] chars = string.toCharArray(); 1119 chars[pos] = replacement; 1120 for (int i = pos + 1; i < chars.length; i++) { 1121 if (matches(chars[i])) { 1122 chars[i] = replacement; 1123 } 1124 } 1125 return new String(chars); 1126 } 1127 1128 /** 1129 * Returns a string copy of the input character sequence, with each character that matches this 1130 * matcher replaced by a given replacement sequence. For example: <pre> {@code 1131 * 1132 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 1133 * 1134 * ... returns {@code "yoohoo"}. 1135 * 1136 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 1137 * off calling {@link #replaceFrom(CharSequence, char)} directly. 1138 * 1139 * @param sequence the character sequence to replace matching characters in 1140 * @param replacement the characters to append to the result string in place of each matching 1141 * character in {@code sequence} 1142 * @return the new string 1143 */ 1144 @CheckReturnValue 1145 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1146 int replacementLen = replacement.length(); 1147 if (replacementLen == 0) { 1148 return removeFrom(sequence); 1149 } 1150 if (replacementLen == 1) { 1151 return replaceFrom(sequence, replacement.charAt(0)); 1152 } 1153 1154 String string = sequence.toString(); 1155 int pos = indexIn(string); 1156 if (pos == -1) { 1157 return string; 1158 } 1159 1160 int len = string.length(); 1161 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 1162 1163 int oldpos = 0; 1164 do { 1165 buf.append(string, oldpos, pos); 1166 buf.append(replacement); 1167 oldpos = pos + 1; 1168 pos = indexIn(string, oldpos); 1169 } while (pos != -1); 1170 1171 buf.append(string, oldpos, len); 1172 return buf.toString(); 1173 } 1174 1175 /** 1176 * Returns a substring of the input character sequence that omits all characters this matcher 1177 * matches from the beginning and from the end of the string. For example: <pre> {@code 1178 * 1179 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 1180 * 1181 * ... returns {@code "cat"}. 1182 * 1183 * <p>Note that: <pre> {@code 1184 * 1185 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1186 * 1187 * ... is equivalent to {@link String#trim()}. 1188 */ 1189 @CheckReturnValue 1190 public String trimFrom(CharSequence sequence) { 1191 int len = sequence.length(); 1192 int first; 1193 int last; 1194 1195 for (first = 0; first < len; first++) { 1196 if (!matches(sequence.charAt(first))) { 1197 break; 1198 } 1199 } 1200 for (last = len - 1; last > first; last--) { 1201 if (!matches(sequence.charAt(last))) { 1202 break; 1203 } 1204 } 1205 1206 return sequence.subSequence(first, last + 1).toString(); 1207 } 1208 1209 /** 1210 * Returns a substring of the input character sequence that omits all characters this matcher 1211 * matches from the beginning of the string. For example: <pre> {@code 1212 * 1213 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1214 * 1215 * ... returns {@code "catbab"}. 1216 */ 1217 @CheckReturnValue 1218 public String trimLeadingFrom(CharSequence sequence) { 1219 int len = sequence.length(); 1220 for (int first = 0; first < len; first++) { 1221 if (!matches(sequence.charAt(first))) { 1222 return sequence.subSequence(first, len).toString(); 1223 } 1224 } 1225 return ""; 1226 } 1227 1228 /** 1229 * Returns a substring of the input character sequence that omits all characters this matcher 1230 * matches from the end of the string. For example: <pre> {@code 1231 * 1232 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1233 * 1234 * ... returns {@code "abacat"}. 1235 */ 1236 @CheckReturnValue 1237 public String trimTrailingFrom(CharSequence sequence) { 1238 int len = sequence.length(); 1239 for (int last = len - 1; last >= 0; last--) { 1240 if (!matches(sequence.charAt(last))) { 1241 return sequence.subSequence(0, last + 1).toString(); 1242 } 1243 } 1244 return ""; 1245 } 1246 1247 /** 1248 * Returns a string copy of the input character sequence, with each group of consecutive 1249 * characters that match this matcher replaced by a single replacement character. For example: 1250 * <pre> {@code 1251 * 1252 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1253 * 1254 * ... returns {@code "b-p-r"}. 1255 * 1256 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1257 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1258 * character. 1259 * 1260 * @param sequence the character sequence to replace matching groups of characters in 1261 * @param replacement the character to append to the result string in place of each group of 1262 * matching characters in {@code sequence} 1263 * @return the new string 1264 */ 1265 @CheckReturnValue 1266 public String collapseFrom(CharSequence sequence, char replacement) { 1267 // This implementation avoids unnecessary allocation. 1268 int len = sequence.length(); 1269 for (int i = 0; i < len; i++) { 1270 char c = sequence.charAt(i); 1271 if (matches(c)) { 1272 if (c == replacement 1273 && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1274 // a no-op replacement 1275 i++; 1276 } else { 1277 StringBuilder builder = new StringBuilder(len) 1278 .append(sequence.subSequence(0, i)) 1279 .append(replacement); 1280 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1281 } 1282 } 1283 } 1284 // no replacement needed 1285 return sequence.toString(); 1286 } 1287 1288 /** 1289 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1290 * groups of matching characters at the start or end of the sequence are removed without 1291 * replacement. 1292 */ 1293 @CheckReturnValue 1294 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1295 // This implementation avoids unnecessary allocation. 1296 int len = sequence.length(); 1297 int first; 1298 int last; 1299 1300 for (first = 0; first < len && matches(sequence.charAt(first)); first++) {} 1301 for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {} 1302 1303 return (first == 0 && last == len - 1) 1304 ? collapseFrom(sequence, replacement) 1305 : finishCollapseFrom( 1306 sequence, first, last + 1, replacement, 1307 new StringBuilder(last + 1 - first), 1308 false); 1309 } 1310 1311 private String finishCollapseFrom( 1312 CharSequence sequence, int start, int end, char replacement, 1313 StringBuilder builder, boolean inMatchingGroup) { 1314 for (int i = start; i < end; i++) { 1315 char c = sequence.charAt(i); 1316 if (matches(c)) { 1317 if (!inMatchingGroup) { 1318 builder.append(replacement); 1319 inMatchingGroup = true; 1320 } 1321 } else { 1322 builder.append(c); 1323 inMatchingGroup = false; 1324 } 1325 } 1326 return builder.toString(); 1327 } 1328 1329 // Predicate interface 1330 1331 /** 1332 * Equivalent to {@link #matches}; provided only to satisfy the {@link Predicate} interface. When 1333 * using a reference of type {@code CharMatcher}, invoke {@link #matches} directly instead. 1334 */ 1335 @Override public boolean apply(Character character) { 1336 return matches(character); 1337 } 1338 1339 /** 1340 * Returns a string representation of this {@code CharMatcher}, such as 1341 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1342 */ 1343 @Override 1344 public String toString() { 1345 return description; 1346 } 1347 1348 /** 1349 * A special-case CharMatcher for Unicode whitespace characters that is extremely 1350 * efficient both in space required and in time to check for matches. 1351 * 1352 * Implementation details. 1353 * It turns out that all current (early 2012) Unicode characters are unique modulo 79: 1354 * so we can construct a lookup table of exactly 79 entries, and just check the character code 1355 * mod 79, and see if that character is in the table. 1356 * 1357 * There is a 1 at the beginning of the table so that the null character is not listed 1358 * as whitespace. 1359 * 1360 * Other things we tried that did not prove to be beneficial, mostly due to speed concerns: 1361 * 1362 * * Binary search into the sorted list of characters, i.e., what 1363 * CharMatcher.anyOf() does</li> 1364 * * Perfect hash function into a table of size 26 (using an offset table and a special 1365 * Jenkins hash function)</li> 1366 * * Perfect-ish hash function that required two lookups into a single table of size 26.</li> 1367 * * Using a power-of-2 sized hash table (size 64) with linear probing.</li> 1368 * 1369 * --Christopher Swenson, February 2012. 1370 */ 1371 private static final String WHITESPACE_TABLE = "\u0001\u0000\u00a0\u0000\u0000\u0000\u0000\u0000" 1372 + "\u0000\u0009\n\u000b\u000c\r\u0000\u0000\u2028\u2029\u0000\u0000\u0000\u0000\u0000\u202f" 1373 + "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0020\u0000\u0000\u0000\u0000\u0000" 1374 + "\u0000\u0000\u0000\u0000\u0000\u3000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000" 1375 + "\u0000\u0000\u0085\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a" 1376 + "\u0000\u0000\u0000\u0000\u0000\u205f\u1680\u0000\u0000\u180e\u0000\u0000\u0000"; 1377 1378 /** 1379 * Determines whether a character is whitespace according to the latest Unicode standard, as 1380 * illustrated 1381 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 1382 * This is not the same definition used by other Java APIs. (See a 1383 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 1384 * definitions of "whitespace"</a>.) 1385 * 1386 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 1387 * to date. 1388 */ 1389 public static final CharMatcher WHITESPACE = new FastMatcher("CharMatcher.WHITESPACE") { 1390 1391 @Override public boolean matches(char c) { 1392 return WHITESPACE_TABLE.charAt(c % 79) == c; 1393 } 1394 }; 1395}