001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.base; 018 019import static com.google.common.base.Preconditions.checkArgument; 020import static com.google.common.base.Preconditions.checkNotNull; 021 022import com.google.common.annotations.Beta; 023import com.google.common.annotations.GwtCompatible; 024import com.google.common.annotations.GwtIncompatible; 025 026import java.util.Arrays; 027import java.util.BitSet; 028 029import javax.annotation.CheckReturnValue; 030 031/** 032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 033 * for any {@link Object}. Also offers basic text processing methods based on this function. 034 * Implementations are strongly encouraged to be side-effect-free and immutable. 035 * 036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}". 038 * 039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand 040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical 041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher} 042 * treats these just as two separate characters. 043 * 044 * <p>Example usages: <pre> 045 * String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput); 046 * if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 047 * 048 * <p>See the Guava User Guide article on <a href= 049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher"> 050 * {@code CharMatcher}</a>. 051 * 052 * @author Kevin Bourrillion 053 * @since 1.0 054 */ 055@Beta // Possibly change from chars to code points; decide constants vs. methods 056@GwtCompatible(emulated = true) 057public abstract class CharMatcher implements Predicate<Character> { 058 // Constants 059 /** 060 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 061 * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a 062 * discussion of that term. 063 * 064 * @since 2.0 065 */ 066 public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() { 067 @Override 068 public boolean matches(char c) { 069 switch (c) { 070 case '\t': 071 case '\n': 072 case '\013': 073 case '\f': 074 case '\r': 075 case ' ': 076 case '\u0085': 077 case '\u1680': 078 case '\u2028': 079 case '\u2029': 080 case '\u205f': 081 case '\u3000': 082 return true; 083 case '\u2007': 084 return false; 085 default: 086 return c >= '\u2000' && c <= '\u200a'; 087 } 088 } 089 090 @Override 091 public String toString() { 092 return "CharMatcher.BREAKING_WHITESPACE"; 093 } 094 }; 095 096 /** 097 * Determines whether a character is ASCII, meaning that its code point is less than 128. 098 */ 099 public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII"); 100 101 private static class RangesMatcher extends CharMatcher { 102 private final char[] rangeStarts; 103 private final char[] rangeEnds; 104 105 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 106 super(description); 107 this.rangeStarts = rangeStarts; 108 this.rangeEnds = rangeEnds; 109 checkArgument(rangeStarts.length == rangeEnds.length); 110 for (int i = 0; i < rangeStarts.length; i++) { 111 checkArgument(rangeStarts[i] <= rangeEnds[i]); 112 if (i + 1 < rangeStarts.length) { 113 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 114 } 115 } 116 } 117 118 @Override 119 public boolean matches(char c) { 120 int index = Arrays.binarySearch(rangeStarts, c); 121 if (index >= 0) { 122 return true; 123 } else { 124 index = ~index - 1; 125 return index >= 0 && c <= rangeEnds[index]; 126 } 127 } 128 } 129 130 // Must be in ascending order. 131 private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6" 132 + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0" 133 + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 134 135 private static final String NINES; 136 static { 137 StringBuilder builder = new StringBuilder(ZEROES.length()); 138 for (int i = 0; i < ZEROES.length(); i++) { 139 builder.append((char) (ZEROES.charAt(i) + 9)); 140 } 141 NINES = builder.toString(); 142 } 143 144 /** 145 * Determines whether a character is a digit according to 146 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 147 * If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 148 */ 149 public static final CharMatcher DIGIT = new RangesMatcher( 150 "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray()); 151 152 /** 153 * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's 154 * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 155 */ 156 public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") { 157 @Override public boolean matches(char c) { 158 return Character.isDigit(c); 159 } 160 }; 161 162 /** 163 * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's 164 * definition}. If you only care to match letters of the Latin alphabet, you can use {@code 165 * inRange('a', 'z').or(inRange('A', 'Z'))}. 166 */ 167 public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") { 168 @Override public boolean matches(char c) { 169 return Character.isLetter(c); 170 } 171 }; 172 173 /** 174 * Determines whether a character is a letter or digit according to {@link 175 * Character#isLetterOrDigit(char) Java's definition}. 176 */ 177 public static final CharMatcher JAVA_LETTER_OR_DIGIT = 178 new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") { 179 @Override public boolean matches(char c) { 180 return Character.isLetterOrDigit(c); 181 } 182 }; 183 184 /** 185 * Determines whether a character is upper case according to {@link Character#isUpperCase(char) 186 * Java's definition}. 187 */ 188 public static final CharMatcher JAVA_UPPER_CASE = 189 new CharMatcher("CharMatcher.JAVA_UPPER_CASE") { 190 @Override public boolean matches(char c) { 191 return Character.isUpperCase(c); 192 } 193 }; 194 195 /** 196 * Determines whether a character is lower case according to {@link Character#isLowerCase(char) 197 * Java's definition}. 198 */ 199 public static final CharMatcher JAVA_LOWER_CASE = 200 new CharMatcher("CharMatcher.JAVA_LOWER_CASE") { 201 @Override public boolean matches(char c) { 202 return Character.isLowerCase(c); 203 } 204 }; 205 206 /** 207 * Determines whether a character is an ISO control character as specified by {@link 208 * Character#isISOControl(char)}. 209 */ 210 public static final CharMatcher JAVA_ISO_CONTROL = 211 inRange('\u0000', '\u001f') 212 .or(inRange('\u007f', '\u009f')) 213 .withToString("CharMatcher.JAVA_ISO_CONTROL"); 214 215 /** 216 * Determines whether a character is invisible; that is, if its Unicode category is any of 217 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 218 * PRIVATE_USE according to ICU4J. 219 */ 220 public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", ( 221 "\u0000\u007f\u00ad\u0600\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u206a\u3000\ud800\ufeff" 222 + "\ufff9\ufffa").toCharArray(), ( 223 "\u0020\u00a0\u00ad\u0604\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u206f\u3000\uf8ff\ufeff" 224 + "\ufff9\ufffb").toCharArray()); 225 226 private static String showCharacter(char c) { 227 String hex = "0123456789ABCDEF"; 228 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 229 for (int i = 0; i < 4; i++) { 230 tmp[5 - i] = hex.charAt(c & 0xF); 231 c >>= 4; 232 } 233 return String.copyValueOf(tmp); 234 235 } 236 237 /** 238 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 239 * errs on the side of returning {@code false} (that is, it tends to assume a character is 240 * double-width). 241 * 242 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to 243 * date. 244 */ 245 public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH", 246 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 247 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 248 249 /** Matches any character. */ 250 public static final CharMatcher ANY = 251 new FastMatcher("CharMatcher.ANY") { 252 @Override public boolean matches(char c) { 253 return true; 254 } 255 256 @Override public int indexIn(CharSequence sequence) { 257 return (sequence.length() == 0) ? -1 : 0; 258 } 259 260 @Override public int indexIn(CharSequence sequence, int start) { 261 int length = sequence.length(); 262 Preconditions.checkPositionIndex(start, length); 263 return (start == length) ? -1 : start; 264 } 265 266 @Override public int lastIndexIn(CharSequence sequence) { 267 return sequence.length() - 1; 268 } 269 270 @Override public boolean matchesAllOf(CharSequence sequence) { 271 checkNotNull(sequence); 272 return true; 273 } 274 275 @Override public boolean matchesNoneOf(CharSequence sequence) { 276 return sequence.length() == 0; 277 } 278 279 @Override public String removeFrom(CharSequence sequence) { 280 checkNotNull(sequence); 281 return ""; 282 } 283 284 @Override public String replaceFrom(CharSequence sequence, char replacement) { 285 char[] array = new char[sequence.length()]; 286 Arrays.fill(array, replacement); 287 return new String(array); 288 } 289 290 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 291 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 292 for (int i = 0; i < sequence.length(); i++) { 293 retval.append(replacement); 294 } 295 return retval.toString(); 296 } 297 298 @Override public String collapseFrom(CharSequence sequence, char replacement) { 299 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 300 } 301 302 @Override public String trimFrom(CharSequence sequence) { 303 checkNotNull(sequence); 304 return ""; 305 } 306 307 @Override public int countIn(CharSequence sequence) { 308 return sequence.length(); 309 } 310 311 @Override public CharMatcher and(CharMatcher other) { 312 return checkNotNull(other); 313 } 314 315 @Override public CharMatcher or(CharMatcher other) { 316 checkNotNull(other); 317 return this; 318 } 319 320 @Override public CharMatcher negate() { 321 return NONE; 322 } 323 }; 324 325 /** Matches no characters. */ 326 public static final CharMatcher NONE = 327 new FastMatcher("CharMatcher.NONE") { 328 @Override public boolean matches(char c) { 329 return false; 330 } 331 332 @Override public int indexIn(CharSequence sequence) { 333 checkNotNull(sequence); 334 return -1; 335 } 336 337 @Override public int indexIn(CharSequence sequence, int start) { 338 int length = sequence.length(); 339 Preconditions.checkPositionIndex(start, length); 340 return -1; 341 } 342 343 @Override public int lastIndexIn(CharSequence sequence) { 344 checkNotNull(sequence); 345 return -1; 346 } 347 348 @Override public boolean matchesAllOf(CharSequence sequence) { 349 return sequence.length() == 0; 350 } 351 352 @Override public boolean matchesNoneOf(CharSequence sequence) { 353 checkNotNull(sequence); 354 return true; 355 } 356 357 @Override public String removeFrom(CharSequence sequence) { 358 return sequence.toString(); 359 } 360 361 @Override public String replaceFrom(CharSequence sequence, char replacement) { 362 return sequence.toString(); 363 } 364 365 @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) { 366 checkNotNull(replacement); 367 return sequence.toString(); 368 } 369 370 @Override public String collapseFrom(CharSequence sequence, char replacement) { 371 return sequence.toString(); 372 } 373 374 @Override public String trimFrom(CharSequence sequence) { 375 return sequence.toString(); 376 } 377 378 @Override 379 public String trimLeadingFrom(CharSequence sequence) { 380 return sequence.toString(); 381 } 382 383 @Override 384 public String trimTrailingFrom(CharSequence sequence) { 385 return sequence.toString(); 386 } 387 388 @Override public int countIn(CharSequence sequence) { 389 checkNotNull(sequence); 390 return 0; 391 } 392 393 @Override public CharMatcher and(CharMatcher other) { 394 checkNotNull(other); 395 return this; 396 } 397 398 @Override public CharMatcher or(CharMatcher other) { 399 return checkNotNull(other); 400 } 401 402 @Override public CharMatcher negate() { 403 return ANY; 404 } 405 }; 406 407 // Static factories 408 409 /** 410 * Returns a {@code char} matcher that matches only one specified character. 411 */ 412 public static CharMatcher is(final char match) { 413 String description = "CharMatcher.is('" + showCharacter(match) + "')"; 414 return new FastMatcher(description) { 415 @Override public boolean matches(char c) { 416 return c == match; 417 } 418 419 @Override public String replaceFrom(CharSequence sequence, char replacement) { 420 return sequence.toString().replace(match, replacement); 421 } 422 423 @Override public CharMatcher and(CharMatcher other) { 424 return other.matches(match) ? this : NONE; 425 } 426 427 @Override public CharMatcher or(CharMatcher other) { 428 return other.matches(match) ? other : super.or(other); 429 } 430 431 @Override public CharMatcher negate() { 432 return isNot(match); 433 } 434 435 @GwtIncompatible("java.util.BitSet") 436 @Override 437 void setBits(BitSet table) { 438 table.set(match); 439 } 440 }; 441 } 442 443 /** 444 * Returns a {@code char} matcher that matches any character except the one specified. 445 * 446 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 447 */ 448 public static CharMatcher isNot(final char match) { 449 String description = "CharMatcher.isNot('" + showCharacter(match) + "')"; 450 return new FastMatcher(description) { 451 @Override public boolean matches(char c) { 452 return c != match; 453 } 454 455 @Override public CharMatcher and(CharMatcher other) { 456 return other.matches(match) ? super.and(other) : other; 457 } 458 459 @Override public CharMatcher or(CharMatcher other) { 460 return other.matches(match) ? ANY : this; 461 } 462 463 @GwtIncompatible("java.util.BitSet") 464 @Override 465 void setBits(BitSet table) { 466 table.set(0, match); 467 table.set(match + 1, Character.MAX_VALUE + 1); 468 } 469 470 @Override public CharMatcher negate() { 471 return is(match); 472 } 473 }; 474 } 475 476 /** 477 * Returns a {@code char} matcher that matches any character present in the given character 478 * sequence. 479 */ 480 public static CharMatcher anyOf(final CharSequence sequence) { 481 switch (sequence.length()) { 482 case 0: 483 return NONE; 484 case 1: 485 return is(sequence.charAt(0)); 486 case 2: 487 return isEither(sequence.charAt(0), sequence.charAt(1)); 488 default: 489 // continue below to handle the general case 490 } 491 // TODO(user): is it potentially worth just going ahead and building a precomputed matcher? 492 final char[] chars = sequence.toString().toCharArray(); 493 Arrays.sort(chars); 494 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 495 for (char c : chars) { 496 description.append(showCharacter(c)); 497 } 498 description.append("\")"); 499 return new CharMatcher(description.toString()) { 500 @Override public boolean matches(char c) { 501 return Arrays.binarySearch(chars, c) >= 0; 502 } 503 504 @Override 505 @GwtIncompatible("java.util.BitSet") 506 void setBits(BitSet table) { 507 for (char c : chars) { 508 table.set(c); 509 } 510 } 511 }; 512 } 513 514 private static CharMatcher isEither( 515 final char match1, 516 final char match2) { 517 String description = "CharMatcher.anyOf(\"" + 518 showCharacter(match1) + showCharacter(match2) + "\")"; 519 return new FastMatcher(description) { 520 @Override public boolean matches(char c) { 521 return c == match1 || c == match2; 522 } 523 524 @GwtIncompatible("java.util.BitSet") 525 @Override void setBits(BitSet table) { 526 table.set(match1); 527 table.set(match2); 528 } 529 }; 530 } 531 532 /** 533 * Returns a {@code char} matcher that matches any character not present in the given character 534 * sequence. 535 */ 536 public static CharMatcher noneOf(CharSequence sequence) { 537 return anyOf(sequence).negate(); 538 } 539 540 /** 541 * Returns a {@code char} matcher that matches any character in a given range (both endpoints are 542 * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 543 * CharMatcher.inRange('a', 'z')}. 544 * 545 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 546 */ 547 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 548 checkArgument(endInclusive >= startInclusive); 549 String description = "CharMatcher.inRange('" + 550 showCharacter(startInclusive) + "', '" + 551 showCharacter(endInclusive) + "')"; 552 return inRange(startInclusive, endInclusive, description); 553 } 554 555 static CharMatcher inRange(final char startInclusive, final char endInclusive, 556 String description) { 557 return new FastMatcher(description) { 558 @Override public boolean matches(char c) { 559 return startInclusive <= c && c <= endInclusive; 560 } 561 562 @GwtIncompatible("java.util.BitSet") 563 @Override void setBits(BitSet table) { 564 table.set(startInclusive, endInclusive + 1); 565 } 566 }; 567 } 568 569 /** 570 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 571 * which operates on primitive {@code char} instances instead. 572 */ 573 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 574 checkNotNull(predicate); 575 if (predicate instanceof CharMatcher) { 576 return (CharMatcher) predicate; 577 } 578 String description = "CharMatcher.forPredicate(" + predicate + ")"; 579 return new CharMatcher(description) { 580 @Override public boolean matches(char c) { 581 return predicate.apply(c); 582 } 583 584 @Override public boolean apply(Character character) { 585 return predicate.apply(checkNotNull(character)); 586 } 587 }; 588 } 589 590 // State 591 final String description; 592 593 // Constructors 594 595 /** 596 * Sets the {@code toString()} from the given description. 597 */ 598 CharMatcher(String description) { 599 this.description = description; 600 } 601 602 /** 603 * Constructor for use by subclasses. When subclassing, you may want to override 604 * {@code toString()} to provide a useful description. 605 */ 606 protected CharMatcher() { 607 description = super.toString(); 608 } 609 610 // Abstract methods 611 612 /** Determines a true or false value for the given character. */ 613 public abstract boolean matches(char c); 614 615 // Non-static factories 616 617 /** 618 * Returns a matcher that matches any character not matched by this matcher. 619 */ 620 public CharMatcher negate() { 621 return new NegatedMatcher(this); 622 } 623 624 private static class NegatedMatcher extends CharMatcher { 625 final CharMatcher original; 626 627 NegatedMatcher(String toString, CharMatcher original) { 628 super(toString); 629 this.original = original; 630 } 631 632 NegatedMatcher(CharMatcher original) { 633 this(original + ".negate()", original); 634 } 635 636 @Override public boolean matches(char c) { 637 return !original.matches(c); 638 } 639 640 @Override public boolean matchesAllOf(CharSequence sequence) { 641 return original.matchesNoneOf(sequence); 642 } 643 644 @Override public boolean matchesNoneOf(CharSequence sequence) { 645 return original.matchesAllOf(sequence); 646 } 647 648 @Override public int countIn(CharSequence sequence) { 649 return sequence.length() - original.countIn(sequence); 650 } 651 652 @GwtIncompatible("java.util.BitSet") 653 @Override 654 void setBits(BitSet table) { 655 BitSet tmp = new BitSet(); 656 original.setBits(tmp); 657 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 658 table.or(tmp); 659 } 660 661 @Override public CharMatcher negate() { 662 return original; 663 } 664 665 @Override 666 CharMatcher withToString(String description) { 667 return new NegatedMatcher(description, original); 668 } 669 } 670 671 /** 672 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 673 */ 674 public CharMatcher and(CharMatcher other) { 675 return new And(this, checkNotNull(other)); 676 } 677 678 private static class And extends CharMatcher { 679 final CharMatcher first; 680 final CharMatcher second; 681 682 And(CharMatcher a, CharMatcher b) { 683 this(a, b, "CharMatcher.and(" + a + ", " + b + ")"); 684 } 685 686 And(CharMatcher a, CharMatcher b, String description) { 687 super(description); 688 first = checkNotNull(a); 689 second = checkNotNull(b); 690 } 691 692 @Override 693 public boolean matches(char c) { 694 return first.matches(c) && second.matches(c); 695 } 696 697 @GwtIncompatible("java.util.BitSet") 698 @Override 699 void setBits(BitSet table) { 700 BitSet tmp1 = new BitSet(); 701 first.setBits(tmp1); 702 BitSet tmp2 = new BitSet(); 703 second.setBits(tmp2); 704 tmp1.and(tmp2); 705 table.or(tmp1); 706 } 707 708 @Override 709 CharMatcher withToString(String description) { 710 return new And(first, second, description); 711 } 712 } 713 714 /** 715 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 716 */ 717 public CharMatcher or(CharMatcher other) { 718 return new Or(this, checkNotNull(other)); 719 } 720 721 private static class Or extends CharMatcher { 722 final CharMatcher first; 723 final CharMatcher second; 724 725 Or(CharMatcher a, CharMatcher b, String description) { 726 super(description); 727 first = checkNotNull(a); 728 second = checkNotNull(b); 729 } 730 731 Or(CharMatcher a, CharMatcher b) { 732 this(a, b, "CharMatcher.or(" + a + ", " + b + ")"); 733 } 734 735 @GwtIncompatible("java.util.BitSet") 736 @Override 737 void setBits(BitSet table) { 738 first.setBits(table); 739 second.setBits(table); 740 } 741 742 @Override 743 public boolean matches(char c) { 744 return first.matches(c) || second.matches(c); 745 } 746 747 @Override 748 CharMatcher withToString(String description) { 749 return new Or(first, second, description); 750 } 751 } 752 753 /** 754 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 755 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 756 * worthwhile only if the precomputed matcher is queried many thousands of times. 757 * 758 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 759 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 760 * worthwhile tradeoff in a browser. 761 */ 762 public CharMatcher precomputed() { 763 return Platform.precomputeCharMatcher(this); 764 } 765 766 /** 767 * Subclasses should provide a new CharMatcher with the same characteristics as {@code this}, 768 * but with their {@code toString} method overridden with the new description. 769 * 770 * <p>This is unsupported by default. 771 */ 772 CharMatcher withToString(String description) { 773 throw new UnsupportedOperationException(); 774 } 775 776 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 777 778 /** 779 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a 780 * method on {@link Platform} so that we can have different behavior in GWT. 781 * 782 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where 783 * the negation is cheaper to precompute than the matcher itself; it tries to build small 784 * hash tables for matchers that only match a few characters, and so on. In the worst-case 785 * scenario, it constructs an eight-kilobyte bit array and queries that. 786 * In many situations this produces a matcher which is faster to query than the original. 787 */ 788 @GwtIncompatible("java.util.BitSet") 789 CharMatcher precomputedInternal() { 790 final BitSet table = new BitSet(); 791 setBits(table); 792 int totalCharacters = table.cardinality(); 793 if (totalCharacters * 2 <= DISTINCT_CHARS) { 794 return precomputedPositive(totalCharacters, table, description); 795 } else { 796 // TODO(user): is it worth it to worry about the last character of large matchers? 797 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 798 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 799 String suffix = ".negate()"; 800 String negatedDescription = description.endsWith(suffix) 801 ? description.substring(0, description.length() - suffix.length()) 802 : description + suffix; 803 return new NegatedFastMatcher(toString(), 804 precomputedPositive(negatedCharacters, table, negatedDescription)); 805 } 806 } 807 808 /** 809 * A matcher for which precomputation will not yield any significant benefit. 810 */ 811 abstract static class FastMatcher extends CharMatcher { 812 FastMatcher() { 813 super(); 814 } 815 816 FastMatcher(String description) { 817 super(description); 818 } 819 820 @Override 821 public final CharMatcher precomputed() { 822 return this; 823 } 824 825 @Override 826 public CharMatcher negate() { 827 return new NegatedFastMatcher(this); 828 } 829 } 830 831 static final class NegatedFastMatcher extends NegatedMatcher { 832 NegatedFastMatcher(CharMatcher original) { 833 super(original); 834 } 835 836 NegatedFastMatcher(String toString, CharMatcher original) { 837 super(toString, original); 838 } 839 840 @Override 841 public final CharMatcher precomputed() { 842 return this; 843 } 844 845 @Override 846 CharMatcher withToString(String description) { 847 return new NegatedFastMatcher(description, original); 848 } 849 } 850 851 /** 852 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 853 */ 854 @GwtIncompatible("java.util.BitSet") 855 private static CharMatcher precomputedPositive( 856 int totalCharacters, 857 BitSet table, 858 String description) { 859 switch (totalCharacters) { 860 case 0: 861 return NONE; 862 case 1: 863 return is((char) table.nextSetBit(0)); 864 case 2: 865 char c1 = (char) table.nextSetBit(0); 866 char c2 = (char) table.nextSetBit(c1 + 1); 867 return isEither(c1, c2); 868 default: 869 return isSmall(totalCharacters, table.length()) 870 ? SmallCharMatcher.from(table, description) 871 : new BitSetMatcher(table, description); 872 } 873 } 874 875 @GwtIncompatible("SmallCharMatcher") 876 private static boolean isSmall(int totalCharacters, int tableLength) { 877 return totalCharacters <= SmallCharMatcher.MAX_SIZE 878 && tableLength > (totalCharacters * 4 * Character.SIZE); 879 // err on the side of BitSetMatcher 880 } 881 882 @GwtIncompatible("java.util.BitSet") 883 private static class BitSetMatcher extends FastMatcher { 884 private final BitSet table; 885 886 private BitSetMatcher(BitSet table, String description) { 887 super(description); 888 if (table.length() + Long.SIZE < table.size()) { 889 table = (BitSet) table.clone(); 890 // If only we could actually call BitSet.trimToSize() ourselves... 891 } 892 this.table = table; 893 } 894 895 @Override public boolean matches(char c) { 896 return table.get(c); 897 } 898 899 @Override 900 void setBits(BitSet bitSet) { 901 bitSet.or(table); 902 } 903 } 904 905 /** 906 * Sets bits in {@code table} matched by this matcher. 907 */ 908 @GwtIncompatible("java.util.BitSet") 909 void setBits(BitSet table) { 910 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 911 if (matches((char) c)) { 912 table.set(c); 913 } 914 } 915 } 916 917 // Text processing routines 918 919 /** 920 * Returns {@code true} if a character sequence contains at least one matching character. 921 * Equivalent to {@code !matchesNoneOf(sequence)}. 922 * 923 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 924 * character, until this returns {@code true} or the end is reached. 925 * 926 * @param sequence the character sequence to examine, possibly empty 927 * @return {@code true} if this matcher matches at least one character in the sequence 928 * @since 8.0 929 */ 930 public boolean matchesAnyOf(CharSequence sequence) { 931 return !matchesNoneOf(sequence); 932 } 933 934 /** 935 * Returns {@code true} if a character sequence contains only matching characters. 936 * 937 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 938 * character, until this returns {@code false} or the end is reached. 939 * 940 * @param sequence the character sequence to examine, possibly empty 941 * @return {@code true} if this matcher matches every character in the sequence, including when 942 * the sequence is empty 943 */ 944 public boolean matchesAllOf(CharSequence sequence) { 945 for (int i = sequence.length() - 1; i >= 0; i--) { 946 if (!matches(sequence.charAt(i))) { 947 return false; 948 } 949 } 950 return true; 951 } 952 953 /** 954 * Returns {@code true} if a character sequence contains no matching characters. Equivalent to 955 * {@code !matchesAnyOf(sequence)}. 956 * 957 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 958 * character, until this returns {@code false} or the end is reached. 959 * 960 * @param sequence the character sequence to examine, possibly empty 961 * @return {@code true} if this matcher matches every character in the sequence, including when 962 * the sequence is empty 963 */ 964 public boolean matchesNoneOf(CharSequence sequence) { 965 return indexIn(sequence) == -1; 966 } 967 968 /** 969 * Returns the index of the first matching character in a character sequence, or {@code -1} if no 970 * matching character is present. 971 * 972 * <p>The default implementation iterates over the sequence in forward order calling {@link 973 * #matches} for each character. 974 * 975 * @param sequence the character sequence to examine from the beginning 976 * @return an index, or {@code -1} if no character matches 977 */ 978 public int indexIn(CharSequence sequence) { 979 int length = sequence.length(); 980 for (int i = 0; i < length; i++) { 981 if (matches(sequence.charAt(i))) { 982 return i; 983 } 984 } 985 return -1; 986 } 987 988 /** 989 * Returns the index of the first matching character in a character sequence, starting from a 990 * given position, or {@code -1} if no character matches after that position. 991 * 992 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 993 * start}, calling {@link #matches} for each character. 994 * 995 * @param sequence the character sequence to examine 996 * @param start the first index to examine; must be nonnegative and no greater than {@code 997 * sequence.length()} 998 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 999 * or {@code -1} if no character matches 1000 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 1001 * sequence.length()} 1002 */ 1003 public int indexIn(CharSequence sequence, int start) { 1004 int length = sequence.length(); 1005 Preconditions.checkPositionIndex(start, length); 1006 for (int i = start; i < length; i++) { 1007 if (matches(sequence.charAt(i))) { 1008 return i; 1009 } 1010 } 1011 return -1; 1012 } 1013 1014 /** 1015 * Returns the index of the last matching character in a character sequence, or {@code -1} if no 1016 * matching character is present. 1017 * 1018 * <p>The default implementation iterates over the sequence in reverse order calling {@link 1019 * #matches} for each character. 1020 * 1021 * @param sequence the character sequence to examine from the end 1022 * @return an index, or {@code -1} if no character matches 1023 */ 1024 public int lastIndexIn(CharSequence sequence) { 1025 for (int i = sequence.length() - 1; i >= 0; i--) { 1026 if (matches(sequence.charAt(i))) { 1027 return i; 1028 } 1029 } 1030 return -1; 1031 } 1032 1033 /** 1034 * Returns the number of matching characters found in a character sequence. 1035 */ 1036 public int countIn(CharSequence sequence) { 1037 int count = 0; 1038 for (int i = 0; i < sequence.length(); i++) { 1039 if (matches(sequence.charAt(i))) { 1040 count++; 1041 } 1042 } 1043 return count; 1044 } 1045 1046 /** 1047 * Returns a string containing all non-matching characters of a character sequence, in order. For 1048 * example: <pre> {@code 1049 * 1050 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 1051 * 1052 * ... returns {@code "bzr"}. 1053 */ 1054 @CheckReturnValue 1055 public String removeFrom(CharSequence sequence) { 1056 String string = sequence.toString(); 1057 int pos = indexIn(string); 1058 if (pos == -1) { 1059 return string; 1060 } 1061 1062 char[] chars = string.toCharArray(); 1063 int spread = 1; 1064 1065 // This unusual loop comes from extensive benchmarking 1066 OUT: while (true) { 1067 pos++; 1068 while (true) { 1069 if (pos == chars.length) { 1070 break OUT; 1071 } 1072 if (matches(chars[pos])) { 1073 break; 1074 } 1075 chars[pos - spread] = chars[pos]; 1076 pos++; 1077 } 1078 spread++; 1079 } 1080 return new String(chars, 0, pos - spread); 1081 } 1082 1083 /** 1084 * Returns a string containing all matching characters of a character sequence, in order. For 1085 * example: <pre> {@code 1086 * 1087 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 1088 * 1089 * ... returns {@code "aaa"}. 1090 */ 1091 @CheckReturnValue 1092 public String retainFrom(CharSequence sequence) { 1093 return negate().removeFrom(sequence); 1094 } 1095 1096 /** 1097 * Returns a string copy of the input character sequence, with each character that matches this 1098 * matcher replaced by a given replacement character. For example: <pre> {@code 1099 * 1100 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 1101 * 1102 * ... returns {@code "rodor"}. 1103 * 1104 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1105 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1106 * character. 1107 * 1108 * @param sequence the character sequence to replace matching characters in 1109 * @param replacement the character to append to the result string in place of each matching 1110 * character in {@code sequence} 1111 * @return the new string 1112 */ 1113 @CheckReturnValue 1114 public String replaceFrom(CharSequence sequence, char replacement) { 1115 String string = sequence.toString(); 1116 int pos = indexIn(string); 1117 if (pos == -1) { 1118 return string; 1119 } 1120 char[] chars = string.toCharArray(); 1121 chars[pos] = replacement; 1122 for (int i = pos + 1; i < chars.length; i++) { 1123 if (matches(chars[i])) { 1124 chars[i] = replacement; 1125 } 1126 } 1127 return new String(chars); 1128 } 1129 1130 /** 1131 * Returns a string copy of the input character sequence, with each character that matches this 1132 * matcher replaced by a given replacement sequence. For example: <pre> {@code 1133 * 1134 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 1135 * 1136 * ... returns {@code "yoohoo"}. 1137 * 1138 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 1139 * off calling {@link #replaceFrom(CharSequence, char)} directly. 1140 * 1141 * @param sequence the character sequence to replace matching characters in 1142 * @param replacement the characters to append to the result string in place of each matching 1143 * character in {@code sequence} 1144 * @return the new string 1145 */ 1146 @CheckReturnValue 1147 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1148 int replacementLen = replacement.length(); 1149 if (replacementLen == 0) { 1150 return removeFrom(sequence); 1151 } 1152 if (replacementLen == 1) { 1153 return replaceFrom(sequence, replacement.charAt(0)); 1154 } 1155 1156 String string = sequence.toString(); 1157 int pos = indexIn(string); 1158 if (pos == -1) { 1159 return string; 1160 } 1161 1162 int len = string.length(); 1163 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 1164 1165 int oldpos = 0; 1166 do { 1167 buf.append(string, oldpos, pos); 1168 buf.append(replacement); 1169 oldpos = pos + 1; 1170 pos = indexIn(string, oldpos); 1171 } while (pos != -1); 1172 1173 buf.append(string, oldpos, len); 1174 return buf.toString(); 1175 } 1176 1177 /** 1178 * Returns a substring of the input character sequence that omits all characters this matcher 1179 * matches from the beginning and from the end of the string. For example: <pre> {@code 1180 * 1181 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 1182 * 1183 * ... returns {@code "cat"}. 1184 * 1185 * <p>Note that: <pre> {@code 1186 * 1187 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 1188 * 1189 * ... is equivalent to {@link String#trim()}. 1190 */ 1191 @CheckReturnValue 1192 public String trimFrom(CharSequence sequence) { 1193 int len = sequence.length(); 1194 int first; 1195 int last; 1196 1197 for (first = 0; first < len; first++) { 1198 if (!matches(sequence.charAt(first))) { 1199 break; 1200 } 1201 } 1202 for (last = len - 1; last > first; last--) { 1203 if (!matches(sequence.charAt(last))) { 1204 break; 1205 } 1206 } 1207 1208 return sequence.subSequence(first, last + 1).toString(); 1209 } 1210 1211 /** 1212 * Returns a substring of the input character sequence that omits all characters this matcher 1213 * matches from the beginning of the string. For example: <pre> {@code 1214 * 1215 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1216 * 1217 * ... returns {@code "catbab"}. 1218 */ 1219 @CheckReturnValue 1220 public String trimLeadingFrom(CharSequence sequence) { 1221 int len = sequence.length(); 1222 for (int first = 0; first < len; first++) { 1223 if (!matches(sequence.charAt(first))) { 1224 return sequence.subSequence(first, len).toString(); 1225 } 1226 } 1227 return ""; 1228 } 1229 1230 /** 1231 * Returns a substring of the input character sequence that omits all characters this matcher 1232 * matches from the end of the string. For example: <pre> {@code 1233 * 1234 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1235 * 1236 * ... returns {@code "abacat"}. 1237 */ 1238 @CheckReturnValue 1239 public String trimTrailingFrom(CharSequence sequence) { 1240 int len = sequence.length(); 1241 for (int last = len - 1; last >= 0; last--) { 1242 if (!matches(sequence.charAt(last))) { 1243 return sequence.subSequence(0, last + 1).toString(); 1244 } 1245 } 1246 return ""; 1247 } 1248 1249 /** 1250 * Returns a string copy of the input character sequence, with each group of consecutive 1251 * characters that match this matcher replaced by a single replacement character. For example: 1252 * <pre> {@code 1253 * 1254 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1255 * 1256 * ... returns {@code "b-p-r"}. 1257 * 1258 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1259 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1260 * character. 1261 * 1262 * @param sequence the character sequence to replace matching groups of characters in 1263 * @param replacement the character to append to the result string in place of each group of 1264 * matching characters in {@code sequence} 1265 * @return the new string 1266 */ 1267 @CheckReturnValue 1268 public String collapseFrom(CharSequence sequence, char replacement) { 1269 // This implementation avoids unnecessary allocation. 1270 int len = sequence.length(); 1271 for (int i = 0; i < len; i++) { 1272 char c = sequence.charAt(i); 1273 if (matches(c)) { 1274 if (c == replacement 1275 && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1276 // a no-op replacement 1277 i++; 1278 } else { 1279 StringBuilder builder = new StringBuilder(len) 1280 .append(sequence.subSequence(0, i)) 1281 .append(replacement); 1282 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1283 } 1284 } 1285 } 1286 // no replacement needed 1287 return sequence.toString(); 1288 } 1289 1290 /** 1291 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1292 * groups of matching characters at the start or end of the sequence are removed without 1293 * replacement. 1294 */ 1295 @CheckReturnValue 1296 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1297 // This implementation avoids unnecessary allocation. 1298 int len = sequence.length(); 1299 int first; 1300 int last; 1301 1302 for (first = 0; first < len && matches(sequence.charAt(first)); first++) {} 1303 for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {} 1304 1305 return (first == 0 && last == len - 1) 1306 ? collapseFrom(sequence, replacement) 1307 : finishCollapseFrom( 1308 sequence, first, last + 1, replacement, 1309 new StringBuilder(last + 1 - first), 1310 false); 1311 } 1312 1313 private String finishCollapseFrom( 1314 CharSequence sequence, int start, int end, char replacement, 1315 StringBuilder builder, boolean inMatchingGroup) { 1316 for (int i = start; i < end; i++) { 1317 char c = sequence.charAt(i); 1318 if (matches(c)) { 1319 if (!inMatchingGroup) { 1320 builder.append(replacement); 1321 inMatchingGroup = true; 1322 } 1323 } else { 1324 builder.append(c); 1325 inMatchingGroup = false; 1326 } 1327 } 1328 return builder.toString(); 1329 } 1330 1331 /** 1332 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 1333 * instead. 1334 */ 1335 @Deprecated 1336 @Override 1337 public boolean apply(Character character) { 1338 return matches(character); 1339 } 1340 1341 /** 1342 * Returns a string representation of this {@code CharMatcher}, such as 1343 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1344 */ 1345 @Override 1346 public String toString() { 1347 return description; 1348 } 1349 1350 /** 1351 * Determines whether a character is whitespace according to the latest Unicode standard, as 1352 * illustrated 1353 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 1354 * This is not the same definition used by other Java APIs. (See a 1355 * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several 1356 * definitions of "whitespace"</a>.) 1357 * 1358 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up 1359 * to date. 1360 */ 1361 public static final CharMatcher WHITESPACE = new FastMatcher("WHITESPACE") { 1362 private static final String TABLE = "\u0009\u3000\n\u0009\u0009\u0009\u202F\u0009" 1363 + "\u0009\u2001\u2006\u0009\u0009\u0009\u0009\u0009" 1364 + "\u180E\u0009\u2029\u0009\u0009\u0009\u2000\u2005" 1365 + "\u200A\u0009\u0009\u0009\r\u0009\u0009\u2028" 1366 + "\u1680\u0009\u00A0\u0009\u2004\u2009\u0009\u0009" 1367 + "\u0009\u000C\u205F\u0009\u0009\u0020\u0009\u0009" 1368 + "\u2003\u2008\u0009\u0009\u0009\u000B\u0085\u0009" 1369 + "\u0009\u0009\u0009\u0009\u0009\u2002\u2007\u0009"; 1370 1371 @Override 1372 public boolean matches(char c) { 1373 return TABLE.charAt((-844444961 * c) >>> 26) == c; 1374 } 1375 1376 @GwtIncompatible("java.util.BitSet") 1377 @Override 1378 void setBits(BitSet table) { 1379 for (int i = 0; i < TABLE.length(); i++) { 1380 table.set(TABLE.charAt(i)); 1381 } 1382 } 1383 }; 1384}