001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 059 * 060 * @author Kevin Bourrillion 061 * @since 1.0 062 */ 063@GwtCompatible(emulated = true) 064@ElementTypesAreNonnullByDefault 065public abstract class CharMatcher implements Predicate<Character> { 066 /* 067 * N777777777NO 068 * N7777777777777N 069 * M777777777777777N 070 * $N877777777D77777M 071 * N M77777777ONND777M 072 * MN777777777NN D777 073 * N7ZN777777777NN ~M7778 074 * N777777777777MMNN88777N 075 * N777777777777MNZZZ7777O 076 * DZN7777O77777777777777 077 * N7OONND7777777D77777N 078 * 8$M++++?N???$77777$ 079 * M7++++N+M77777777N 080 * N77O777777777777$ M 081 * DNNM$$$$777777N D 082 * N$N:=N$777N7777M NZ 083 * 77Z::::N777777777 ODZZZ 084 * 77N::::::N77777777M NNZZZ$ 085 * $777:::::::77777777MN ZM8ZZZZZ 086 * 777M::::::Z7777777Z77 N++ZZZZNN 087 * 7777M:::::M7777777$777M $++IZZZZM 088 * M777$:::::N777777$M7777M +++++ZZZDN 089 * NN$::::::7777$$M777777N N+++ZZZZNZ 090 * N::::::N:7$O:77777777 N++++ZZZZN 091 * M::::::::::::N77777777+ +?+++++ZZZM 092 * 8::::::::::::D77777777M O+++++ZZ 093 * ::::::::::::M777777777N O+?D 094 * M:::::::::::M77777777778 77= 095 * D=::::::::::N7777777777N 777 096 * INN===::::::=77777777777N I777N 097 * ?777N========N7777777777787M N7777 098 * 77777$D======N77777777777N777N? N777777 099 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 100 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 101 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 102 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 103 * 7 :N MNN$$$$M$ $$$777$8 8D8I 104 * NMM.:7O 777777778 105 * 7777777MN 106 * M NO .7: 107 * M : M 108 * 8 109 */ 110 111 // Constant matcher factory methods 112 113 /** 114 * Matches any character. 115 * 116 * @since 19.0 (since 1.0 as constant {@code ANY}) 117 */ 118 public static CharMatcher any() { 119 return Any.INSTANCE; 120 } 121 122 /** 123 * Matches no characters. 124 * 125 * @since 19.0 (since 1.0 as constant {@code NONE}) 126 */ 127 public static CharMatcher none() { 128 return None.INSTANCE; 129 } 130 131 /** 132 * Determines whether a character is whitespace according to the latest Unicode standard, as 133 * illustrated <a 134 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 135 * This is not the same definition used by other Java APIs. (See a <a 136 * href="https://docs.google.com/spreadsheets/d/1kq4ECwPjHX9B8QUCTPclgsDCXYaj7T-FlT4tB5q3ahk/edit">comparison 137 * of several definitions of "whitespace"</a>.) 138 * 139 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 140 * 141 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 142 * date. 143 * 144 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 145 */ 146 public static CharMatcher whitespace() { 147 return Whitespace.INSTANCE; 148 } 149 150 /** 151 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 152 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 153 * discussion of that term. 154 * 155 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 156 */ 157 public static CharMatcher breakingWhitespace() { 158 return BreakingWhitespace.INSTANCE; 159 } 160 161 /** 162 * Determines whether a character is ASCII, meaning that its code point is less than 128. 163 * 164 * @since 19.0 (since 1.0 as constant {@code ASCII}) 165 */ 166 public static CharMatcher ascii() { 167 return Ascii.INSTANCE; 168 } 169 170 /** 171 * Determines whether a character is a BMP digit according to <a 172 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 173 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 174 * 175 * @deprecated Many digits are supplementary characters; see the class documentation. 176 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 177 */ 178 @Deprecated 179 public static CharMatcher digit() { 180 return Digit.INSTANCE; 181 } 182 183 /** 184 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 185 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 186 * '9')}. 187 * 188 * @deprecated Many digits are supplementary characters; see the class documentation. 189 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 190 */ 191 @Deprecated 192 public static CharMatcher javaDigit() { 193 return JavaDigit.INSTANCE; 194 } 195 196 /** 197 * Determines whether a character is a BMP letter according to {@linkplain 198 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 199 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 200 * 201 * @deprecated Most letters are supplementary characters; see the class documentation. 202 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 203 */ 204 @Deprecated 205 public static CharMatcher javaLetter() { 206 return JavaLetter.INSTANCE; 207 } 208 209 /** 210 * Determines whether a character is a BMP letter or digit according to {@linkplain 211 * Character#isLetterOrDigit(char) Java's definition}. 212 * 213 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 214 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 215 */ 216 @Deprecated 217 public static CharMatcher javaLetterOrDigit() { 218 return JavaLetterOrDigit.INSTANCE; 219 } 220 221 /** 222 * Determines whether a BMP character is upper case according to {@linkplain 223 * Character#isUpperCase(char) Java's definition}. 224 * 225 * @deprecated Some uppercase characters are supplementary characters; see the class 226 * documentation. 227 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 228 */ 229 @Deprecated 230 public static CharMatcher javaUpperCase() { 231 return JavaUpperCase.INSTANCE; 232 } 233 234 /** 235 * Determines whether a BMP character is lower case according to {@linkplain 236 * Character#isLowerCase(char) Java's definition}. 237 * 238 * @deprecated Some lowercase characters are supplementary characters; see the class 239 * documentation. 240 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 241 */ 242 @Deprecated 243 public static CharMatcher javaLowerCase() { 244 return JavaLowerCase.INSTANCE; 245 } 246 247 /** 248 * Determines whether a character is an ISO control character as specified by {@link 249 * Character#isISOControl(char)}. 250 * 251 * <p>All ISO control codes are on the BMP and thus supported by this API. 252 * 253 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 254 */ 255 public static CharMatcher javaIsoControl() { 256 return JavaIsoControl.INSTANCE; 257 } 258 259 /** 260 * Determines whether a character is invisible; that is, if its Unicode category is any of 261 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 262 * PRIVATE_USE according to ICU4J. 263 * 264 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 265 * 266 * @deprecated Most invisible characters are supplementary characters; see the class 267 * documentation. 268 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 269 */ 270 @Deprecated 271 public static CharMatcher invisible() { 272 return Invisible.INSTANCE; 273 } 274 275 /** 276 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 277 * errs on the side of returning {@code false} (that is, it tends to assume a character is 278 * double-width). 279 * 280 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 281 * date. 282 * 283 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 284 * 285 * @deprecated Many such characters are supplementary characters; see the class documentation. 286 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 287 */ 288 @Deprecated 289 public static CharMatcher singleWidth() { 290 return SingleWidth.INSTANCE; 291 } 292 293 // Static factories 294 295 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 296 public static CharMatcher is(final char match) { 297 return new Is(match); 298 } 299 300 /** 301 * Returns a {@code char} matcher that matches any character except the BMP character specified. 302 * 303 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 304 */ 305 public static CharMatcher isNot(final char match) { 306 return new IsNot(match); 307 } 308 309 /** 310 * Returns a {@code char} matcher that matches any BMP character present in the given character 311 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 312 */ 313 public static CharMatcher anyOf(final CharSequence sequence) { 314 switch (sequence.length()) { 315 case 0: 316 return none(); 317 case 1: 318 return is(sequence.charAt(0)); 319 case 2: 320 return isEither(sequence.charAt(0), sequence.charAt(1)); 321 default: 322 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 323 // matcher? 324 return new AnyOf(sequence); 325 } 326 } 327 328 /** 329 * Returns a {@code char} matcher that matches any BMP character not present in the given 330 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 331 */ 332 public static CharMatcher noneOf(CharSequence sequence) { 333 return anyOf(sequence).negate(); 334 } 335 336 /** 337 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 338 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 339 * CharMatcher.inRange('a', 'z')}. 340 * 341 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 342 */ 343 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 344 return new InRange(startInclusive, endInclusive); 345 } 346 347 /** 348 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 349 * which operates on primitive {@code char} instances instead. 350 */ 351 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 352 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 353 } 354 355 // Constructors 356 357 /** 358 * Constructor for use by subclasses. When subclassing, you may want to override {@code 359 * toString()} to provide a useful description. 360 */ 361 protected CharMatcher() {} 362 363 // Abstract methods 364 365 /** Determines a true or false value for the given character. */ 366 public abstract boolean matches(char c); 367 368 // Non-static factories 369 370 /** Returns a matcher that matches any character not matched by this matcher. */ 371 // This is not an override in java7, where Guava's Predicate does not extend the JDK's Predicate. 372 @SuppressWarnings("MissingOverride") 373 public CharMatcher negate() { 374 return new Negated(this); 375 } 376 377 /** 378 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 379 */ 380 public CharMatcher and(CharMatcher other) { 381 return new And(this, other); 382 } 383 384 /** 385 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 386 */ 387 public CharMatcher or(CharMatcher other) { 388 return new Or(this, other); 389 } 390 391 /** 392 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 393 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 394 * worthwhile only if the precomputed matcher is queried many thousands of times. 395 * 396 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 397 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 398 * worthwhile tradeoff in a browser. 399 */ 400 public CharMatcher precomputed() { 401 return Platform.precomputeCharMatcher(this); 402 } 403 404 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 405 406 /** 407 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 408 * on {@link Platform} so that we can have different behavior in GWT. 409 * 410 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 411 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 412 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 413 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 414 * matcher which is faster to query than the original. 415 */ 416 @GwtIncompatible // SmallCharMatcher 417 CharMatcher precomputedInternal() { 418 final BitSet table = new BitSet(); 419 setBits(table); 420 int totalCharacters = table.cardinality(); 421 if (totalCharacters * 2 <= DISTINCT_CHARS) { 422 return precomputedPositive(totalCharacters, table, toString()); 423 } else { 424 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 425 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 426 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 427 String suffix = ".negate()"; 428 final String description = toString(); 429 String negatedDescription = 430 description.endsWith(suffix) 431 ? description.substring(0, description.length() - suffix.length()) 432 : description + suffix; 433 return new NegatedFastMatcher( 434 precomputedPositive(negatedCharacters, table, negatedDescription)) { 435 @Override 436 public String toString() { 437 return description; 438 } 439 }; 440 } 441 } 442 443 /** 444 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 445 */ 446 @GwtIncompatible // SmallCharMatcher 447 private static CharMatcher precomputedPositive( 448 int totalCharacters, BitSet table, String description) { 449 switch (totalCharacters) { 450 case 0: 451 return none(); 452 case 1: 453 return is((char) table.nextSetBit(0)); 454 case 2: 455 char c1 = (char) table.nextSetBit(0); 456 char c2 = (char) table.nextSetBit(c1 + 1); 457 return isEither(c1, c2); 458 default: 459 return isSmall(totalCharacters, table.length()) 460 ? SmallCharMatcher.from(table, description) 461 : new BitSetMatcher(table, description); 462 } 463 } 464 465 @GwtIncompatible // SmallCharMatcher 466 private static boolean isSmall(int totalCharacters, int tableLength) { 467 return totalCharacters <= SmallCharMatcher.MAX_SIZE 468 && tableLength > (totalCharacters * 4 * Character.SIZE); 469 // err on the side of BitSetMatcher 470 } 471 472 /** Sets bits in {@code table} matched by this matcher. */ 473 @GwtIncompatible // used only from other GwtIncompatible code 474 void setBits(BitSet table) { 475 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 476 if (matches((char) c)) { 477 table.set(c); 478 } 479 } 480 } 481 482 // Text processing routines 483 484 /** 485 * Returns {@code true} if a character sequence contains at least one matching BMP character. 486 * Equivalent to {@code !matchesNoneOf(sequence)}. 487 * 488 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 489 * character, until this returns {@code true} or the end is reached. 490 * 491 * @param sequence the character sequence to examine, possibly empty 492 * @return {@code true} if this matcher matches at least one character in the sequence 493 * @since 8.0 494 */ 495 public boolean matchesAnyOf(CharSequence sequence) { 496 return !matchesNoneOf(sequence); 497 } 498 499 /** 500 * Returns {@code true} if a character sequence contains only matching BMP characters. 501 * 502 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 503 * character, until this returns {@code false} or the end is reached. 504 * 505 * @param sequence the character sequence to examine, possibly empty 506 * @return {@code true} if this matcher matches every character in the sequence, including when 507 * the sequence is empty 508 */ 509 public boolean matchesAllOf(CharSequence sequence) { 510 for (int i = sequence.length() - 1; i >= 0; i--) { 511 if (!matches(sequence.charAt(i))) { 512 return false; 513 } 514 } 515 return true; 516 } 517 518 /** 519 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 520 * {@code !matchesAnyOf(sequence)}. 521 * 522 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 523 * character, until this returns {@code true} or the end is reached. 524 * 525 * @param sequence the character sequence to examine, possibly empty 526 * @return {@code true} if this matcher matches no characters in the sequence, including when the 527 * sequence is empty 528 */ 529 public boolean matchesNoneOf(CharSequence sequence) { 530 return indexIn(sequence) == -1; 531 } 532 533 /** 534 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 535 * no matching character is present. 536 * 537 * <p>The default implementation iterates over the sequence in forward order calling {@link 538 * #matches} for each character. 539 * 540 * @param sequence the character sequence to examine from the beginning 541 * @return an index, or {@code -1} if no character matches 542 */ 543 public int indexIn(CharSequence sequence) { 544 return indexIn(sequence, 0); 545 } 546 547 /** 548 * Returns the index of the first matching BMP character in a character sequence, starting from a 549 * given position, or {@code -1} if no character matches after that position. 550 * 551 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 552 * start}, calling {@link #matches} for each character. 553 * 554 * @param sequence the character sequence to examine 555 * @param start the first index to examine; must be nonnegative and no greater than {@code 556 * sequence.length()} 557 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 558 * or {@code -1} if no character matches 559 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 560 * sequence.length()} 561 */ 562 public int indexIn(CharSequence sequence, int start) { 563 int length = sequence.length(); 564 checkPositionIndex(start, length); 565 for (int i = start; i < length; i++) { 566 if (matches(sequence.charAt(i))) { 567 return i; 568 } 569 } 570 return -1; 571 } 572 573 /** 574 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 575 * no matching character is present. 576 * 577 * <p>The default implementation iterates over the sequence in reverse order calling {@link 578 * #matches} for each character. 579 * 580 * @param sequence the character sequence to examine from the end 581 * @return an index, or {@code -1} if no character matches 582 */ 583 public int lastIndexIn(CharSequence sequence) { 584 for (int i = sequence.length() - 1; i >= 0; i--) { 585 if (matches(sequence.charAt(i))) { 586 return i; 587 } 588 } 589 return -1; 590 } 591 592 /** 593 * Returns the number of matching {@code char}s found in a character sequence. 594 * 595 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 596 */ 597 public int countIn(CharSequence sequence) { 598 int count = 0; 599 for (int i = 0; i < sequence.length(); i++) { 600 if (matches(sequence.charAt(i))) { 601 count++; 602 } 603 } 604 return count; 605 } 606 607 /** 608 * Returns a string containing all non-matching characters of a character sequence, in order. For 609 * example: 610 * 611 * <pre>{@code 612 * CharMatcher.is('a').removeFrom("bazaar") 613 * }</pre> 614 * 615 * ... returns {@code "bzr"}. 616 */ 617 public String removeFrom(CharSequence sequence) { 618 String string = sequence.toString(); 619 int pos = indexIn(string); 620 if (pos == -1) { 621 return string; 622 } 623 624 char[] chars = string.toCharArray(); 625 int spread = 1; 626 627 // This unusual loop comes from extensive benchmarking 628 OUT: 629 while (true) { 630 pos++; 631 while (true) { 632 if (pos == chars.length) { 633 break OUT; 634 } 635 if (matches(chars[pos])) { 636 break; 637 } 638 chars[pos - spread] = chars[pos]; 639 pos++; 640 } 641 spread++; 642 } 643 return new String(chars, 0, pos - spread); 644 } 645 646 /** 647 * Returns a string containing all matching BMP characters of a character sequence, in order. For 648 * example: 649 * 650 * <pre>{@code 651 * CharMatcher.is('a').retainFrom("bazaar") 652 * }</pre> 653 * 654 * ... returns {@code "aaa"}. 655 */ 656 public String retainFrom(CharSequence sequence) { 657 return negate().removeFrom(sequence); 658 } 659 660 /** 661 * Returns a string copy of the input character sequence, with each matching BMP character 662 * replaced by a given replacement character. For example: 663 * 664 * <pre>{@code 665 * CharMatcher.is('a').replaceFrom("radar", 'o') 666 * }</pre> 667 * 668 * ... returns {@code "rodor"}. 669 * 670 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 671 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 672 * character. 673 * 674 * @param sequence the character sequence to replace matching characters in 675 * @param replacement the character to append to the result string in place of each matching 676 * character in {@code sequence} 677 * @return the new string 678 */ 679 public String replaceFrom(CharSequence sequence, char replacement) { 680 String string = sequence.toString(); 681 int pos = indexIn(string); 682 if (pos == -1) { 683 return string; 684 } 685 char[] chars = string.toCharArray(); 686 chars[pos] = replacement; 687 for (int i = pos + 1; i < chars.length; i++) { 688 if (matches(chars[i])) { 689 chars[i] = replacement; 690 } 691 } 692 return new String(chars); 693 } 694 695 /** 696 * Returns a string copy of the input character sequence, with each matching BMP character 697 * replaced by a given replacement sequence. For example: 698 * 699 * <pre>{@code 700 * CharMatcher.is('a').replaceFrom("yaha", "oo") 701 * }</pre> 702 * 703 * ... returns {@code "yoohoo"}. 704 * 705 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 706 * off calling {@link #replaceFrom(CharSequence, char)} directly. 707 * 708 * @param sequence the character sequence to replace matching characters in 709 * @param replacement the characters to append to the result string in place of each matching 710 * character in {@code sequence} 711 * @return the new string 712 */ 713 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 714 int replacementLen = replacement.length(); 715 if (replacementLen == 0) { 716 return removeFrom(sequence); 717 } 718 if (replacementLen == 1) { 719 return replaceFrom(sequence, replacement.charAt(0)); 720 } 721 722 String string = sequence.toString(); 723 int pos = indexIn(string); 724 if (pos == -1) { 725 return string; 726 } 727 728 int len = string.length(); 729 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 730 731 int oldpos = 0; 732 do { 733 buf.append(string, oldpos, pos); 734 buf.append(replacement); 735 oldpos = pos + 1; 736 pos = indexIn(string, oldpos); 737 } while (pos != -1); 738 739 buf.append(string, oldpos, len); 740 return buf.toString(); 741 } 742 743 /** 744 * Returns a substring of the input character sequence that omits all matching BMP characters from 745 * the beginning and from the end of the string. For example: 746 * 747 * <pre>{@code 748 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 749 * }</pre> 750 * 751 * ... returns {@code "cat"}. 752 * 753 * <p>Note that: 754 * 755 * <pre>{@code 756 * CharMatcher.inRange('\0', ' ').trimFrom(str) 757 * }</pre> 758 * 759 * ... is equivalent to {@link String#trim()}. 760 */ 761 public String trimFrom(CharSequence sequence) { 762 int len = sequence.length(); 763 int first; 764 int last; 765 766 for (first = 0; first < len; first++) { 767 if (!matches(sequence.charAt(first))) { 768 break; 769 } 770 } 771 for (last = len - 1; last > first; last--) { 772 if (!matches(sequence.charAt(last))) { 773 break; 774 } 775 } 776 777 return sequence.subSequence(first, last + 1).toString(); 778 } 779 780 /** 781 * Returns a substring of the input character sequence that omits all matching BMP characters from 782 * the beginning of the string. For example: 783 * 784 * <pre>{@code 785 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 786 * }</pre> 787 * 788 * ... returns {@code "catbab"}. 789 */ 790 public String trimLeadingFrom(CharSequence sequence) { 791 int len = sequence.length(); 792 for (int first = 0; first < len; first++) { 793 if (!matches(sequence.charAt(first))) { 794 return sequence.subSequence(first, len).toString(); 795 } 796 } 797 return ""; 798 } 799 800 /** 801 * Returns a substring of the input character sequence that omits all matching BMP characters from 802 * the end of the string. For example: 803 * 804 * <pre>{@code 805 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 806 * }</pre> 807 * 808 * ... returns {@code "abacat"}. 809 */ 810 public String trimTrailingFrom(CharSequence sequence) { 811 int len = sequence.length(); 812 for (int last = len - 1; last >= 0; last--) { 813 if (!matches(sequence.charAt(last))) { 814 return sequence.subSequence(0, last + 1).toString(); 815 } 816 } 817 return ""; 818 } 819 820 /** 821 * Returns a string copy of the input character sequence, with each group of consecutive matching 822 * BMP characters replaced by a single replacement character. For example: 823 * 824 * <pre>{@code 825 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 826 * }</pre> 827 * 828 * ... returns {@code "b-p-r"}. 829 * 830 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 831 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 832 * character. 833 * 834 * @param sequence the character sequence to replace matching groups of characters in 835 * @param replacement the character to append to the result string in place of each group of 836 * matching characters in {@code sequence} 837 * @return the new string 838 */ 839 public String collapseFrom(CharSequence sequence, char replacement) { 840 // This implementation avoids unnecessary allocation. 841 int len = sequence.length(); 842 for (int i = 0; i < len; i++) { 843 char c = sequence.charAt(i); 844 if (matches(c)) { 845 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 846 // a no-op replacement 847 i++; 848 } else { 849 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 850 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 851 } 852 } 853 } 854 // no replacement needed 855 return sequence.toString(); 856 } 857 858 /** 859 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 860 * groups of matching BMP characters at the start or end of the sequence are removed without 861 * replacement. 862 */ 863 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 864 // This implementation avoids unnecessary allocation. 865 int len = sequence.length(); 866 int first = 0; 867 int last = len - 1; 868 869 while (first < len && matches(sequence.charAt(first))) { 870 first++; 871 } 872 873 while (last > first && matches(sequence.charAt(last))) { 874 last--; 875 } 876 877 return (first == 0 && last == len - 1) 878 ? collapseFrom(sequence, replacement) 879 : finishCollapseFrom( 880 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 881 } 882 883 private String finishCollapseFrom( 884 CharSequence sequence, 885 int start, 886 int end, 887 char replacement, 888 StringBuilder builder, 889 boolean inMatchingGroup) { 890 for (int i = start; i < end; i++) { 891 char c = sequence.charAt(i); 892 if (matches(c)) { 893 if (!inMatchingGroup) { 894 builder.append(replacement); 895 inMatchingGroup = true; 896 } 897 } else { 898 builder.append(c); 899 inMatchingGroup = false; 900 } 901 } 902 return builder.toString(); 903 } 904 905 /** 906 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 907 * instead. 908 */ 909 @Deprecated 910 @Override 911 public boolean apply(Character character) { 912 return matches(character); 913 } 914 915 /** 916 * Returns a string representation of this {@code CharMatcher}, such as {@code 917 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 918 */ 919 @Override 920 public String toString() { 921 return super.toString(); 922 } 923 924 /** 925 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 926 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 927 */ 928 private static String showCharacter(char c) { 929 String hex = "0123456789ABCDEF"; 930 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 931 for (int i = 0; i < 4; i++) { 932 tmp[5 - i] = hex.charAt(c & 0xF); 933 c = (char) (c >> 4); 934 } 935 return String.copyValueOf(tmp); 936 } 937 938 // Fast matchers 939 940 /** A matcher for which precomputation will not yield any significant benefit. */ 941 abstract static class FastMatcher extends CharMatcher { 942 943 @Override 944 public final CharMatcher precomputed() { 945 return this; 946 } 947 948 @Override 949 public CharMatcher negate() { 950 return new NegatedFastMatcher(this); 951 } 952 } 953 954 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 955 abstract static class NamedFastMatcher extends FastMatcher { 956 957 private final String description; 958 959 NamedFastMatcher(String description) { 960 this.description = checkNotNull(description); 961 } 962 963 @Override 964 public final String toString() { 965 return description; 966 } 967 } 968 969 /** Negation of a {@link FastMatcher}. */ 970 private static class NegatedFastMatcher extends Negated { 971 972 NegatedFastMatcher(CharMatcher original) { 973 super(original); 974 } 975 976 @Override 977 public final CharMatcher precomputed() { 978 return this; 979 } 980 } 981 982 /** Fast matcher using a {@link BitSet} table of matching characters. */ 983 @GwtIncompatible // used only from other GwtIncompatible code 984 private static final class BitSetMatcher extends NamedFastMatcher { 985 986 private final BitSet table; 987 988 private BitSetMatcher(BitSet table, String description) { 989 super(description); 990 if (table.length() + Long.SIZE < table.size()) { 991 table = (BitSet) table.clone(); 992 // If only we could actually call BitSet.trimToSize() ourselves... 993 } 994 this.table = table; 995 } 996 997 @Override 998 public boolean matches(char c) { 999 return table.get(c); 1000 } 1001 1002 @Override 1003 void setBits(BitSet bitSet) { 1004 bitSet.or(table); 1005 } 1006 } 1007 1008 // Static constant implementation classes 1009 1010 /** Implementation of {@link #any()}. */ 1011 private static final class Any extends NamedFastMatcher { 1012 1013 static final CharMatcher INSTANCE = new Any(); 1014 1015 private Any() { 1016 super("CharMatcher.any()"); 1017 } 1018 1019 @Override 1020 public boolean matches(char c) { 1021 return true; 1022 } 1023 1024 @Override 1025 public int indexIn(CharSequence sequence) { 1026 return (sequence.length() == 0) ? -1 : 0; 1027 } 1028 1029 @Override 1030 public int indexIn(CharSequence sequence, int start) { 1031 int length = sequence.length(); 1032 checkPositionIndex(start, length); 1033 return (start == length) ? -1 : start; 1034 } 1035 1036 @Override 1037 public int lastIndexIn(CharSequence sequence) { 1038 return sequence.length() - 1; 1039 } 1040 1041 @Override 1042 public boolean matchesAllOf(CharSequence sequence) { 1043 checkNotNull(sequence); 1044 return true; 1045 } 1046 1047 @Override 1048 public boolean matchesNoneOf(CharSequence sequence) { 1049 return sequence.length() == 0; 1050 } 1051 1052 @Override 1053 public String removeFrom(CharSequence sequence) { 1054 checkNotNull(sequence); 1055 return ""; 1056 } 1057 1058 @Override 1059 public String replaceFrom(CharSequence sequence, char replacement) { 1060 char[] array = new char[sequence.length()]; 1061 Arrays.fill(array, replacement); 1062 return new String(array); 1063 } 1064 1065 @Override 1066 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1067 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1068 for (int i = 0; i < sequence.length(); i++) { 1069 result.append(replacement); 1070 } 1071 return result.toString(); 1072 } 1073 1074 @Override 1075 public String collapseFrom(CharSequence sequence, char replacement) { 1076 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1077 } 1078 1079 @Override 1080 public String trimFrom(CharSequence sequence) { 1081 checkNotNull(sequence); 1082 return ""; 1083 } 1084 1085 @Override 1086 public int countIn(CharSequence sequence) { 1087 return sequence.length(); 1088 } 1089 1090 @Override 1091 public CharMatcher and(CharMatcher other) { 1092 return checkNotNull(other); 1093 } 1094 1095 @Override 1096 public CharMatcher or(CharMatcher other) { 1097 checkNotNull(other); 1098 return this; 1099 } 1100 1101 @Override 1102 public CharMatcher negate() { 1103 return none(); 1104 } 1105 } 1106 1107 /** Implementation of {@link #none()}. */ 1108 private static final class None extends NamedFastMatcher { 1109 1110 static final CharMatcher INSTANCE = new None(); 1111 1112 private None() { 1113 super("CharMatcher.none()"); 1114 } 1115 1116 @Override 1117 public boolean matches(char c) { 1118 return false; 1119 } 1120 1121 @Override 1122 public int indexIn(CharSequence sequence) { 1123 checkNotNull(sequence); 1124 return -1; 1125 } 1126 1127 @Override 1128 public int indexIn(CharSequence sequence, int start) { 1129 int length = sequence.length(); 1130 checkPositionIndex(start, length); 1131 return -1; 1132 } 1133 1134 @Override 1135 public int lastIndexIn(CharSequence sequence) { 1136 checkNotNull(sequence); 1137 return -1; 1138 } 1139 1140 @Override 1141 public boolean matchesAllOf(CharSequence sequence) { 1142 return sequence.length() == 0; 1143 } 1144 1145 @Override 1146 public boolean matchesNoneOf(CharSequence sequence) { 1147 checkNotNull(sequence); 1148 return true; 1149 } 1150 1151 @Override 1152 public String removeFrom(CharSequence sequence) { 1153 return sequence.toString(); 1154 } 1155 1156 @Override 1157 public String replaceFrom(CharSequence sequence, char replacement) { 1158 return sequence.toString(); 1159 } 1160 1161 @Override 1162 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1163 checkNotNull(replacement); 1164 return sequence.toString(); 1165 } 1166 1167 @Override 1168 public String collapseFrom(CharSequence sequence, char replacement) { 1169 return sequence.toString(); 1170 } 1171 1172 @Override 1173 public String trimFrom(CharSequence sequence) { 1174 return sequence.toString(); 1175 } 1176 1177 @Override 1178 public String trimLeadingFrom(CharSequence sequence) { 1179 return sequence.toString(); 1180 } 1181 1182 @Override 1183 public String trimTrailingFrom(CharSequence sequence) { 1184 return sequence.toString(); 1185 } 1186 1187 @Override 1188 public int countIn(CharSequence sequence) { 1189 checkNotNull(sequence); 1190 return 0; 1191 } 1192 1193 @Override 1194 public CharMatcher and(CharMatcher other) { 1195 checkNotNull(other); 1196 return this; 1197 } 1198 1199 @Override 1200 public CharMatcher or(CharMatcher other) { 1201 return checkNotNull(other); 1202 } 1203 1204 @Override 1205 public CharMatcher negate() { 1206 return any(); 1207 } 1208 } 1209 1210 /** Implementation of {@link #whitespace()}. */ 1211 @VisibleForTesting 1212 static final class Whitespace extends NamedFastMatcher { 1213 1214 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1215 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1216 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1217 // other use-cases, the reduction in readability isn't worth it. 1218 static final String TABLE = 1219 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1220 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1221 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1222 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1223 static final int MULTIPLIER = 1682554634; 1224 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1225 1226 static final CharMatcher INSTANCE = new Whitespace(); 1227 1228 Whitespace() { 1229 super("CharMatcher.whitespace()"); 1230 } 1231 1232 @Override 1233 public boolean matches(char c) { 1234 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1235 } 1236 1237 @GwtIncompatible // used only from other GwtIncompatible code 1238 @Override 1239 void setBits(BitSet table) { 1240 for (int i = 0; i < TABLE.length(); i++) { 1241 table.set(TABLE.charAt(i)); 1242 } 1243 } 1244 } 1245 1246 /** Implementation of {@link #breakingWhitespace()}. */ 1247 private static final class BreakingWhitespace extends CharMatcher { 1248 1249 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1250 1251 @Override 1252 public boolean matches(char c) { 1253 switch (c) { 1254 case '\t': 1255 case '\n': 1256 case '\013': 1257 case '\f': 1258 case '\r': 1259 case ' ': 1260 case '\u0085': 1261 case '\u1680': 1262 case '\u2028': 1263 case '\u2029': 1264 case '\u205f': 1265 case '\u3000': 1266 return true; 1267 case '\u2007': 1268 return false; 1269 default: 1270 return c >= '\u2000' && c <= '\u200a'; 1271 } 1272 } 1273 1274 @Override 1275 public String toString() { 1276 return "CharMatcher.breakingWhitespace()"; 1277 } 1278 } 1279 1280 /** Implementation of {@link #ascii()}. */ 1281 private static final class Ascii extends NamedFastMatcher { 1282 1283 static final CharMatcher INSTANCE = new Ascii(); 1284 1285 Ascii() { 1286 super("CharMatcher.ascii()"); 1287 } 1288 1289 @Override 1290 public boolean matches(char c) { 1291 return c <= '\u007f'; 1292 } 1293 } 1294 1295 /** Implementation that matches characters that fall within multiple ranges. */ 1296 private static class RangesMatcher extends CharMatcher { 1297 1298 private final String description; 1299 private final char[] rangeStarts; 1300 private final char[] rangeEnds; 1301 1302 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1303 this.description = description; 1304 this.rangeStarts = rangeStarts; 1305 this.rangeEnds = rangeEnds; 1306 checkArgument(rangeStarts.length == rangeEnds.length); 1307 for (int i = 0; i < rangeStarts.length; i++) { 1308 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1309 if (i + 1 < rangeStarts.length) { 1310 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1311 } 1312 } 1313 } 1314 1315 @Override 1316 public boolean matches(char c) { 1317 int index = Arrays.binarySearch(rangeStarts, c); 1318 if (index >= 0) { 1319 return true; 1320 } else { 1321 index = ~index - 1; 1322 return index >= 0 && c <= rangeEnds[index]; 1323 } 1324 } 1325 1326 @Override 1327 public String toString() { 1328 return description; 1329 } 1330 } 1331 1332 /** Implementation of {@link #digit()}. */ 1333 private static final class Digit extends RangesMatcher { 1334 // Plug the following UnicodeSet pattern into 1335 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1336 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1337 // and get the zeroes from there. 1338 1339 // Must be in ascending order. 1340 private static final String ZEROES = 1341 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1342 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1343 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1344 1345 private static char[] zeroes() { 1346 return ZEROES.toCharArray(); 1347 } 1348 1349 private static char[] nines() { 1350 char[] nines = new char[ZEROES.length()]; 1351 for (int i = 0; i < ZEROES.length(); i++) { 1352 nines[i] = (char) (ZEROES.charAt(i) + 9); 1353 } 1354 return nines; 1355 } 1356 1357 static final CharMatcher INSTANCE = new Digit(); 1358 1359 private Digit() { 1360 super("CharMatcher.digit()", zeroes(), nines()); 1361 } 1362 } 1363 1364 /** Implementation of {@link #javaDigit()}. */ 1365 private static final class JavaDigit extends CharMatcher { 1366 1367 static final CharMatcher INSTANCE = new JavaDigit(); 1368 1369 @Override 1370 public boolean matches(char c) { 1371 return Character.isDigit(c); 1372 } 1373 1374 @Override 1375 public String toString() { 1376 return "CharMatcher.javaDigit()"; 1377 } 1378 } 1379 1380 /** Implementation of {@link #javaLetter()}. */ 1381 private static final class JavaLetter extends CharMatcher { 1382 1383 static final CharMatcher INSTANCE = new JavaLetter(); 1384 1385 @Override 1386 public boolean matches(char c) { 1387 return Character.isLetter(c); 1388 } 1389 1390 @Override 1391 public String toString() { 1392 return "CharMatcher.javaLetter()"; 1393 } 1394 } 1395 1396 /** Implementation of {@link #javaLetterOrDigit()}. */ 1397 private static final class JavaLetterOrDigit extends CharMatcher { 1398 1399 static final CharMatcher INSTANCE = new JavaLetterOrDigit(); 1400 1401 @Override 1402 public boolean matches(char c) { 1403 return Character.isLetterOrDigit(c); 1404 } 1405 1406 @Override 1407 public String toString() { 1408 return "CharMatcher.javaLetterOrDigit()"; 1409 } 1410 } 1411 1412 /** Implementation of {@link #javaUpperCase()}. */ 1413 private static final class JavaUpperCase extends CharMatcher { 1414 1415 static final CharMatcher INSTANCE = new JavaUpperCase(); 1416 1417 @Override 1418 public boolean matches(char c) { 1419 return Character.isUpperCase(c); 1420 } 1421 1422 @Override 1423 public String toString() { 1424 return "CharMatcher.javaUpperCase()"; 1425 } 1426 } 1427 1428 /** Implementation of {@link #javaLowerCase()}. */ 1429 private static final class JavaLowerCase extends CharMatcher { 1430 1431 static final CharMatcher INSTANCE = new JavaLowerCase(); 1432 1433 @Override 1434 public boolean matches(char c) { 1435 return Character.isLowerCase(c); 1436 } 1437 1438 @Override 1439 public String toString() { 1440 return "CharMatcher.javaLowerCase()"; 1441 } 1442 } 1443 1444 /** Implementation of {@link #javaIsoControl()}. */ 1445 private static final class JavaIsoControl extends NamedFastMatcher { 1446 1447 static final CharMatcher INSTANCE = new JavaIsoControl(); 1448 1449 private JavaIsoControl() { 1450 super("CharMatcher.javaIsoControl()"); 1451 } 1452 1453 @Override 1454 public boolean matches(char c) { 1455 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1456 } 1457 } 1458 1459 /** Implementation of {@link #invisible()}. */ 1460 private static final class Invisible extends RangesMatcher { 1461 // Plug the following UnicodeSet pattern into 1462 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1463 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1464 // with the "Abbreviate" option, and get the ranges from there. 1465 private static final String RANGE_STARTS = 1466 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1467 + "\u3000\ud800\ufeff\ufff9"; 1468 private static final String RANGE_ENDS = // inclusive ends 1469 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1470 + "\u3000\uf8ff\ufeff\ufffb"; 1471 1472 static final CharMatcher INSTANCE = new Invisible(); 1473 1474 private Invisible() { 1475 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1476 } 1477 } 1478 1479 /** Implementation of {@link #singleWidth()}. */ 1480 private static final class SingleWidth extends RangesMatcher { 1481 1482 static final CharMatcher INSTANCE = new SingleWidth(); 1483 1484 private SingleWidth() { 1485 super( 1486 "CharMatcher.singleWidth()", 1487 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1488 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1489 } 1490 } 1491 1492 // Non-static factory implementation classes 1493 1494 /** Implementation of {@link #negate()}. */ 1495 private static class Negated extends CharMatcher { 1496 1497 final CharMatcher original; 1498 1499 Negated(CharMatcher original) { 1500 this.original = checkNotNull(original); 1501 } 1502 1503 @Override 1504 public boolean matches(char c) { 1505 return !original.matches(c); 1506 } 1507 1508 @Override 1509 public boolean matchesAllOf(CharSequence sequence) { 1510 return original.matchesNoneOf(sequence); 1511 } 1512 1513 @Override 1514 public boolean matchesNoneOf(CharSequence sequence) { 1515 return original.matchesAllOf(sequence); 1516 } 1517 1518 @Override 1519 public int countIn(CharSequence sequence) { 1520 return sequence.length() - original.countIn(sequence); 1521 } 1522 1523 @GwtIncompatible // used only from other GwtIncompatible code 1524 @Override 1525 void setBits(BitSet table) { 1526 BitSet tmp = new BitSet(); 1527 original.setBits(tmp); 1528 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1529 table.or(tmp); 1530 } 1531 1532 @Override 1533 public CharMatcher negate() { 1534 return original; 1535 } 1536 1537 @Override 1538 public String toString() { 1539 return original + ".negate()"; 1540 } 1541 } 1542 1543 /** Implementation of {@link #and(CharMatcher)}. */ 1544 private static final class And extends CharMatcher { 1545 1546 final CharMatcher first; 1547 final CharMatcher second; 1548 1549 And(CharMatcher a, CharMatcher b) { 1550 first = checkNotNull(a); 1551 second = checkNotNull(b); 1552 } 1553 1554 @Override 1555 public boolean matches(char c) { 1556 return first.matches(c) && second.matches(c); 1557 } 1558 1559 @GwtIncompatible // used only from other GwtIncompatible code 1560 @Override 1561 void setBits(BitSet table) { 1562 BitSet tmp1 = new BitSet(); 1563 first.setBits(tmp1); 1564 BitSet tmp2 = new BitSet(); 1565 second.setBits(tmp2); 1566 tmp1.and(tmp2); 1567 table.or(tmp1); 1568 } 1569 1570 @Override 1571 public String toString() { 1572 return "CharMatcher.and(" + first + ", " + second + ")"; 1573 } 1574 } 1575 1576 /** Implementation of {@link #or(CharMatcher)}. */ 1577 private static final class Or extends CharMatcher { 1578 1579 final CharMatcher first; 1580 final CharMatcher second; 1581 1582 Or(CharMatcher a, CharMatcher b) { 1583 first = checkNotNull(a); 1584 second = checkNotNull(b); 1585 } 1586 1587 @GwtIncompatible // used only from other GwtIncompatible code 1588 @Override 1589 void setBits(BitSet table) { 1590 first.setBits(table); 1591 second.setBits(table); 1592 } 1593 1594 @Override 1595 public boolean matches(char c) { 1596 return first.matches(c) || second.matches(c); 1597 } 1598 1599 @Override 1600 public String toString() { 1601 return "CharMatcher.or(" + first + ", " + second + ")"; 1602 } 1603 } 1604 1605 // Static factory implementations 1606 1607 /** Implementation of {@link #is(char)}. */ 1608 private static final class Is extends FastMatcher { 1609 1610 private final char match; 1611 1612 Is(char match) { 1613 this.match = match; 1614 } 1615 1616 @Override 1617 public boolean matches(char c) { 1618 return c == match; 1619 } 1620 1621 @Override 1622 public String replaceFrom(CharSequence sequence, char replacement) { 1623 return sequence.toString().replace(match, replacement); 1624 } 1625 1626 @Override 1627 public CharMatcher and(CharMatcher other) { 1628 return other.matches(match) ? this : none(); 1629 } 1630 1631 @Override 1632 public CharMatcher or(CharMatcher other) { 1633 return other.matches(match) ? other : super.or(other); 1634 } 1635 1636 @Override 1637 public CharMatcher negate() { 1638 return isNot(match); 1639 } 1640 1641 @GwtIncompatible // used only from other GwtIncompatible code 1642 @Override 1643 void setBits(BitSet table) { 1644 table.set(match); 1645 } 1646 1647 @Override 1648 public String toString() { 1649 return "CharMatcher.is('" + showCharacter(match) + "')"; 1650 } 1651 } 1652 1653 /** Implementation of {@link #isNot(char)}. */ 1654 private static final class IsNot extends FastMatcher { 1655 1656 private final char match; 1657 1658 IsNot(char match) { 1659 this.match = match; 1660 } 1661 1662 @Override 1663 public boolean matches(char c) { 1664 return c != match; 1665 } 1666 1667 @Override 1668 public CharMatcher and(CharMatcher other) { 1669 return other.matches(match) ? super.and(other) : other; 1670 } 1671 1672 @Override 1673 public CharMatcher or(CharMatcher other) { 1674 return other.matches(match) ? any() : this; 1675 } 1676 1677 @GwtIncompatible // used only from other GwtIncompatible code 1678 @Override 1679 void setBits(BitSet table) { 1680 table.set(0, match); 1681 table.set(match + 1, Character.MAX_VALUE + 1); 1682 } 1683 1684 @Override 1685 public CharMatcher negate() { 1686 return is(match); 1687 } 1688 1689 @Override 1690 public String toString() { 1691 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1692 } 1693 } 1694 1695 private static CharMatcher.IsEither isEither(char c1, char c2) { 1696 return new CharMatcher.IsEither(c1, c2); 1697 } 1698 1699 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1700 private static final class IsEither extends FastMatcher { 1701 1702 private final char match1; 1703 private final char match2; 1704 1705 IsEither(char match1, char match2) { 1706 this.match1 = match1; 1707 this.match2 = match2; 1708 } 1709 1710 @Override 1711 public boolean matches(char c) { 1712 return c == match1 || c == match2; 1713 } 1714 1715 @GwtIncompatible // used only from other GwtIncompatible code 1716 @Override 1717 void setBits(BitSet table) { 1718 table.set(match1); 1719 table.set(match2); 1720 } 1721 1722 @Override 1723 public String toString() { 1724 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1725 } 1726 } 1727 1728 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ 1729 private static final class AnyOf extends CharMatcher { 1730 1731 private final char[] chars; 1732 1733 public AnyOf(CharSequence chars) { 1734 this.chars = chars.toString().toCharArray(); 1735 Arrays.sort(this.chars); 1736 } 1737 1738 @Override 1739 public boolean matches(char c) { 1740 return Arrays.binarySearch(chars, c) >= 0; 1741 } 1742 1743 @Override 1744 @GwtIncompatible // used only from other GwtIncompatible code 1745 void setBits(BitSet table) { 1746 for (char c : chars) { 1747 table.set(c); 1748 } 1749 } 1750 1751 @Override 1752 public String toString() { 1753 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 1754 for (char c : chars) { 1755 description.append(showCharacter(c)); 1756 } 1757 description.append("\")"); 1758 return description.toString(); 1759 } 1760 } 1761 1762 /** Implementation of {@link #inRange(char, char)}. */ 1763 private static final class InRange extends FastMatcher { 1764 1765 private final char startInclusive; 1766 private final char endInclusive; 1767 1768 InRange(char startInclusive, char endInclusive) { 1769 checkArgument(endInclusive >= startInclusive); 1770 this.startInclusive = startInclusive; 1771 this.endInclusive = endInclusive; 1772 } 1773 1774 @Override 1775 public boolean matches(char c) { 1776 return startInclusive <= c && c <= endInclusive; 1777 } 1778 1779 @GwtIncompatible // used only from other GwtIncompatible code 1780 @Override 1781 void setBits(BitSet table) { 1782 table.set(startInclusive, endInclusive + 1); 1783 } 1784 1785 @Override 1786 public String toString() { 1787 return "CharMatcher.inRange('" 1788 + showCharacter(startInclusive) 1789 + "', '" 1790 + showCharacter(endInclusive) 1791 + "')"; 1792 } 1793 } 1794 1795 /** Implementation of {@link #forPredicate(Predicate)}. */ 1796 private static final class ForPredicate extends CharMatcher { 1797 1798 private final Predicate<? super Character> predicate; 1799 1800 ForPredicate(Predicate<? super Character> predicate) { 1801 this.predicate = checkNotNull(predicate); 1802 } 1803 1804 @Override 1805 public boolean matches(char c) { 1806 return predicate.apply(c); 1807 } 1808 1809 @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily 1810 @Override 1811 public boolean apply(Character character) { 1812 return predicate.apply(checkNotNull(character)); 1813 } 1814 1815 @Override 1816 public String toString() { 1817 return "CharMatcher.forPredicate(" + predicate + ")"; 1818 } 1819 } 1820}