001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 059 * 060 * @author Kevin Bourrillion 061 * @since 1.0 062 */ 063@GwtCompatible(emulated = true) 064@ElementTypesAreNonnullByDefault 065public abstract class CharMatcher implements Predicate<Character> { 066 /* 067 * N777777777NO 068 * N7777777777777N 069 * M777777777777777N 070 * $N877777777D77777M 071 * N M77777777ONND777M 072 * MN777777777NN D777 073 * N7ZN777777777NN ~M7778 074 * N777777777777MMNN88777N 075 * N777777777777MNZZZ7777O 076 * DZN7777O77777777777777 077 * N7OONND7777777D77777N 078 * 8$M++++?N???$77777$ 079 * M7++++N+M77777777N 080 * N77O777777777777$ M 081 * DNNM$$$$777777N D 082 * N$N:=N$777N7777M NZ 083 * 77Z::::N777777777 ODZZZ 084 * 77N::::::N77777777M NNZZZ$ 085 * $777:::::::77777777MN ZM8ZZZZZ 086 * 777M::::::Z7777777Z77 N++ZZZZNN 087 * 7777M:::::M7777777$777M $++IZZZZM 088 * M777$:::::N777777$M7777M +++++ZZZDN 089 * NN$::::::7777$$M777777N N+++ZZZZNZ 090 * N::::::N:7$O:77777777 N++++ZZZZN 091 * M::::::::::::N77777777+ +?+++++ZZZM 092 * 8::::::::::::D77777777M O+++++ZZ 093 * ::::::::::::M777777777N O+?D 094 * M:::::::::::M77777777778 77= 095 * D=::::::::::N7777777777N 777 096 * INN===::::::=77777777777N I777N 097 * ?777N========N7777777777787M N7777 098 * 77777$D======N77777777777N777N? N777777 099 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 100 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 101 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 102 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 103 * 7 :N MNN$$$$M$ $$$777$8 8D8I 104 * NMM.:7O 777777778 105 * 7777777MN 106 * M NO .7: 107 * M : M 108 * 8 109 */ 110 111 // Constant matcher factory methods 112 113 /** 114 * Matches any character. 115 * 116 * @since 19.0 (since 1.0 as constant {@code ANY}) 117 */ 118 public static CharMatcher any() { 119 return Any.INSTANCE; 120 } 121 122 /** 123 * Matches no characters. 124 * 125 * @since 19.0 (since 1.0 as constant {@code NONE}) 126 */ 127 public static CharMatcher none() { 128 return None.INSTANCE; 129 } 130 131 /** 132 * Determines whether a character is whitespace according to the latest Unicode standard, as 133 * illustrated <a 134 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 135 * This is not the same definition used by other Java APIs. (See a <a 136 * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.) 137 * 138 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 139 * 140 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 141 * date. 142 * 143 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 144 */ 145 public static CharMatcher whitespace() { 146 return Whitespace.INSTANCE; 147 } 148 149 /** 150 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 151 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 152 * discussion of that term. 153 * 154 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 155 */ 156 public static CharMatcher breakingWhitespace() { 157 return BreakingWhitespace.INSTANCE; 158 } 159 160 /** 161 * Determines whether a character is ASCII, meaning that its code point is less than 128. 162 * 163 * @since 19.0 (since 1.0 as constant {@code ASCII}) 164 */ 165 public static CharMatcher ascii() { 166 return Ascii.INSTANCE; 167 } 168 169 /** 170 * Determines whether a character is a BMP digit according to <a 171 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 172 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 173 * 174 * @deprecated Many digits are supplementary characters; see the class documentation. 175 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 176 */ 177 @Deprecated 178 public static CharMatcher digit() { 179 return Digit.INSTANCE; 180 } 181 182 /** 183 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 184 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 185 * '9')}. 186 * 187 * @deprecated Many digits are supplementary characters; see the class documentation. 188 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 189 */ 190 @Deprecated 191 public static CharMatcher javaDigit() { 192 return JavaDigit.INSTANCE; 193 } 194 195 /** 196 * Determines whether a character is a BMP letter according to {@linkplain 197 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 198 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 199 * 200 * @deprecated Most letters are supplementary characters; see the class documentation. 201 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 202 */ 203 @Deprecated 204 public static CharMatcher javaLetter() { 205 return JavaLetter.INSTANCE; 206 } 207 208 /** 209 * Determines whether a character is a BMP letter or digit according to {@linkplain 210 * Character#isLetterOrDigit(char) Java's definition}. 211 * 212 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 213 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 214 */ 215 @Deprecated 216 public static CharMatcher javaLetterOrDigit() { 217 return JavaLetterOrDigit.INSTANCE; 218 } 219 220 /** 221 * Determines whether a BMP character is upper case according to {@linkplain 222 * Character#isUpperCase(char) Java's definition}. 223 * 224 * @deprecated Some uppercase characters are supplementary characters; see the class 225 * documentation. 226 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 227 */ 228 @Deprecated 229 public static CharMatcher javaUpperCase() { 230 return JavaUpperCase.INSTANCE; 231 } 232 233 /** 234 * Determines whether a BMP character is lower case according to {@linkplain 235 * Character#isLowerCase(char) Java's definition}. 236 * 237 * @deprecated Some lowercase characters are supplementary characters; see the class 238 * documentation. 239 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 240 */ 241 @Deprecated 242 public static CharMatcher javaLowerCase() { 243 return JavaLowerCase.INSTANCE; 244 } 245 246 /** 247 * Determines whether a character is an ISO control character as specified by {@link 248 * Character#isISOControl(char)}. 249 * 250 * <p>All ISO control codes are on the BMP and thus supported by this API. 251 * 252 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 253 */ 254 public static CharMatcher javaIsoControl() { 255 return JavaIsoControl.INSTANCE; 256 } 257 258 /** 259 * Determines whether a character is invisible; that is, if its Unicode category is any of 260 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 261 * PRIVATE_USE according to ICU4J. 262 * 263 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 264 * 265 * @deprecated Most invisible characters are supplementary characters; see the class 266 * documentation. 267 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 268 */ 269 @Deprecated 270 public static CharMatcher invisible() { 271 return Invisible.INSTANCE; 272 } 273 274 /** 275 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 276 * errs on the side of returning {@code false} (that is, it tends to assume a character is 277 * double-width). 278 * 279 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 280 * date. 281 * 282 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 283 * 284 * @deprecated Many such characters are supplementary characters; see the class documentation. 285 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 286 */ 287 @Deprecated 288 public static CharMatcher singleWidth() { 289 return SingleWidth.INSTANCE; 290 } 291 292 // Static factories 293 294 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 295 public static CharMatcher is(final char match) { 296 return new Is(match); 297 } 298 299 /** 300 * Returns a {@code char} matcher that matches any character except the BMP character specified. 301 * 302 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 303 */ 304 public static CharMatcher isNot(final char match) { 305 return new IsNot(match); 306 } 307 308 /** 309 * Returns a {@code char} matcher that matches any BMP character present in the given character 310 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 311 */ 312 public static CharMatcher anyOf(final CharSequence sequence) { 313 switch (sequence.length()) { 314 case 0: 315 return none(); 316 case 1: 317 return is(sequence.charAt(0)); 318 case 2: 319 return isEither(sequence.charAt(0), sequence.charAt(1)); 320 default: 321 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 322 // matcher? 323 return new AnyOf(sequence); 324 } 325 } 326 327 /** 328 * Returns a {@code char} matcher that matches any BMP character not present in the given 329 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 330 */ 331 public static CharMatcher noneOf(CharSequence sequence) { 332 return anyOf(sequence).negate(); 333 } 334 335 /** 336 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 337 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 338 * CharMatcher.inRange('a', 'z')}. 339 * 340 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 341 */ 342 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 343 return new InRange(startInclusive, endInclusive); 344 } 345 346 /** 347 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 348 * which operates on primitive {@code char} instances instead. 349 */ 350 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 351 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 352 } 353 354 // Constructors 355 356 /** 357 * Constructor for use by subclasses. When subclassing, you may want to override {@code 358 * toString()} to provide a useful description. 359 */ 360 protected CharMatcher() {} 361 362 // Abstract methods 363 364 /** Determines a true or false value for the given character. */ 365 public abstract boolean matches(char c); 366 367 // Non-static factories 368 369 /** Returns a matcher that matches any character not matched by this matcher. */ 370 // @Override under Java 8 but not under Java 7 371 public CharMatcher negate() { 372 return new Negated(this); 373 } 374 375 /** 376 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 377 */ 378 public CharMatcher and(CharMatcher other) { 379 return new And(this, other); 380 } 381 382 /** 383 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 384 */ 385 public CharMatcher or(CharMatcher other) { 386 return new Or(this, other); 387 } 388 389 /** 390 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 391 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 392 * worthwhile only if the precomputed matcher is queried many thousands of times. 393 * 394 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 395 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 396 * worthwhile tradeoff in a browser. 397 */ 398 public CharMatcher precomputed() { 399 return Platform.precomputeCharMatcher(this); 400 } 401 402 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 403 404 /** 405 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 406 * on {@link Platform} so that we can have different behavior in GWT. 407 * 408 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 409 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 410 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 411 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 412 * matcher which is faster to query than the original. 413 */ 414 @GwtIncompatible // SmallCharMatcher 415 CharMatcher precomputedInternal() { 416 final BitSet table = new BitSet(); 417 setBits(table); 418 int totalCharacters = table.cardinality(); 419 if (totalCharacters * 2 <= DISTINCT_CHARS) { 420 return precomputedPositive(totalCharacters, table, toString()); 421 } else { 422 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 423 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 424 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 425 String suffix = ".negate()"; 426 final String description = toString(); 427 String negatedDescription = 428 description.endsWith(suffix) 429 ? description.substring(0, description.length() - suffix.length()) 430 : description + suffix; 431 return new NegatedFastMatcher( 432 precomputedPositive(negatedCharacters, table, negatedDescription)) { 433 @Override 434 public String toString() { 435 return description; 436 } 437 }; 438 } 439 } 440 441 /** 442 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 443 */ 444 @GwtIncompatible // SmallCharMatcher 445 private static CharMatcher precomputedPositive( 446 int totalCharacters, BitSet table, String description) { 447 switch (totalCharacters) { 448 case 0: 449 return none(); 450 case 1: 451 return is((char) table.nextSetBit(0)); 452 case 2: 453 char c1 = (char) table.nextSetBit(0); 454 char c2 = (char) table.nextSetBit(c1 + 1); 455 return isEither(c1, c2); 456 default: 457 return isSmall(totalCharacters, table.length()) 458 ? SmallCharMatcher.from(table, description) 459 : new BitSetMatcher(table, description); 460 } 461 } 462 463 @GwtIncompatible // SmallCharMatcher 464 private static boolean isSmall(int totalCharacters, int tableLength) { 465 return totalCharacters <= SmallCharMatcher.MAX_SIZE 466 && tableLength > (totalCharacters * 4 * Character.SIZE); 467 // err on the side of BitSetMatcher 468 } 469 470 /** Sets bits in {@code table} matched by this matcher. */ 471 @GwtIncompatible // used only from other GwtIncompatible code 472 void setBits(BitSet table) { 473 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 474 if (matches((char) c)) { 475 table.set(c); 476 } 477 } 478 } 479 480 // Text processing routines 481 482 /** 483 * Returns {@code true} if a character sequence contains at least one matching BMP character. 484 * Equivalent to {@code !matchesNoneOf(sequence)}. 485 * 486 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 487 * character, until this returns {@code true} or the end is reached. 488 * 489 * @param sequence the character sequence to examine, possibly empty 490 * @return {@code true} if this matcher matches at least one character in the sequence 491 * @since 8.0 492 */ 493 public boolean matchesAnyOf(CharSequence sequence) { 494 return !matchesNoneOf(sequence); 495 } 496 497 /** 498 * Returns {@code true} if a character sequence contains only matching BMP characters. 499 * 500 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 501 * character, until this returns {@code false} or the end is reached. 502 * 503 * @param sequence the character sequence to examine, possibly empty 504 * @return {@code true} if this matcher matches every character in the sequence, including when 505 * the sequence is empty 506 */ 507 public boolean matchesAllOf(CharSequence sequence) { 508 for (int i = sequence.length() - 1; i >= 0; i--) { 509 if (!matches(sequence.charAt(i))) { 510 return false; 511 } 512 } 513 return true; 514 } 515 516 /** 517 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 518 * {@code !matchesAnyOf(sequence)}. 519 * 520 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 521 * character, until this returns {@code true} or the end is reached. 522 * 523 * @param sequence the character sequence to examine, possibly empty 524 * @return {@code true} if this matcher matches no characters in the sequence, including when the 525 * sequence is empty 526 */ 527 public boolean matchesNoneOf(CharSequence sequence) { 528 return indexIn(sequence) == -1; 529 } 530 531 /** 532 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 533 * no matching character is present. 534 * 535 * <p>The default implementation iterates over the sequence in forward order calling {@link 536 * #matches} for each character. 537 * 538 * @param sequence the character sequence to examine from the beginning 539 * @return an index, or {@code -1} if no character matches 540 */ 541 public int indexIn(CharSequence sequence) { 542 return indexIn(sequence, 0); 543 } 544 545 /** 546 * Returns the index of the first matching BMP character in a character sequence, starting from a 547 * given position, or {@code -1} if no character matches after that position. 548 * 549 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 550 * start}, calling {@link #matches} for each character. 551 * 552 * @param sequence the character sequence to examine 553 * @param start the first index to examine; must be nonnegative and no greater than {@code 554 * sequence.length()} 555 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 556 * or {@code -1} if no character matches 557 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 558 * sequence.length()} 559 */ 560 public int indexIn(CharSequence sequence, int start) { 561 int length = sequence.length(); 562 checkPositionIndex(start, length); 563 for (int i = start; i < length; i++) { 564 if (matches(sequence.charAt(i))) { 565 return i; 566 } 567 } 568 return -1; 569 } 570 571 /** 572 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 573 * no matching character is present. 574 * 575 * <p>The default implementation iterates over the sequence in reverse order calling {@link 576 * #matches} for each character. 577 * 578 * @param sequence the character sequence to examine from the end 579 * @return an index, or {@code -1} if no character matches 580 */ 581 public int lastIndexIn(CharSequence sequence) { 582 for (int i = sequence.length() - 1; i >= 0; i--) { 583 if (matches(sequence.charAt(i))) { 584 return i; 585 } 586 } 587 return -1; 588 } 589 590 /** 591 * Returns the number of matching {@code char}s found in a character sequence. 592 * 593 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 594 */ 595 public int countIn(CharSequence sequence) { 596 int count = 0; 597 for (int i = 0; i < sequence.length(); i++) { 598 if (matches(sequence.charAt(i))) { 599 count++; 600 } 601 } 602 return count; 603 } 604 605 /** 606 * Returns a string containing all non-matching characters of a character sequence, in order. For 607 * example: 608 * 609 * <pre>{@code 610 * CharMatcher.is('a').removeFrom("bazaar") 611 * }</pre> 612 * 613 * ... returns {@code "bzr"}. 614 */ 615 public String removeFrom(CharSequence sequence) { 616 String string = sequence.toString(); 617 int pos = indexIn(string); 618 if (pos == -1) { 619 return string; 620 } 621 622 char[] chars = string.toCharArray(); 623 int spread = 1; 624 625 // This unusual loop comes from extensive benchmarking 626 OUT: 627 while (true) { 628 pos++; 629 while (true) { 630 if (pos == chars.length) { 631 break OUT; 632 } 633 if (matches(chars[pos])) { 634 break; 635 } 636 chars[pos - spread] = chars[pos]; 637 pos++; 638 } 639 spread++; 640 } 641 return new String(chars, 0, pos - spread); 642 } 643 644 /** 645 * Returns a string containing all matching BMP characters of a character sequence, in order. For 646 * example: 647 * 648 * <pre>{@code 649 * CharMatcher.is('a').retainFrom("bazaar") 650 * }</pre> 651 * 652 * ... returns {@code "aaa"}. 653 */ 654 public String retainFrom(CharSequence sequence) { 655 return negate().removeFrom(sequence); 656 } 657 658 /** 659 * Returns a string copy of the input character sequence, with each matching BMP character 660 * replaced by a given replacement character. For example: 661 * 662 * <pre>{@code 663 * CharMatcher.is('a').replaceFrom("radar", 'o') 664 * }</pre> 665 * 666 * ... returns {@code "rodor"}. 667 * 668 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 669 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 670 * character. 671 * 672 * @param sequence the character sequence to replace matching characters in 673 * @param replacement the character to append to the result string in place of each matching 674 * character in {@code sequence} 675 * @return the new string 676 */ 677 public String replaceFrom(CharSequence sequence, char replacement) { 678 String string = sequence.toString(); 679 int pos = indexIn(string); 680 if (pos == -1) { 681 return string; 682 } 683 char[] chars = string.toCharArray(); 684 chars[pos] = replacement; 685 for (int i = pos + 1; i < chars.length; i++) { 686 if (matches(chars[i])) { 687 chars[i] = replacement; 688 } 689 } 690 return new String(chars); 691 } 692 693 /** 694 * Returns a string copy of the input character sequence, with each matching BMP character 695 * replaced by a given replacement sequence. For example: 696 * 697 * <pre>{@code 698 * CharMatcher.is('a').replaceFrom("yaha", "oo") 699 * }</pre> 700 * 701 * ... returns {@code "yoohoo"}. 702 * 703 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 704 * off calling {@link #replaceFrom(CharSequence, char)} directly. 705 * 706 * @param sequence the character sequence to replace matching characters in 707 * @param replacement the characters to append to the result string in place of each matching 708 * character in {@code sequence} 709 * @return the new string 710 */ 711 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 712 int replacementLen = replacement.length(); 713 if (replacementLen == 0) { 714 return removeFrom(sequence); 715 } 716 if (replacementLen == 1) { 717 return replaceFrom(sequence, replacement.charAt(0)); 718 } 719 720 String string = sequence.toString(); 721 int pos = indexIn(string); 722 if (pos == -1) { 723 return string; 724 } 725 726 int len = string.length(); 727 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 728 729 int oldpos = 0; 730 do { 731 buf.append(string, oldpos, pos); 732 buf.append(replacement); 733 oldpos = pos + 1; 734 pos = indexIn(string, oldpos); 735 } while (pos != -1); 736 737 buf.append(string, oldpos, len); 738 return buf.toString(); 739 } 740 741 /** 742 * Returns a substring of the input character sequence that omits all matching BMP characters from 743 * the beginning and from the end of the string. For example: 744 * 745 * <pre>{@code 746 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 747 * }</pre> 748 * 749 * ... returns {@code "cat"}. 750 * 751 * <p>Note that: 752 * 753 * <pre>{@code 754 * CharMatcher.inRange('\0', ' ').trimFrom(str) 755 * }</pre> 756 * 757 * ... is equivalent to {@link String#trim()}. 758 */ 759 public String trimFrom(CharSequence sequence) { 760 int len = sequence.length(); 761 int first; 762 int last; 763 764 for (first = 0; first < len; first++) { 765 if (!matches(sequence.charAt(first))) { 766 break; 767 } 768 } 769 for (last = len - 1; last > first; last--) { 770 if (!matches(sequence.charAt(last))) { 771 break; 772 } 773 } 774 775 return sequence.subSequence(first, last + 1).toString(); 776 } 777 778 /** 779 * Returns a substring of the input character sequence that omits all matching BMP characters from 780 * the beginning of the string. For example: 781 * 782 * <pre>{@code 783 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 784 * }</pre> 785 * 786 * ... returns {@code "catbab"}. 787 */ 788 public String trimLeadingFrom(CharSequence sequence) { 789 int len = sequence.length(); 790 for (int first = 0; first < len; first++) { 791 if (!matches(sequence.charAt(first))) { 792 return sequence.subSequence(first, len).toString(); 793 } 794 } 795 return ""; 796 } 797 798 /** 799 * Returns a substring of the input character sequence that omits all matching BMP characters from 800 * the end of the string. For example: 801 * 802 * <pre>{@code 803 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 804 * }</pre> 805 * 806 * ... returns {@code "abacat"}. 807 */ 808 public String trimTrailingFrom(CharSequence sequence) { 809 int len = sequence.length(); 810 for (int last = len - 1; last >= 0; last--) { 811 if (!matches(sequence.charAt(last))) { 812 return sequence.subSequence(0, last + 1).toString(); 813 } 814 } 815 return ""; 816 } 817 818 /** 819 * Returns a string copy of the input character sequence, with each group of consecutive matching 820 * BMP characters replaced by a single replacement character. For example: 821 * 822 * <pre>{@code 823 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 824 * }</pre> 825 * 826 * ... returns {@code "b-p-r"}. 827 * 828 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 829 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 830 * character. 831 * 832 * @param sequence the character sequence to replace matching groups of characters in 833 * @param replacement the character to append to the result string in place of each group of 834 * matching characters in {@code sequence} 835 * @return the new string 836 */ 837 public String collapseFrom(CharSequence sequence, char replacement) { 838 // This implementation avoids unnecessary allocation. 839 int len = sequence.length(); 840 for (int i = 0; i < len; i++) { 841 char c = sequence.charAt(i); 842 if (matches(c)) { 843 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 844 // a no-op replacement 845 i++; 846 } else { 847 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 848 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 849 } 850 } 851 } 852 // no replacement needed 853 return sequence.toString(); 854 } 855 856 /** 857 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 858 * groups of matching BMP characters at the start or end of the sequence are removed without 859 * replacement. 860 */ 861 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 862 // This implementation avoids unnecessary allocation. 863 int len = sequence.length(); 864 int first = 0; 865 int last = len - 1; 866 867 while (first < len && matches(sequence.charAt(first))) { 868 first++; 869 } 870 871 while (last > first && matches(sequence.charAt(last))) { 872 last--; 873 } 874 875 return (first == 0 && last == len - 1) 876 ? collapseFrom(sequence, replacement) 877 : finishCollapseFrom( 878 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 879 } 880 881 private String finishCollapseFrom( 882 CharSequence sequence, 883 int start, 884 int end, 885 char replacement, 886 StringBuilder builder, 887 boolean inMatchingGroup) { 888 for (int i = start; i < end; i++) { 889 char c = sequence.charAt(i); 890 if (matches(c)) { 891 if (!inMatchingGroup) { 892 builder.append(replacement); 893 inMatchingGroup = true; 894 } 895 } else { 896 builder.append(c); 897 inMatchingGroup = false; 898 } 899 } 900 return builder.toString(); 901 } 902 903 /** 904 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 905 * instead. 906 */ 907 @Deprecated 908 @Override 909 public boolean apply(Character character) { 910 return matches(character); 911 } 912 913 /** 914 * Returns a string representation of this {@code CharMatcher}, such as {@code 915 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 916 */ 917 @Override 918 public String toString() { 919 return super.toString(); 920 } 921 922 /** 923 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 924 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 925 */ 926 private static String showCharacter(char c) { 927 String hex = "0123456789ABCDEF"; 928 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 929 for (int i = 0; i < 4; i++) { 930 tmp[5 - i] = hex.charAt(c & 0xF); 931 c = (char) (c >> 4); 932 } 933 return String.copyValueOf(tmp); 934 } 935 936 // Fast matchers 937 938 /** A matcher for which precomputation will not yield any significant benefit. */ 939 abstract static class FastMatcher extends CharMatcher { 940 941 @Override 942 public final CharMatcher precomputed() { 943 return this; 944 } 945 946 @Override 947 public CharMatcher negate() { 948 return new NegatedFastMatcher(this); 949 } 950 } 951 952 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 953 abstract static class NamedFastMatcher extends FastMatcher { 954 955 private final String description; 956 957 NamedFastMatcher(String description) { 958 this.description = checkNotNull(description); 959 } 960 961 @Override 962 public final String toString() { 963 return description; 964 } 965 } 966 967 /** Negation of a {@link FastMatcher}. */ 968 static class NegatedFastMatcher extends Negated { 969 970 NegatedFastMatcher(CharMatcher original) { 971 super(original); 972 } 973 974 @Override 975 public final CharMatcher precomputed() { 976 return this; 977 } 978 } 979 980 /** Fast matcher using a {@link BitSet} table of matching characters. */ 981 @GwtIncompatible // used only from other GwtIncompatible code 982 private static final class BitSetMatcher extends NamedFastMatcher { 983 984 private final BitSet table; 985 986 private BitSetMatcher(BitSet table, String description) { 987 super(description); 988 if (table.length() + Long.SIZE < table.size()) { 989 table = (BitSet) table.clone(); 990 // If only we could actually call BitSet.trimToSize() ourselves... 991 } 992 this.table = table; 993 } 994 995 @Override 996 public boolean matches(char c) { 997 return table.get(c); 998 } 999 1000 @Override 1001 void setBits(BitSet bitSet) { 1002 bitSet.or(table); 1003 } 1004 } 1005 1006 // Static constant implementation classes 1007 1008 /** Implementation of {@link #any()}. */ 1009 private static final class Any extends NamedFastMatcher { 1010 1011 static final Any INSTANCE = new Any(); 1012 1013 private Any() { 1014 super("CharMatcher.any()"); 1015 } 1016 1017 @Override 1018 public boolean matches(char c) { 1019 return true; 1020 } 1021 1022 @Override 1023 public int indexIn(CharSequence sequence) { 1024 return (sequence.length() == 0) ? -1 : 0; 1025 } 1026 1027 @Override 1028 public int indexIn(CharSequence sequence, int start) { 1029 int length = sequence.length(); 1030 checkPositionIndex(start, length); 1031 return (start == length) ? -1 : start; 1032 } 1033 1034 @Override 1035 public int lastIndexIn(CharSequence sequence) { 1036 return sequence.length() - 1; 1037 } 1038 1039 @Override 1040 public boolean matchesAllOf(CharSequence sequence) { 1041 checkNotNull(sequence); 1042 return true; 1043 } 1044 1045 @Override 1046 public boolean matchesNoneOf(CharSequence sequence) { 1047 return sequence.length() == 0; 1048 } 1049 1050 @Override 1051 public String removeFrom(CharSequence sequence) { 1052 checkNotNull(sequence); 1053 return ""; 1054 } 1055 1056 @Override 1057 public String replaceFrom(CharSequence sequence, char replacement) { 1058 char[] array = new char[sequence.length()]; 1059 Arrays.fill(array, replacement); 1060 return new String(array); 1061 } 1062 1063 @Override 1064 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1065 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1066 for (int i = 0; i < sequence.length(); i++) { 1067 result.append(replacement); 1068 } 1069 return result.toString(); 1070 } 1071 1072 @Override 1073 public String collapseFrom(CharSequence sequence, char replacement) { 1074 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1075 } 1076 1077 @Override 1078 public String trimFrom(CharSequence sequence) { 1079 checkNotNull(sequence); 1080 return ""; 1081 } 1082 1083 @Override 1084 public int countIn(CharSequence sequence) { 1085 return sequence.length(); 1086 } 1087 1088 @Override 1089 public CharMatcher and(CharMatcher other) { 1090 return checkNotNull(other); 1091 } 1092 1093 @Override 1094 public CharMatcher or(CharMatcher other) { 1095 checkNotNull(other); 1096 return this; 1097 } 1098 1099 @Override 1100 public CharMatcher negate() { 1101 return none(); 1102 } 1103 } 1104 1105 /** Implementation of {@link #none()}. */ 1106 private static final class None extends NamedFastMatcher { 1107 1108 static final None INSTANCE = new None(); 1109 1110 private None() { 1111 super("CharMatcher.none()"); 1112 } 1113 1114 @Override 1115 public boolean matches(char c) { 1116 return false; 1117 } 1118 1119 @Override 1120 public int indexIn(CharSequence sequence) { 1121 checkNotNull(sequence); 1122 return -1; 1123 } 1124 1125 @Override 1126 public int indexIn(CharSequence sequence, int start) { 1127 int length = sequence.length(); 1128 checkPositionIndex(start, length); 1129 return -1; 1130 } 1131 1132 @Override 1133 public int lastIndexIn(CharSequence sequence) { 1134 checkNotNull(sequence); 1135 return -1; 1136 } 1137 1138 @Override 1139 public boolean matchesAllOf(CharSequence sequence) { 1140 return sequence.length() == 0; 1141 } 1142 1143 @Override 1144 public boolean matchesNoneOf(CharSequence sequence) { 1145 checkNotNull(sequence); 1146 return true; 1147 } 1148 1149 @Override 1150 public String removeFrom(CharSequence sequence) { 1151 return sequence.toString(); 1152 } 1153 1154 @Override 1155 public String replaceFrom(CharSequence sequence, char replacement) { 1156 return sequence.toString(); 1157 } 1158 1159 @Override 1160 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1161 checkNotNull(replacement); 1162 return sequence.toString(); 1163 } 1164 1165 @Override 1166 public String collapseFrom(CharSequence sequence, char replacement) { 1167 return sequence.toString(); 1168 } 1169 1170 @Override 1171 public String trimFrom(CharSequence sequence) { 1172 return sequence.toString(); 1173 } 1174 1175 @Override 1176 public String trimLeadingFrom(CharSequence sequence) { 1177 return sequence.toString(); 1178 } 1179 1180 @Override 1181 public String trimTrailingFrom(CharSequence sequence) { 1182 return sequence.toString(); 1183 } 1184 1185 @Override 1186 public int countIn(CharSequence sequence) { 1187 checkNotNull(sequence); 1188 return 0; 1189 } 1190 1191 @Override 1192 public CharMatcher and(CharMatcher other) { 1193 checkNotNull(other); 1194 return this; 1195 } 1196 1197 @Override 1198 public CharMatcher or(CharMatcher other) { 1199 return checkNotNull(other); 1200 } 1201 1202 @Override 1203 public CharMatcher negate() { 1204 return any(); 1205 } 1206 } 1207 1208 /** Implementation of {@link #whitespace()}. */ 1209 @VisibleForTesting 1210 static final class Whitespace extends NamedFastMatcher { 1211 1212 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1213 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1214 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1215 // other use-cases, the reduction in readability isn't worth it. 1216 static final String TABLE = 1217 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1218 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1219 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1220 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1221 static final int MULTIPLIER = 1682554634; 1222 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1223 1224 static final Whitespace INSTANCE = new Whitespace(); 1225 1226 Whitespace() { 1227 super("CharMatcher.whitespace()"); 1228 } 1229 1230 @Override 1231 public boolean matches(char c) { 1232 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1233 } 1234 1235 @GwtIncompatible // used only from other GwtIncompatible code 1236 @Override 1237 void setBits(BitSet table) { 1238 for (int i = 0; i < TABLE.length(); i++) { 1239 table.set(TABLE.charAt(i)); 1240 } 1241 } 1242 } 1243 1244 /** Implementation of {@link #breakingWhitespace()}. */ 1245 private static final class BreakingWhitespace extends CharMatcher { 1246 1247 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1248 1249 @Override 1250 public boolean matches(char c) { 1251 switch (c) { 1252 case '\t': 1253 case '\n': 1254 case '\013': 1255 case '\f': 1256 case '\r': 1257 case ' ': 1258 case '\u0085': 1259 case '\u1680': 1260 case '\u2028': 1261 case '\u2029': 1262 case '\u205f': 1263 case '\u3000': 1264 return true; 1265 case '\u2007': 1266 return false; 1267 default: 1268 return c >= '\u2000' && c <= '\u200a'; 1269 } 1270 } 1271 1272 @Override 1273 public String toString() { 1274 return "CharMatcher.breakingWhitespace()"; 1275 } 1276 } 1277 1278 /** Implementation of {@link #ascii()}. */ 1279 private static final class Ascii extends NamedFastMatcher { 1280 1281 static final Ascii INSTANCE = new Ascii(); 1282 1283 Ascii() { 1284 super("CharMatcher.ascii()"); 1285 } 1286 1287 @Override 1288 public boolean matches(char c) { 1289 return c <= '\u007f'; 1290 } 1291 } 1292 1293 /** Implementation that matches characters that fall within multiple ranges. */ 1294 private static class RangesMatcher extends CharMatcher { 1295 1296 private final String description; 1297 private final char[] rangeStarts; 1298 private final char[] rangeEnds; 1299 1300 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1301 this.description = description; 1302 this.rangeStarts = rangeStarts; 1303 this.rangeEnds = rangeEnds; 1304 checkArgument(rangeStarts.length == rangeEnds.length); 1305 for (int i = 0; i < rangeStarts.length; i++) { 1306 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1307 if (i + 1 < rangeStarts.length) { 1308 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1309 } 1310 } 1311 } 1312 1313 @Override 1314 public boolean matches(char c) { 1315 int index = Arrays.binarySearch(rangeStarts, c); 1316 if (index >= 0) { 1317 return true; 1318 } else { 1319 index = ~index - 1; 1320 return index >= 0 && c <= rangeEnds[index]; 1321 } 1322 } 1323 1324 @Override 1325 public String toString() { 1326 return description; 1327 } 1328 } 1329 1330 /** Implementation of {@link #digit()}. */ 1331 private static final class Digit extends RangesMatcher { 1332 // Plug the following UnicodeSet pattern into 1333 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1334 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1335 // and get the zeroes from there. 1336 1337 // Must be in ascending order. 1338 private static final String ZEROES = 1339 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1340 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1341 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1342 1343 private static char[] zeroes() { 1344 return ZEROES.toCharArray(); 1345 } 1346 1347 private static char[] nines() { 1348 char[] nines = new char[ZEROES.length()]; 1349 for (int i = 0; i < ZEROES.length(); i++) { 1350 nines[i] = (char) (ZEROES.charAt(i) + 9); 1351 } 1352 return nines; 1353 } 1354 1355 static final Digit INSTANCE = new Digit(); 1356 1357 private Digit() { 1358 super("CharMatcher.digit()", zeroes(), nines()); 1359 } 1360 } 1361 1362 /** Implementation of {@link #javaDigit()}. */ 1363 private static final class JavaDigit extends CharMatcher { 1364 1365 static final JavaDigit INSTANCE = new JavaDigit(); 1366 1367 @Override 1368 public boolean matches(char c) { 1369 return Character.isDigit(c); 1370 } 1371 1372 @Override 1373 public String toString() { 1374 return "CharMatcher.javaDigit()"; 1375 } 1376 } 1377 1378 /** Implementation of {@link #javaLetter()}. */ 1379 private static final class JavaLetter extends CharMatcher { 1380 1381 static final JavaLetter INSTANCE = new JavaLetter(); 1382 1383 @Override 1384 public boolean matches(char c) { 1385 return Character.isLetter(c); 1386 } 1387 1388 @Override 1389 public String toString() { 1390 return "CharMatcher.javaLetter()"; 1391 } 1392 } 1393 1394 /** Implementation of {@link #javaLetterOrDigit()}. */ 1395 private static final class JavaLetterOrDigit extends CharMatcher { 1396 1397 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1398 1399 @Override 1400 public boolean matches(char c) { 1401 return Character.isLetterOrDigit(c); 1402 } 1403 1404 @Override 1405 public String toString() { 1406 return "CharMatcher.javaLetterOrDigit()"; 1407 } 1408 } 1409 1410 /** Implementation of {@link #javaUpperCase()}. */ 1411 private static final class JavaUpperCase extends CharMatcher { 1412 1413 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1414 1415 @Override 1416 public boolean matches(char c) { 1417 return Character.isUpperCase(c); 1418 } 1419 1420 @Override 1421 public String toString() { 1422 return "CharMatcher.javaUpperCase()"; 1423 } 1424 } 1425 1426 /** Implementation of {@link #javaLowerCase()}. */ 1427 private static final class JavaLowerCase extends CharMatcher { 1428 1429 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1430 1431 @Override 1432 public boolean matches(char c) { 1433 return Character.isLowerCase(c); 1434 } 1435 1436 @Override 1437 public String toString() { 1438 return "CharMatcher.javaLowerCase()"; 1439 } 1440 } 1441 1442 /** Implementation of {@link #javaIsoControl()}. */ 1443 private static final class JavaIsoControl extends NamedFastMatcher { 1444 1445 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1446 1447 private JavaIsoControl() { 1448 super("CharMatcher.javaIsoControl()"); 1449 } 1450 1451 @Override 1452 public boolean matches(char c) { 1453 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1454 } 1455 } 1456 1457 /** Implementation of {@link #invisible()}. */ 1458 private static final class Invisible extends RangesMatcher { 1459 // Plug the following UnicodeSet pattern into 1460 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1461 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1462 // with the "Abbreviate" option, and get the ranges from there. 1463 private static final String RANGE_STARTS = 1464 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1465 + "\u3000\ud800\ufeff\ufff9"; 1466 private static final String RANGE_ENDS = // inclusive ends 1467 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1468 + "\u3000\uf8ff\ufeff\ufffb"; 1469 1470 static final Invisible INSTANCE = new Invisible(); 1471 1472 private Invisible() { 1473 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1474 } 1475 } 1476 1477 /** Implementation of {@link #singleWidth()}. */ 1478 private static final class SingleWidth extends RangesMatcher { 1479 1480 static final SingleWidth INSTANCE = new SingleWidth(); 1481 1482 private SingleWidth() { 1483 super( 1484 "CharMatcher.singleWidth()", 1485 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1486 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1487 } 1488 } 1489 1490 // Non-static factory implementation classes 1491 1492 /** Implementation of {@link #negate()}. */ 1493 private static class Negated extends CharMatcher { 1494 1495 final CharMatcher original; 1496 1497 Negated(CharMatcher original) { 1498 this.original = checkNotNull(original); 1499 } 1500 1501 @Override 1502 public boolean matches(char c) { 1503 return !original.matches(c); 1504 } 1505 1506 @Override 1507 public boolean matchesAllOf(CharSequence sequence) { 1508 return original.matchesNoneOf(sequence); 1509 } 1510 1511 @Override 1512 public boolean matchesNoneOf(CharSequence sequence) { 1513 return original.matchesAllOf(sequence); 1514 } 1515 1516 @Override 1517 public int countIn(CharSequence sequence) { 1518 return sequence.length() - original.countIn(sequence); 1519 } 1520 1521 @GwtIncompatible // used only from other GwtIncompatible code 1522 @Override 1523 void setBits(BitSet table) { 1524 BitSet tmp = new BitSet(); 1525 original.setBits(tmp); 1526 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1527 table.or(tmp); 1528 } 1529 1530 @Override 1531 public CharMatcher negate() { 1532 return original; 1533 } 1534 1535 @Override 1536 public String toString() { 1537 return original + ".negate()"; 1538 } 1539 } 1540 1541 /** Implementation of {@link #and(CharMatcher)}. */ 1542 private static final class And extends CharMatcher { 1543 1544 final CharMatcher first; 1545 final CharMatcher second; 1546 1547 And(CharMatcher a, CharMatcher b) { 1548 first = checkNotNull(a); 1549 second = checkNotNull(b); 1550 } 1551 1552 @Override 1553 public boolean matches(char c) { 1554 return first.matches(c) && second.matches(c); 1555 } 1556 1557 @GwtIncompatible // used only from other GwtIncompatible code 1558 @Override 1559 void setBits(BitSet table) { 1560 BitSet tmp1 = new BitSet(); 1561 first.setBits(tmp1); 1562 BitSet tmp2 = new BitSet(); 1563 second.setBits(tmp2); 1564 tmp1.and(tmp2); 1565 table.or(tmp1); 1566 } 1567 1568 @Override 1569 public String toString() { 1570 return "CharMatcher.and(" + first + ", " + second + ")"; 1571 } 1572 } 1573 1574 /** Implementation of {@link #or(CharMatcher)}. */ 1575 private static final class Or extends CharMatcher { 1576 1577 final CharMatcher first; 1578 final CharMatcher second; 1579 1580 Or(CharMatcher a, CharMatcher b) { 1581 first = checkNotNull(a); 1582 second = checkNotNull(b); 1583 } 1584 1585 @GwtIncompatible // used only from other GwtIncompatible code 1586 @Override 1587 void setBits(BitSet table) { 1588 first.setBits(table); 1589 second.setBits(table); 1590 } 1591 1592 @Override 1593 public boolean matches(char c) { 1594 return first.matches(c) || second.matches(c); 1595 } 1596 1597 @Override 1598 public String toString() { 1599 return "CharMatcher.or(" + first + ", " + second + ")"; 1600 } 1601 } 1602 1603 // Static factory implementations 1604 1605 /** Implementation of {@link #is(char)}. */ 1606 private static final class Is extends FastMatcher { 1607 1608 private final char match; 1609 1610 Is(char match) { 1611 this.match = match; 1612 } 1613 1614 @Override 1615 public boolean matches(char c) { 1616 return c == match; 1617 } 1618 1619 @Override 1620 public String replaceFrom(CharSequence sequence, char replacement) { 1621 return sequence.toString().replace(match, replacement); 1622 } 1623 1624 @Override 1625 public CharMatcher and(CharMatcher other) { 1626 return other.matches(match) ? this : none(); 1627 } 1628 1629 @Override 1630 public CharMatcher or(CharMatcher other) { 1631 return other.matches(match) ? other : super.or(other); 1632 } 1633 1634 @Override 1635 public CharMatcher negate() { 1636 return isNot(match); 1637 } 1638 1639 @GwtIncompatible // used only from other GwtIncompatible code 1640 @Override 1641 void setBits(BitSet table) { 1642 table.set(match); 1643 } 1644 1645 @Override 1646 public String toString() { 1647 return "CharMatcher.is('" + showCharacter(match) + "')"; 1648 } 1649 } 1650 1651 /** Implementation of {@link #isNot(char)}. */ 1652 private static final class IsNot extends FastMatcher { 1653 1654 private final char match; 1655 1656 IsNot(char match) { 1657 this.match = match; 1658 } 1659 1660 @Override 1661 public boolean matches(char c) { 1662 return c != match; 1663 } 1664 1665 @Override 1666 public CharMatcher and(CharMatcher other) { 1667 return other.matches(match) ? super.and(other) : other; 1668 } 1669 1670 @Override 1671 public CharMatcher or(CharMatcher other) { 1672 return other.matches(match) ? any() : this; 1673 } 1674 1675 @GwtIncompatible // used only from other GwtIncompatible code 1676 @Override 1677 void setBits(BitSet table) { 1678 table.set(0, match); 1679 table.set(match + 1, Character.MAX_VALUE + 1); 1680 } 1681 1682 @Override 1683 public CharMatcher negate() { 1684 return is(match); 1685 } 1686 1687 @Override 1688 public String toString() { 1689 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1690 } 1691 } 1692 1693 private static CharMatcher.IsEither isEither(char c1, char c2) { 1694 return new CharMatcher.IsEither(c1, c2); 1695 } 1696 1697 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1698 private static final class IsEither extends FastMatcher { 1699 1700 private final char match1; 1701 private final char match2; 1702 1703 IsEither(char match1, char match2) { 1704 this.match1 = match1; 1705 this.match2 = match2; 1706 } 1707 1708 @Override 1709 public boolean matches(char c) { 1710 return c == match1 || c == match2; 1711 } 1712 1713 @GwtIncompatible // used only from other GwtIncompatible code 1714 @Override 1715 void setBits(BitSet table) { 1716 table.set(match1); 1717 table.set(match2); 1718 } 1719 1720 @Override 1721 public String toString() { 1722 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1723 } 1724 } 1725 1726 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ 1727 private static final class AnyOf extends CharMatcher { 1728 1729 private final char[] chars; 1730 1731 public AnyOf(CharSequence chars) { 1732 this.chars = chars.toString().toCharArray(); 1733 Arrays.sort(this.chars); 1734 } 1735 1736 @Override 1737 public boolean matches(char c) { 1738 return Arrays.binarySearch(chars, c) >= 0; 1739 } 1740 1741 @Override 1742 @GwtIncompatible // used only from other GwtIncompatible code 1743 void setBits(BitSet table) { 1744 for (char c : chars) { 1745 table.set(c); 1746 } 1747 } 1748 1749 @Override 1750 public String toString() { 1751 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 1752 for (char c : chars) { 1753 description.append(showCharacter(c)); 1754 } 1755 description.append("\")"); 1756 return description.toString(); 1757 } 1758 } 1759 1760 /** Implementation of {@link #inRange(char, char)}. */ 1761 private static final class InRange extends FastMatcher { 1762 1763 private final char startInclusive; 1764 private final char endInclusive; 1765 1766 InRange(char startInclusive, char endInclusive) { 1767 checkArgument(endInclusive >= startInclusive); 1768 this.startInclusive = startInclusive; 1769 this.endInclusive = endInclusive; 1770 } 1771 1772 @Override 1773 public boolean matches(char c) { 1774 return startInclusive <= c && c <= endInclusive; 1775 } 1776 1777 @GwtIncompatible // used only from other GwtIncompatible code 1778 @Override 1779 void setBits(BitSet table) { 1780 table.set(startInclusive, endInclusive + 1); 1781 } 1782 1783 @Override 1784 public String toString() { 1785 return "CharMatcher.inRange('" 1786 + showCharacter(startInclusive) 1787 + "', '" 1788 + showCharacter(endInclusive) 1789 + "')"; 1790 } 1791 } 1792 1793 /** Implementation of {@link #forPredicate(Predicate)}. */ 1794 private static final class ForPredicate extends CharMatcher { 1795 1796 private final Predicate<? super Character> predicate; 1797 1798 ForPredicate(Predicate<? super Character> predicate) { 1799 this.predicate = checkNotNull(predicate); 1800 } 1801 1802 @Override 1803 public boolean matches(char c) { 1804 return predicate.apply(c); 1805 } 1806 1807 @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily 1808 @Override 1809 public boolean apply(Character character) { 1810 return predicate.apply(checkNotNull(character)); 1811 } 1812 1813 @Override 1814 public String toString() { 1815 return "CharMatcher.forPredicate(" + predicate + ")"; 1816 } 1817 } 1818}