001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.J2ktIncompatible; 024import com.google.common.annotations.VisibleForTesting; 025import java.util.Arrays; 026import java.util.BitSet; 027 028/** 029 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 030 * for any {@link Object}. Also offers basic text processing methods based on this function. 031 * Implementations are strongly encouraged to be side-effect-free and immutable. 032 * 033 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 034 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 035 * 036 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 037 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand 038 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 039 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 040 * assigned characters, including important CJK characters and emoji. 041 * 042 * <p>Supplementary characters are <a 043 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 044 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 045 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 046 * 047 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 048 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 049 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 050 * 051 * <p>Example usages: 052 * 053 * <pre> 054 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 055 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 056 * 057 * <p>See the Guava User Guide article on <a 058 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 059 * </a>. 060 * 061 * @author Kevin Bourrillion 062 * @since 1.0 063 */ 064@GwtCompatible(emulated = true) 065@ElementTypesAreNonnullByDefault 066public abstract class CharMatcher implements Predicate<Character> { 067 /* 068 * N777777777NO 069 * N7777777777777N 070 * M777777777777777N 071 * $N877777777D77777M 072 * N M77777777ONND777M 073 * MN777777777NN D777 074 * N7ZN777777777NN ~M7778 075 * N777777777777MMNN88777N 076 * N777777777777MNZZZ7777O 077 * DZN7777O77777777777777 078 * N7OONND7777777D77777N 079 * 8$M++++?N???$77777$ 080 * M7++++N+M77777777N 081 * N77O777777777777$ M 082 * DNNM$$$$777777N D 083 * N$N:=N$777N7777M NZ 084 * 77Z::::N777777777 ODZZZ 085 * 77N::::::N77777777M NNZZZ$ 086 * $777:::::::77777777MN ZM8ZZZZZ 087 * 777M::::::Z7777777Z77 N++ZZZZNN 088 * 7777M:::::M7777777$777M $++IZZZZM 089 * M777$:::::N777777$M7777M +++++ZZZDN 090 * NN$::::::7777$$M777777N N+++ZZZZNZ 091 * N::::::N:7$O:77777777 N++++ZZZZN 092 * M::::::::::::N77777777+ +?+++++ZZZM 093 * 8::::::::::::D77777777M O+++++ZZ 094 * ::::::::::::M777777777N O+?D 095 * M:::::::::::M77777777778 77= 096 * D=::::::::::N7777777777N 777 097 * INN===::::::=77777777777N I777N 098 * ?777N========N7777777777787M N7777 099 * 77777$D======N77777777777N777N? N777777 100 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 101 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 102 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 103 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 104 * 7 :N MNN$$$$M$ $$$777$8 8D8I 105 * NMM.:7O 777777778 106 * 7777777MN 107 * M NO .7: 108 * M : M 109 * 8 110 */ 111 112 // Constant matcher factory methods 113 114 /** 115 * Matches any character. 116 * 117 * @since 19.0 (since 1.0 as constant {@code ANY}) 118 */ 119 public static CharMatcher any() { 120 return Any.INSTANCE; 121 } 122 123 /** 124 * Matches no characters. 125 * 126 * @since 19.0 (since 1.0 as constant {@code NONE}) 127 */ 128 public static CharMatcher none() { 129 return None.INSTANCE; 130 } 131 132 /** 133 * Determines whether a character is whitespace according to the latest Unicode standard, as 134 * illustrated <a 135 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 136 * This is not the same definition used by other Java APIs. (See a <a 137 * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.) 138 * 139 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 140 * 141 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 142 * date. 143 * 144 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 145 */ 146 public static CharMatcher whitespace() { 147 return Whitespace.INSTANCE; 148 } 149 150 /** 151 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 152 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 153 * discussion of that term. 154 * 155 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 156 */ 157 public static CharMatcher breakingWhitespace() { 158 return BreakingWhitespace.INSTANCE; 159 } 160 161 /** 162 * Determines whether a character is ASCII, meaning that its code point is less than 128. 163 * 164 * @since 19.0 (since 1.0 as constant {@code ASCII}) 165 */ 166 public static CharMatcher ascii() { 167 return Ascii.INSTANCE; 168 } 169 170 /** 171 * Determines whether a character is a BMP digit according to <a 172 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 173 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 174 * 175 * @deprecated Many digits are supplementary characters; see the class documentation. 176 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 177 */ 178 @Deprecated 179 public static CharMatcher digit() { 180 return Digit.INSTANCE; 181 } 182 183 /** 184 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 185 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 186 * '9')}. 187 * 188 * @deprecated Many digits are supplementary characters; see the class documentation. 189 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 190 */ 191 @Deprecated 192 public static CharMatcher javaDigit() { 193 return JavaDigit.INSTANCE; 194 } 195 196 /** 197 * Determines whether a character is a BMP letter according to {@linkplain 198 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 199 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 200 * 201 * @deprecated Most letters are supplementary characters; see the class documentation. 202 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 203 */ 204 @Deprecated 205 public static CharMatcher javaLetter() { 206 return JavaLetter.INSTANCE; 207 } 208 209 /** 210 * Determines whether a character is a BMP letter or digit according to {@linkplain 211 * Character#isLetterOrDigit(char) Java's definition}. 212 * 213 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 214 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 215 */ 216 @Deprecated 217 public static CharMatcher javaLetterOrDigit() { 218 return JavaLetterOrDigit.INSTANCE; 219 } 220 221 /** 222 * Determines whether a BMP character is upper case according to {@linkplain 223 * Character#isUpperCase(char) Java's definition}. 224 * 225 * @deprecated Some uppercase characters are supplementary characters; see the class 226 * documentation. 227 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 228 */ 229 @Deprecated 230 public static CharMatcher javaUpperCase() { 231 return JavaUpperCase.INSTANCE; 232 } 233 234 /** 235 * Determines whether a BMP character is lower case according to {@linkplain 236 * Character#isLowerCase(char) Java's definition}. 237 * 238 * @deprecated Some lowercase characters are supplementary characters; see the class 239 * documentation. 240 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 241 */ 242 @Deprecated 243 public static CharMatcher javaLowerCase() { 244 return JavaLowerCase.INSTANCE; 245 } 246 247 /** 248 * Determines whether a character is an ISO control character as specified by {@link 249 * Character#isISOControl(char)}. 250 * 251 * <p>All ISO control codes are on the BMP and thus supported by this API. 252 * 253 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 254 */ 255 public static CharMatcher javaIsoControl() { 256 return JavaIsoControl.INSTANCE; 257 } 258 259 /** 260 * Determines whether a character is invisible; that is, if its Unicode category is any of 261 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 262 * PRIVATE_USE according to ICU4J. 263 * 264 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 265 * 266 * @deprecated Most invisible characters are supplementary characters; see the class 267 * documentation. 268 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 269 */ 270 @Deprecated 271 public static CharMatcher invisible() { 272 return Invisible.INSTANCE; 273 } 274 275 /** 276 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 277 * errs on the side of returning {@code false} (that is, it tends to assume a character is 278 * double-width). 279 * 280 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 281 * date. 282 * 283 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 284 * 285 * @deprecated Many such characters are supplementary characters; see the class documentation. 286 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 287 */ 288 @Deprecated 289 public static CharMatcher singleWidth() { 290 return SingleWidth.INSTANCE; 291 } 292 293 // Static factories 294 295 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 296 public static CharMatcher is(final char match) { 297 return new Is(match); 298 } 299 300 /** 301 * Returns a {@code char} matcher that matches any character except the BMP character specified. 302 * 303 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 304 */ 305 public static CharMatcher isNot(final char match) { 306 return new IsNot(match); 307 } 308 309 /** 310 * Returns a {@code char} matcher that matches any BMP character present in the given character 311 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 312 */ 313 public static CharMatcher anyOf(final CharSequence sequence) { 314 switch (sequence.length()) { 315 case 0: 316 return none(); 317 case 1: 318 return is(sequence.charAt(0)); 319 case 2: 320 return isEither(sequence.charAt(0), sequence.charAt(1)); 321 default: 322 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 323 // matcher? 324 return new AnyOf(sequence); 325 } 326 } 327 328 /** 329 * Returns a {@code char} matcher that matches any BMP character not present in the given 330 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 331 */ 332 public static CharMatcher noneOf(CharSequence sequence) { 333 return anyOf(sequence).negate(); 334 } 335 336 /** 337 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 338 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 339 * CharMatcher.inRange('a', 'z')}. 340 * 341 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 342 */ 343 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 344 return new InRange(startInclusive, endInclusive); 345 } 346 347 /** 348 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 349 * which operates on primitive {@code char} instances instead. 350 */ 351 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 352 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 353 } 354 355 // Constructors 356 357 /** 358 * Constructor for use by subclasses. When subclassing, you may want to override {@code 359 * toString()} to provide a useful description. 360 */ 361 protected CharMatcher() {} 362 363 // Abstract methods 364 365 /** Determines a true or false value for the given character. */ 366 public abstract boolean matches(char c); 367 368 // Non-static factories 369 370 /** Returns a matcher that matches any character not matched by this matcher. */ 371 // @Override under Java 8 but not under Java 7 372 @Override 373 public CharMatcher negate() { 374 return new Negated(this); 375 } 376 377 /** 378 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 379 */ 380 public CharMatcher and(CharMatcher other) { 381 return new And(this, other); 382 } 383 384 /** 385 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 386 */ 387 public CharMatcher or(CharMatcher other) { 388 return new Or(this, other); 389 } 390 391 /** 392 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 393 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 394 * worthwhile only if the precomputed matcher is queried many thousands of times. 395 * 396 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 397 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 398 * worthwhile tradeoff in a browser. 399 */ 400 public CharMatcher precomputed() { 401 return Platform.precomputeCharMatcher(this); 402 } 403 404 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 405 406 /** 407 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 408 * on {@link Platform} so that we can have different behavior in GWT. 409 * 410 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 411 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 412 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 413 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 414 * matcher which is faster to query than the original. 415 */ 416 @J2ktIncompatible 417 @GwtIncompatible // SmallCharMatcher 418 CharMatcher precomputedInternal() { 419 final BitSet table = new BitSet(); 420 setBits(table); 421 int totalCharacters = table.cardinality(); 422 if (totalCharacters * 2 <= DISTINCT_CHARS) { 423 return precomputedPositive(totalCharacters, table, toString()); 424 } else { 425 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 426 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 427 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 428 String suffix = ".negate()"; 429 final String description = toString(); 430 String negatedDescription = 431 description.endsWith(suffix) 432 ? description.substring(0, description.length() - suffix.length()) 433 : description + suffix; 434 return new NegatedFastMatcher( 435 precomputedPositive(negatedCharacters, table, negatedDescription)) { 436 @Override 437 public String toString() { 438 return description; 439 } 440 }; 441 } 442 } 443 444 /** 445 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 446 */ 447 @J2ktIncompatible 448 @GwtIncompatible // SmallCharMatcher 449 private static CharMatcher precomputedPositive( 450 int totalCharacters, BitSet table, String description) { 451 switch (totalCharacters) { 452 case 0: 453 return none(); 454 case 1: 455 return is((char) table.nextSetBit(0)); 456 case 2: 457 char c1 = (char) table.nextSetBit(0); 458 char c2 = (char) table.nextSetBit(c1 + 1); 459 return isEither(c1, c2); 460 default: 461 return isSmall(totalCharacters, table.length()) 462 ? SmallCharMatcher.from(table, description) 463 : new BitSetMatcher(table, description); 464 } 465 } 466 467 @J2ktIncompatible 468 @GwtIncompatible // SmallCharMatcher 469 private static boolean isSmall(int totalCharacters, int tableLength) { 470 return totalCharacters <= SmallCharMatcher.MAX_SIZE 471 && tableLength > (totalCharacters * 4 * Character.SIZE); 472 // err on the side of BitSetMatcher 473 } 474 475 /** Sets bits in {@code table} matched by this matcher. */ 476 @J2ktIncompatible 477 @GwtIncompatible // used only from other GwtIncompatible code 478 void setBits(BitSet table) { 479 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 480 if (matches((char) c)) { 481 table.set(c); 482 } 483 } 484 } 485 486 // Text processing routines 487 488 /** 489 * Returns {@code true} if a character sequence contains at least one matching BMP character. 490 * Equivalent to {@code !matchesNoneOf(sequence)}. 491 * 492 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 493 * character, until this returns {@code true} or the end is reached. 494 * 495 * @param sequence the character sequence to examine, possibly empty 496 * @return {@code true} if this matcher matches at least one character in the sequence 497 * @since 8.0 498 */ 499 public boolean matchesAnyOf(CharSequence sequence) { 500 return !matchesNoneOf(sequence); 501 } 502 503 /** 504 * Returns {@code true} if a character sequence contains only matching BMP characters. 505 * 506 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 507 * character, until this returns {@code false} or the end is reached. 508 * 509 * @param sequence the character sequence to examine, possibly empty 510 * @return {@code true} if this matcher matches every character in the sequence, including when 511 * the sequence is empty 512 */ 513 public boolean matchesAllOf(CharSequence sequence) { 514 for (int i = sequence.length() - 1; i >= 0; i--) { 515 if (!matches(sequence.charAt(i))) { 516 return false; 517 } 518 } 519 return true; 520 } 521 522 /** 523 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 524 * {@code !matchesAnyOf(sequence)}. 525 * 526 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 527 * character, until this returns {@code true} or the end is reached. 528 * 529 * @param sequence the character sequence to examine, possibly empty 530 * @return {@code true} if this matcher matches no characters in the sequence, including when the 531 * sequence is empty 532 */ 533 public boolean matchesNoneOf(CharSequence sequence) { 534 return indexIn(sequence) == -1; 535 } 536 537 /** 538 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 539 * no matching character is present. 540 * 541 * <p>The default implementation iterates over the sequence in forward order calling {@link 542 * #matches} for each character. 543 * 544 * @param sequence the character sequence to examine from the beginning 545 * @return an index, or {@code -1} if no character matches 546 */ 547 public int indexIn(CharSequence sequence) { 548 return indexIn(sequence, 0); 549 } 550 551 /** 552 * Returns the index of the first matching BMP character in a character sequence, starting from a 553 * given position, or {@code -1} if no character matches after that position. 554 * 555 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 556 * start}, calling {@link #matches} for each character. 557 * 558 * @param sequence the character sequence to examine 559 * @param start the first index to examine; must be nonnegative and no greater than {@code 560 * sequence.length()} 561 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 562 * or {@code -1} if no character matches 563 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 564 * sequence.length()} 565 */ 566 public int indexIn(CharSequence sequence, int start) { 567 int length = sequence.length(); 568 checkPositionIndex(start, length); 569 for (int i = start; i < length; i++) { 570 if (matches(sequence.charAt(i))) { 571 return i; 572 } 573 } 574 return -1; 575 } 576 577 /** 578 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 579 * no matching character is present. 580 * 581 * <p>The default implementation iterates over the sequence in reverse order calling {@link 582 * #matches} for each character. 583 * 584 * @param sequence the character sequence to examine from the end 585 * @return an index, or {@code -1} if no character matches 586 */ 587 public int lastIndexIn(CharSequence sequence) { 588 for (int i = sequence.length() - 1; i >= 0; i--) { 589 if (matches(sequence.charAt(i))) { 590 return i; 591 } 592 } 593 return -1; 594 } 595 596 /** 597 * Returns the number of matching {@code char}s found in a character sequence. 598 * 599 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 600 */ 601 public int countIn(CharSequence sequence) { 602 int count = 0; 603 for (int i = 0; i < sequence.length(); i++) { 604 if (matches(sequence.charAt(i))) { 605 count++; 606 } 607 } 608 return count; 609 } 610 611 /** 612 * Returns a string containing all non-matching characters of a character sequence, in order. For 613 * example: 614 * 615 * <pre>{@code 616 * CharMatcher.is('a').removeFrom("bazaar") 617 * }</pre> 618 * 619 * ... returns {@code "bzr"}. 620 */ 621 public String removeFrom(CharSequence sequence) { 622 String string = sequence.toString(); 623 int pos = indexIn(string); 624 if (pos == -1) { 625 return string; 626 } 627 628 char[] chars = string.toCharArray(); 629 int spread = 1; 630 631 // This unusual loop comes from extensive benchmarking 632 OUT: 633 while (true) { 634 pos++; 635 while (true) { 636 if (pos == chars.length) { 637 break OUT; 638 } 639 if (matches(chars[pos])) { 640 break; 641 } 642 chars[pos - spread] = chars[pos]; 643 pos++; 644 } 645 spread++; 646 } 647 return new String(chars, 0, pos - spread); 648 } 649 650 /** 651 * Returns a string containing all matching BMP characters of a character sequence, in order. For 652 * example: 653 * 654 * <pre>{@code 655 * CharMatcher.is('a').retainFrom("bazaar") 656 * }</pre> 657 * 658 * ... returns {@code "aaa"}. 659 */ 660 public String retainFrom(CharSequence sequence) { 661 return negate().removeFrom(sequence); 662 } 663 664 /** 665 * Returns a string copy of the input character sequence, with each matching BMP character 666 * replaced by a given replacement character. For example: 667 * 668 * <pre>{@code 669 * CharMatcher.is('a').replaceFrom("radar", 'o') 670 * }</pre> 671 * 672 * ... returns {@code "rodor"}. 673 * 674 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 675 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 676 * character. 677 * 678 * @param sequence the character sequence to replace matching characters in 679 * @param replacement the character to append to the result string in place of each matching 680 * character in {@code sequence} 681 * @return the new string 682 */ 683 public String replaceFrom(CharSequence sequence, char replacement) { 684 String string = sequence.toString(); 685 int pos = indexIn(string); 686 if (pos == -1) { 687 return string; 688 } 689 char[] chars = string.toCharArray(); 690 chars[pos] = replacement; 691 for (int i = pos + 1; i < chars.length; i++) { 692 if (matches(chars[i])) { 693 chars[i] = replacement; 694 } 695 } 696 return new String(chars); 697 } 698 699 /** 700 * Returns a string copy of the input character sequence, with each matching BMP character 701 * replaced by a given replacement sequence. For example: 702 * 703 * <pre>{@code 704 * CharMatcher.is('a').replaceFrom("yaha", "oo") 705 * }</pre> 706 * 707 * ... returns {@code "yoohoo"}. 708 * 709 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 710 * off calling {@link #replaceFrom(CharSequence, char)} directly. 711 * 712 * @param sequence the character sequence to replace matching characters in 713 * @param replacement the characters to append to the result string in place of each matching 714 * character in {@code sequence} 715 * @return the new string 716 */ 717 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 718 int replacementLen = replacement.length(); 719 if (replacementLen == 0) { 720 return removeFrom(sequence); 721 } 722 if (replacementLen == 1) { 723 return replaceFrom(sequence, replacement.charAt(0)); 724 } 725 726 String string = sequence.toString(); 727 int pos = indexIn(string); 728 if (pos == -1) { 729 return string; 730 } 731 732 int len = string.length(); 733 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 734 735 int oldpos = 0; 736 do { 737 buf.append(string, oldpos, pos); 738 buf.append(replacement); 739 oldpos = pos + 1; 740 pos = indexIn(string, oldpos); 741 } while (pos != -1); 742 743 buf.append(string, oldpos, len); 744 return buf.toString(); 745 } 746 747 /** 748 * Returns a substring of the input character sequence that omits all matching BMP characters from 749 * the beginning and from the end of the string. For example: 750 * 751 * <pre>{@code 752 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 753 * }</pre> 754 * 755 * ... returns {@code "cat"}. 756 * 757 * <p>Note that: 758 * 759 * <pre>{@code 760 * CharMatcher.inRange('\0', ' ').trimFrom(str) 761 * }</pre> 762 * 763 * ... is equivalent to {@link String#trim()}. 764 */ 765 public String trimFrom(CharSequence sequence) { 766 int len = sequence.length(); 767 int first; 768 int last; 769 770 for (first = 0; first < len; first++) { 771 if (!matches(sequence.charAt(first))) { 772 break; 773 } 774 } 775 for (last = len - 1; last > first; last--) { 776 if (!matches(sequence.charAt(last))) { 777 break; 778 } 779 } 780 781 return sequence.subSequence(first, last + 1).toString(); 782 } 783 784 /** 785 * Returns a substring of the input character sequence that omits all matching BMP characters from 786 * the beginning of the string. For example: 787 * 788 * <pre>{@code 789 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 790 * }</pre> 791 * 792 * ... returns {@code "catbab"}. 793 */ 794 public String trimLeadingFrom(CharSequence sequence) { 795 int len = sequence.length(); 796 for (int first = 0; first < len; first++) { 797 if (!matches(sequence.charAt(first))) { 798 return sequence.subSequence(first, len).toString(); 799 } 800 } 801 return ""; 802 } 803 804 /** 805 * Returns a substring of the input character sequence that omits all matching BMP characters from 806 * the end of the string. For example: 807 * 808 * <pre>{@code 809 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 810 * }</pre> 811 * 812 * ... returns {@code "abacat"}. 813 */ 814 public String trimTrailingFrom(CharSequence sequence) { 815 int len = sequence.length(); 816 for (int last = len - 1; last >= 0; last--) { 817 if (!matches(sequence.charAt(last))) { 818 return sequence.subSequence(0, last + 1).toString(); 819 } 820 } 821 return ""; 822 } 823 824 /** 825 * Returns a string copy of the input character sequence, with each group of consecutive matching 826 * BMP characters replaced by a single replacement character. For example: 827 * 828 * <pre>{@code 829 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 830 * }</pre> 831 * 832 * ... returns {@code "b-p-r"}. 833 * 834 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 835 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 836 * character. 837 * 838 * @param sequence the character sequence to replace matching groups of characters in 839 * @param replacement the character to append to the result string in place of each group of 840 * matching characters in {@code sequence} 841 * @return the new string 842 */ 843 public String collapseFrom(CharSequence sequence, char replacement) { 844 // This implementation avoids unnecessary allocation. 845 int len = sequence.length(); 846 for (int i = 0; i < len; i++) { 847 char c = sequence.charAt(i); 848 if (matches(c)) { 849 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 850 // a no-op replacement 851 i++; 852 } else { 853 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 854 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 855 } 856 } 857 } 858 // no replacement needed 859 return sequence.toString(); 860 } 861 862 /** 863 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 864 * groups of matching BMP characters at the start or end of the sequence are removed without 865 * replacement. 866 */ 867 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 868 // This implementation avoids unnecessary allocation. 869 int len = sequence.length(); 870 int first = 0; 871 int last = len - 1; 872 873 while (first < len && matches(sequence.charAt(first))) { 874 first++; 875 } 876 877 while (last > first && matches(sequence.charAt(last))) { 878 last--; 879 } 880 881 return (first == 0 && last == len - 1) 882 ? collapseFrom(sequence, replacement) 883 : finishCollapseFrom( 884 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 885 } 886 887 private String finishCollapseFrom( 888 CharSequence sequence, 889 int start, 890 int end, 891 char replacement, 892 StringBuilder builder, 893 boolean inMatchingGroup) { 894 for (int i = start; i < end; i++) { 895 char c = sequence.charAt(i); 896 if (matches(c)) { 897 if (!inMatchingGroup) { 898 builder.append(replacement); 899 inMatchingGroup = true; 900 } 901 } else { 902 builder.append(c); 903 inMatchingGroup = false; 904 } 905 } 906 return builder.toString(); 907 } 908 909 /** 910 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 911 * instead. 912 */ 913 @Deprecated 914 @Override 915 public boolean apply(Character character) { 916 return matches(character); 917 } 918 919 /** 920 * Returns a string representation of this {@code CharMatcher}, such as {@code 921 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 922 */ 923 @Override 924 public String toString() { 925 return super.toString(); 926 } 927 928 /** 929 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 930 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 931 */ 932 private static String showCharacter(char c) { 933 String hex = "0123456789ABCDEF"; 934 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 935 for (int i = 0; i < 4; i++) { 936 tmp[5 - i] = hex.charAt(c & 0xF); 937 c = (char) (c >> 4); 938 } 939 return String.copyValueOf(tmp); 940 } 941 942 // Fast matchers 943 944 /** A matcher for which precomputation will not yield any significant benefit. */ 945 abstract static class FastMatcher extends CharMatcher { 946 947 @Override 948 public final CharMatcher precomputed() { 949 return this; 950 } 951 952 @Override 953 public CharMatcher negate() { 954 return new NegatedFastMatcher(this); 955 } 956 } 957 958 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 959 abstract static class NamedFastMatcher extends FastMatcher { 960 961 private final String description; 962 963 NamedFastMatcher(String description) { 964 this.description = checkNotNull(description); 965 } 966 967 @Override 968 public final String toString() { 969 return description; 970 } 971 } 972 973 /** Negation of a {@link FastMatcher}. */ 974 static class NegatedFastMatcher extends Negated { 975 976 NegatedFastMatcher(CharMatcher original) { 977 super(original); 978 } 979 980 @Override 981 public final CharMatcher precomputed() { 982 return this; 983 } 984 } 985 986 /** Fast matcher using a {@link BitSet} table of matching characters. */ 987 @J2ktIncompatible 988 @GwtIncompatible // used only from other GwtIncompatible code 989 private static final class BitSetMatcher extends NamedFastMatcher { 990 991 private final BitSet table; 992 993 private BitSetMatcher(BitSet table, String description) { 994 super(description); 995 if (table.length() + Long.SIZE < table.size()) { 996 table = (BitSet) table.clone(); 997 // If only we could actually call BitSet.trimToSize() ourselves... 998 } 999 this.table = table; 1000 } 1001 1002 @Override 1003 public boolean matches(char c) { 1004 return table.get(c); 1005 } 1006 1007 @Override 1008 void setBits(BitSet bitSet) { 1009 bitSet.or(table); 1010 } 1011 } 1012 1013 // Static constant implementation classes 1014 1015 /** Implementation of {@link #any()}. */ 1016 private static final class Any extends NamedFastMatcher { 1017 1018 static final Any INSTANCE = new Any(); 1019 1020 private Any() { 1021 super("CharMatcher.any()"); 1022 } 1023 1024 @Override 1025 public boolean matches(char c) { 1026 return true; 1027 } 1028 1029 @Override 1030 public int indexIn(CharSequence sequence) { 1031 return (sequence.length() == 0) ? -1 : 0; 1032 } 1033 1034 @Override 1035 public int indexIn(CharSequence sequence, int start) { 1036 int length = sequence.length(); 1037 checkPositionIndex(start, length); 1038 return (start == length) ? -1 : start; 1039 } 1040 1041 @Override 1042 public int lastIndexIn(CharSequence sequence) { 1043 return sequence.length() - 1; 1044 } 1045 1046 @Override 1047 public boolean matchesAllOf(CharSequence sequence) { 1048 checkNotNull(sequence); 1049 return true; 1050 } 1051 1052 @Override 1053 public boolean matchesNoneOf(CharSequence sequence) { 1054 return sequence.length() == 0; 1055 } 1056 1057 @Override 1058 public String removeFrom(CharSequence sequence) { 1059 checkNotNull(sequence); 1060 return ""; 1061 } 1062 1063 @Override 1064 public String replaceFrom(CharSequence sequence, char replacement) { 1065 char[] array = new char[sequence.length()]; 1066 Arrays.fill(array, replacement); 1067 return new String(array); 1068 } 1069 1070 @Override 1071 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1072 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1073 for (int i = 0; i < sequence.length(); i++) { 1074 result.append(replacement); 1075 } 1076 return result.toString(); 1077 } 1078 1079 @Override 1080 public String collapseFrom(CharSequence sequence, char replacement) { 1081 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1082 } 1083 1084 @Override 1085 public String trimFrom(CharSequence sequence) { 1086 checkNotNull(sequence); 1087 return ""; 1088 } 1089 1090 @Override 1091 public int countIn(CharSequence sequence) { 1092 return sequence.length(); 1093 } 1094 1095 @Override 1096 public CharMatcher and(CharMatcher other) { 1097 return checkNotNull(other); 1098 } 1099 1100 @Override 1101 public CharMatcher or(CharMatcher other) { 1102 checkNotNull(other); 1103 return this; 1104 } 1105 1106 @Override 1107 public CharMatcher negate() { 1108 return none(); 1109 } 1110 } 1111 1112 /** Implementation of {@link #none()}. */ 1113 private static final class None extends NamedFastMatcher { 1114 1115 static final None INSTANCE = new None(); 1116 1117 private None() { 1118 super("CharMatcher.none()"); 1119 } 1120 1121 @Override 1122 public boolean matches(char c) { 1123 return false; 1124 } 1125 1126 @Override 1127 public int indexIn(CharSequence sequence) { 1128 checkNotNull(sequence); 1129 return -1; 1130 } 1131 1132 @Override 1133 public int indexIn(CharSequence sequence, int start) { 1134 int length = sequence.length(); 1135 checkPositionIndex(start, length); 1136 return -1; 1137 } 1138 1139 @Override 1140 public int lastIndexIn(CharSequence sequence) { 1141 checkNotNull(sequence); 1142 return -1; 1143 } 1144 1145 @Override 1146 public boolean matchesAllOf(CharSequence sequence) { 1147 return sequence.length() == 0; 1148 } 1149 1150 @Override 1151 public boolean matchesNoneOf(CharSequence sequence) { 1152 checkNotNull(sequence); 1153 return true; 1154 } 1155 1156 @Override 1157 public String removeFrom(CharSequence sequence) { 1158 return sequence.toString(); 1159 } 1160 1161 @Override 1162 public String replaceFrom(CharSequence sequence, char replacement) { 1163 return sequence.toString(); 1164 } 1165 1166 @Override 1167 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1168 checkNotNull(replacement); 1169 return sequence.toString(); 1170 } 1171 1172 @Override 1173 public String collapseFrom(CharSequence sequence, char replacement) { 1174 return sequence.toString(); 1175 } 1176 1177 @Override 1178 public String trimFrom(CharSequence sequence) { 1179 return sequence.toString(); 1180 } 1181 1182 @Override 1183 public String trimLeadingFrom(CharSequence sequence) { 1184 return sequence.toString(); 1185 } 1186 1187 @Override 1188 public String trimTrailingFrom(CharSequence sequence) { 1189 return sequence.toString(); 1190 } 1191 1192 @Override 1193 public int countIn(CharSequence sequence) { 1194 checkNotNull(sequence); 1195 return 0; 1196 } 1197 1198 @Override 1199 public CharMatcher and(CharMatcher other) { 1200 checkNotNull(other); 1201 return this; 1202 } 1203 1204 @Override 1205 public CharMatcher or(CharMatcher other) { 1206 return checkNotNull(other); 1207 } 1208 1209 @Override 1210 public CharMatcher negate() { 1211 return any(); 1212 } 1213 } 1214 1215 /** Implementation of {@link #whitespace()}. */ 1216 @VisibleForTesting 1217 static final class Whitespace extends NamedFastMatcher { 1218 1219 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1220 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1221 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1222 // other use-cases, the reduction in readability isn't worth it. 1223 static final String TABLE = 1224 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1225 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1226 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1227 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1228 static final int MULTIPLIER = 1682554634; 1229 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1230 1231 static final Whitespace INSTANCE = new Whitespace(); 1232 1233 Whitespace() { 1234 super("CharMatcher.whitespace()"); 1235 } 1236 1237 @Override 1238 public boolean matches(char c) { 1239 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1240 } 1241 1242 @J2ktIncompatible 1243 @GwtIncompatible // used only from other GwtIncompatible code 1244 @Override 1245 void setBits(BitSet table) { 1246 for (int i = 0; i < TABLE.length(); i++) { 1247 table.set(TABLE.charAt(i)); 1248 } 1249 } 1250 } 1251 1252 /** Implementation of {@link #breakingWhitespace()}. */ 1253 private static final class BreakingWhitespace extends CharMatcher { 1254 1255 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1256 1257 @Override 1258 public boolean matches(char c) { 1259 switch (c) { 1260 case '\t': 1261 case '\n': 1262 case '\013': 1263 case '\f': 1264 case '\r': 1265 case ' ': 1266 case '\u0085': 1267 case '\u1680': 1268 case '\u2028': 1269 case '\u2029': 1270 case '\u205f': 1271 case '\u3000': 1272 return true; 1273 case '\u2007': 1274 return false; 1275 default: 1276 return c >= '\u2000' && c <= '\u200a'; 1277 } 1278 } 1279 1280 @Override 1281 public String toString() { 1282 return "CharMatcher.breakingWhitespace()"; 1283 } 1284 } 1285 1286 /** Implementation of {@link #ascii()}. */ 1287 private static final class Ascii extends NamedFastMatcher { 1288 1289 static final Ascii INSTANCE = new Ascii(); 1290 1291 Ascii() { 1292 super("CharMatcher.ascii()"); 1293 } 1294 1295 @Override 1296 public boolean matches(char c) { 1297 return c <= '\u007f'; 1298 } 1299 } 1300 1301 /** Implementation that matches characters that fall within multiple ranges. */ 1302 private static class RangesMatcher extends CharMatcher { 1303 1304 private final String description; 1305 private final char[] rangeStarts; 1306 private final char[] rangeEnds; 1307 1308 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1309 this.description = description; 1310 this.rangeStarts = rangeStarts; 1311 this.rangeEnds = rangeEnds; 1312 checkArgument(rangeStarts.length == rangeEnds.length); 1313 for (int i = 0; i < rangeStarts.length; i++) { 1314 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1315 if (i + 1 < rangeStarts.length) { 1316 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1317 } 1318 } 1319 } 1320 1321 @Override 1322 public boolean matches(char c) { 1323 int index = Arrays.binarySearch(rangeStarts, c); 1324 if (index >= 0) { 1325 return true; 1326 } else { 1327 index = ~index - 1; 1328 return index >= 0 && c <= rangeEnds[index]; 1329 } 1330 } 1331 1332 @Override 1333 public String toString() { 1334 return description; 1335 } 1336 } 1337 1338 /** Implementation of {@link #digit()}. */ 1339 private static final class Digit extends RangesMatcher { 1340 // Plug the following UnicodeSet pattern into 1341 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1342 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1343 // and get the zeroes from there. 1344 1345 // Must be in ascending order. 1346 private static final String ZEROES = 1347 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1348 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1349 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1350 1351 private static char[] zeroes() { 1352 return ZEROES.toCharArray(); 1353 } 1354 1355 private static char[] nines() { 1356 char[] nines = new char[ZEROES.length()]; 1357 for (int i = 0; i < ZEROES.length(); i++) { 1358 nines[i] = (char) (ZEROES.charAt(i) + 9); 1359 } 1360 return nines; 1361 } 1362 1363 static final Digit INSTANCE = new Digit(); 1364 1365 private Digit() { 1366 super("CharMatcher.digit()", zeroes(), nines()); 1367 } 1368 } 1369 1370 /** Implementation of {@link #javaDigit()}. */ 1371 private static final class JavaDigit extends CharMatcher { 1372 1373 static final JavaDigit INSTANCE = new JavaDigit(); 1374 1375 @Override 1376 public boolean matches(char c) { 1377 return Character.isDigit(c); 1378 } 1379 1380 @Override 1381 public String toString() { 1382 return "CharMatcher.javaDigit()"; 1383 } 1384 } 1385 1386 /** Implementation of {@link #javaLetter()}. */ 1387 private static final class JavaLetter extends CharMatcher { 1388 1389 static final JavaLetter INSTANCE = new JavaLetter(); 1390 1391 @Override 1392 public boolean matches(char c) { 1393 return Character.isLetter(c); 1394 } 1395 1396 @Override 1397 public String toString() { 1398 return "CharMatcher.javaLetter()"; 1399 } 1400 } 1401 1402 /** Implementation of {@link #javaLetterOrDigit()}. */ 1403 private static final class JavaLetterOrDigit extends CharMatcher { 1404 1405 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1406 1407 @Override 1408 public boolean matches(char c) { 1409 return Character.isLetterOrDigit(c); 1410 } 1411 1412 @Override 1413 public String toString() { 1414 return "CharMatcher.javaLetterOrDigit()"; 1415 } 1416 } 1417 1418 /** Implementation of {@link #javaUpperCase()}. */ 1419 private static final class JavaUpperCase extends CharMatcher { 1420 1421 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1422 1423 @Override 1424 public boolean matches(char c) { 1425 return Character.isUpperCase(c); 1426 } 1427 1428 @Override 1429 public String toString() { 1430 return "CharMatcher.javaUpperCase()"; 1431 } 1432 } 1433 1434 /** Implementation of {@link #javaLowerCase()}. */ 1435 private static final class JavaLowerCase extends CharMatcher { 1436 1437 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1438 1439 @Override 1440 public boolean matches(char c) { 1441 return Character.isLowerCase(c); 1442 } 1443 1444 @Override 1445 public String toString() { 1446 return "CharMatcher.javaLowerCase()"; 1447 } 1448 } 1449 1450 /** Implementation of {@link #javaIsoControl()}. */ 1451 private static final class JavaIsoControl extends NamedFastMatcher { 1452 1453 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1454 1455 private JavaIsoControl() { 1456 super("CharMatcher.javaIsoControl()"); 1457 } 1458 1459 @Override 1460 public boolean matches(char c) { 1461 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1462 } 1463 } 1464 1465 /** Implementation of {@link #invisible()}. */ 1466 private static final class Invisible extends RangesMatcher { 1467 // Plug the following UnicodeSet pattern into 1468 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1469 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1470 // with the "Abbreviate" option, and get the ranges from there. 1471 private static final String RANGE_STARTS = 1472 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1473 + "\u3000\ud800\ufeff\ufff9"; 1474 private static final String RANGE_ENDS = // inclusive ends 1475 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1476 + "\u3000\uf8ff\ufeff\ufffb"; 1477 1478 static final Invisible INSTANCE = new Invisible(); 1479 1480 private Invisible() { 1481 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1482 } 1483 } 1484 1485 /** Implementation of {@link #singleWidth()}. */ 1486 private static final class SingleWidth extends RangesMatcher { 1487 1488 static final SingleWidth INSTANCE = new SingleWidth(); 1489 1490 private SingleWidth() { 1491 super( 1492 "CharMatcher.singleWidth()", 1493 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1494 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1495 } 1496 } 1497 1498 // Non-static factory implementation classes 1499 1500 /** Implementation of {@link #negate()}. */ 1501 private static class Negated extends CharMatcher { 1502 1503 final CharMatcher original; 1504 1505 Negated(CharMatcher original) { 1506 this.original = checkNotNull(original); 1507 } 1508 1509 @Override 1510 public boolean matches(char c) { 1511 return !original.matches(c); 1512 } 1513 1514 @Override 1515 public boolean matchesAllOf(CharSequence sequence) { 1516 return original.matchesNoneOf(sequence); 1517 } 1518 1519 @Override 1520 public boolean matchesNoneOf(CharSequence sequence) { 1521 return original.matchesAllOf(sequence); 1522 } 1523 1524 @Override 1525 public int countIn(CharSequence sequence) { 1526 return sequence.length() - original.countIn(sequence); 1527 } 1528 1529 @J2ktIncompatible 1530 @GwtIncompatible // used only from other GwtIncompatible code 1531 @Override 1532 void setBits(BitSet table) { 1533 BitSet tmp = new BitSet(); 1534 original.setBits(tmp); 1535 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1536 table.or(tmp); 1537 } 1538 1539 @Override 1540 public CharMatcher negate() { 1541 return original; 1542 } 1543 1544 @Override 1545 public String toString() { 1546 return original + ".negate()"; 1547 } 1548 } 1549 1550 /** Implementation of {@link #and(CharMatcher)}. */ 1551 private static final class And extends CharMatcher { 1552 1553 final CharMatcher first; 1554 final CharMatcher second; 1555 1556 And(CharMatcher a, CharMatcher b) { 1557 first = checkNotNull(a); 1558 second = checkNotNull(b); 1559 } 1560 1561 @Override 1562 public boolean matches(char c) { 1563 return first.matches(c) && second.matches(c); 1564 } 1565 1566 @J2ktIncompatible 1567 @GwtIncompatible // used only from other GwtIncompatible code 1568 @Override 1569 void setBits(BitSet table) { 1570 BitSet tmp1 = new BitSet(); 1571 first.setBits(tmp1); 1572 BitSet tmp2 = new BitSet(); 1573 second.setBits(tmp2); 1574 tmp1.and(tmp2); 1575 table.or(tmp1); 1576 } 1577 1578 @Override 1579 public String toString() { 1580 return "CharMatcher.and(" + first + ", " + second + ")"; 1581 } 1582 } 1583 1584 /** Implementation of {@link #or(CharMatcher)}. */ 1585 private static final class Or extends CharMatcher { 1586 1587 final CharMatcher first; 1588 final CharMatcher second; 1589 1590 Or(CharMatcher a, CharMatcher b) { 1591 first = checkNotNull(a); 1592 second = checkNotNull(b); 1593 } 1594 1595 @J2ktIncompatible 1596 @GwtIncompatible // used only from other GwtIncompatible code 1597 @Override 1598 void setBits(BitSet table) { 1599 first.setBits(table); 1600 second.setBits(table); 1601 } 1602 1603 @Override 1604 public boolean matches(char c) { 1605 return first.matches(c) || second.matches(c); 1606 } 1607 1608 @Override 1609 public String toString() { 1610 return "CharMatcher.or(" + first + ", " + second + ")"; 1611 } 1612 } 1613 1614 // Static factory implementations 1615 1616 /** Implementation of {@link #is(char)}. */ 1617 private static final class Is extends FastMatcher { 1618 1619 private final char match; 1620 1621 Is(char match) { 1622 this.match = match; 1623 } 1624 1625 @Override 1626 public boolean matches(char c) { 1627 return c == match; 1628 } 1629 1630 @Override 1631 public String replaceFrom(CharSequence sequence, char replacement) { 1632 return sequence.toString().replace(match, replacement); 1633 } 1634 1635 @Override 1636 public CharMatcher and(CharMatcher other) { 1637 return other.matches(match) ? this : none(); 1638 } 1639 1640 @Override 1641 public CharMatcher or(CharMatcher other) { 1642 return other.matches(match) ? other : super.or(other); 1643 } 1644 1645 @Override 1646 public CharMatcher negate() { 1647 return isNot(match); 1648 } 1649 1650 @J2ktIncompatible 1651 @GwtIncompatible // used only from other GwtIncompatible code 1652 @Override 1653 void setBits(BitSet table) { 1654 table.set(match); 1655 } 1656 1657 @Override 1658 public String toString() { 1659 return "CharMatcher.is('" + showCharacter(match) + "')"; 1660 } 1661 } 1662 1663 /** Implementation of {@link #isNot(char)}. */ 1664 private static final class IsNot extends FastMatcher { 1665 1666 private final char match; 1667 1668 IsNot(char match) { 1669 this.match = match; 1670 } 1671 1672 @Override 1673 public boolean matches(char c) { 1674 return c != match; 1675 } 1676 1677 @Override 1678 public CharMatcher and(CharMatcher other) { 1679 return other.matches(match) ? super.and(other) : other; 1680 } 1681 1682 @Override 1683 public CharMatcher or(CharMatcher other) { 1684 return other.matches(match) ? any() : this; 1685 } 1686 1687 @J2ktIncompatible 1688 @GwtIncompatible // used only from other GwtIncompatible code 1689 @Override 1690 void setBits(BitSet table) { 1691 table.set(0, match); 1692 table.set(match + 1, Character.MAX_VALUE + 1); 1693 } 1694 1695 @Override 1696 public CharMatcher negate() { 1697 return is(match); 1698 } 1699 1700 @Override 1701 public String toString() { 1702 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1703 } 1704 } 1705 1706 private static CharMatcher.IsEither isEither(char c1, char c2) { 1707 return new CharMatcher.IsEither(c1, c2); 1708 } 1709 1710 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1711 private static final class IsEither extends FastMatcher { 1712 1713 private final char match1; 1714 private final char match2; 1715 1716 IsEither(char match1, char match2) { 1717 this.match1 = match1; 1718 this.match2 = match2; 1719 } 1720 1721 @Override 1722 public boolean matches(char c) { 1723 return c == match1 || c == match2; 1724 } 1725 1726 @J2ktIncompatible 1727 @GwtIncompatible // used only from other GwtIncompatible code 1728 @Override 1729 void setBits(BitSet table) { 1730 table.set(match1); 1731 table.set(match2); 1732 } 1733 1734 @Override 1735 public String toString() { 1736 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1737 } 1738 } 1739 1740 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ 1741 private static final class AnyOf extends CharMatcher { 1742 1743 private final char[] chars; 1744 1745 public AnyOf(CharSequence chars) { 1746 this.chars = chars.toString().toCharArray(); 1747 Arrays.sort(this.chars); 1748 } 1749 1750 @Override 1751 public boolean matches(char c) { 1752 return Arrays.binarySearch(chars, c) >= 0; 1753 } 1754 1755 @Override 1756 @J2ktIncompatible 1757 @GwtIncompatible // used only from other GwtIncompatible code 1758 void setBits(BitSet table) { 1759 for (char c : chars) { 1760 table.set(c); 1761 } 1762 } 1763 1764 @Override 1765 public String toString() { 1766 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 1767 for (char c : chars) { 1768 description.append(showCharacter(c)); 1769 } 1770 description.append("\")"); 1771 return description.toString(); 1772 } 1773 } 1774 1775 /** Implementation of {@link #inRange(char, char)}. */ 1776 private static final class InRange extends FastMatcher { 1777 1778 private final char startInclusive; 1779 private final char endInclusive; 1780 1781 InRange(char startInclusive, char endInclusive) { 1782 checkArgument(endInclusive >= startInclusive); 1783 this.startInclusive = startInclusive; 1784 this.endInclusive = endInclusive; 1785 } 1786 1787 @Override 1788 public boolean matches(char c) { 1789 return startInclusive <= c && c <= endInclusive; 1790 } 1791 1792 @J2ktIncompatible 1793 @GwtIncompatible // used only from other GwtIncompatible code 1794 @Override 1795 void setBits(BitSet table) { 1796 table.set(startInclusive, endInclusive + 1); 1797 } 1798 1799 @Override 1800 public String toString() { 1801 return "CharMatcher.inRange('" 1802 + showCharacter(startInclusive) 1803 + "', '" 1804 + showCharacter(endInclusive) 1805 + "')"; 1806 } 1807 } 1808 1809 /** Implementation of {@link #forPredicate(Predicate)}. */ 1810 private static final class ForPredicate extends CharMatcher { 1811 1812 private final Predicate<? super Character> predicate; 1813 1814 ForPredicate(Predicate<? super Character> predicate) { 1815 this.predicate = checkNotNull(predicate); 1816 } 1817 1818 @Override 1819 public boolean matches(char c) { 1820 return predicate.apply(c); 1821 } 1822 1823 @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily 1824 @Override 1825 public boolean apply(Character character) { 1826 return predicate.apply(checkNotNull(character)); 1827 } 1828 1829 @Override 1830 public String toString() { 1831 return "CharMatcher.forPredicate(" + predicate + ")"; 1832 } 1833 } 1834}