001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.J2ktIncompatible; 024import com.google.common.annotations.VisibleForTesting; 025import java.util.Arrays; 026import java.util.BitSet; 027 028/** 029 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 030 * for any {@link Object}. Also offers basic text processing methods based on this function. 031 * Implementations are strongly encouraged to be side-effect-free and immutable. 032 * 033 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 034 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 035 * 036 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 037 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. 
It does not understand
 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code
 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF}, which includes the majority of
 * assigned characters, including important CJK characters and emoji.
 *
 * <p>Supplementary characters are <a
 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded
 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as
 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s.
 *
 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for
 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For
 * basic text processing based on UnicodeSet, use the ICU4J UnicodeSetSpanner.
 *
 * <p>Example usages:
 *
 * <pre>
 *   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
 *   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
 *
 * <p>See the Guava User Guide article on <a
 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher}
 * </a>.
060 * 061 * @author Kevin Bourrillion 062 * @since 1.0 063 */ 064@GwtCompatible(emulated = true) 065@ElementTypesAreNonnullByDefault 066public abstract class CharMatcher implements Predicate<Character> { 067 /* 068 * N777777777NO 069 * N7777777777777N 070 * M777777777777777N 071 * $N877777777D77777M 072 * N M77777777ONND777M 073 * MN777777777NN D777 074 * N7ZN777777777NN ~M7778 075 * N777777777777MMNN88777N 076 * N777777777777MNZZZ7777O 077 * DZN7777O77777777777777 078 * N7OONND7777777D77777N 079 * 8$M++++?N???$77777$ 080 * M7++++N+M77777777N 081 * N77O777777777777$ M 082 * DNNM$$$$777777N D 083 * N$N:=N$777N7777M NZ 084 * 77Z::::N777777777 ODZZZ 085 * 77N::::::N77777777M NNZZZ$ 086 * $777:::::::77777777MN ZM8ZZZZZ 087 * 777M::::::Z7777777Z77 N++ZZZZNN 088 * 7777M:::::M7777777$777M $++IZZZZM 089 * M777$:::::N777777$M7777M +++++ZZZDN 090 * NN$::::::7777$$M777777N N+++ZZZZNZ 091 * N::::::N:7$O:77777777 N++++ZZZZN 092 * M::::::::::::N77777777+ +?+++++ZZZM 093 * 8::::::::::::D77777777M O+++++ZZ 094 * ::::::::::::M777777777N O+?D 095 * M:::::::::::M77777777778 77= 096 * D=::::::::::N7777777777N 777 097 * INN===::::::=77777777777N I777N 098 * ?777N========N7777777777787M N7777 099 * 77777$D======N77777777777N777N? N777777 100 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 101 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 102 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 103 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 104 * 7 :N MNN$$$$M$ $$$777$8 8D8I 105 * NMM.:7O 777777778 106 * 7777777MN 107 * M NO .7: 108 * M : M 109 * 8 110 */ 111 112 // Constant matcher factory methods 113 114 /** 115 * Matches any character. 116 * 117 * @since 19.0 (since 1.0 as constant {@code ANY}) 118 */ 119 public static CharMatcher any() { 120 return Any.INSTANCE; 121 } 122 123 /** 124 * Matches no characters. 
125 * 126 * @since 19.0 (since 1.0 as constant {@code NONE}) 127 */ 128 public static CharMatcher none() { 129 return None.INSTANCE; 130 } 131 132 /** 133 * Determines whether a character is whitespace according to the latest Unicode standard, as 134 * illustrated <a 135 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 136 * This is not the same definition used by other Java APIs. (See a <a 137 * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.) 138 * 139 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 140 * 141 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 142 * date. 143 * 144 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 145 */ 146 public static CharMatcher whitespace() { 147 return Whitespace.INSTANCE; 148 } 149 150 /** 151 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 152 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 153 * discussion of that term. 154 * 155 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 156 */ 157 public static CharMatcher breakingWhitespace() { 158 return BreakingWhitespace.INSTANCE; 159 } 160 161 /** 162 * Determines whether a character is ASCII, meaning that its code point is less than 128. 163 * 164 * @since 19.0 (since 1.0 as constant {@code ASCII}) 165 */ 166 public static CharMatcher ascii() { 167 return Ascii.INSTANCE; 168 } 169 170 /** 171 * Determines whether a character is a BMP digit according to <a 172 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 173 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 174 * 175 * @deprecated Many digits are supplementary characters; see the class documentation. 
176 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 177 */ 178 @Deprecated 179 public static CharMatcher digit() { 180 return Digit.INSTANCE; 181 } 182 183 /** 184 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 185 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 186 * '9')}. 187 * 188 * @deprecated Many digits are supplementary characters; see the class documentation. 189 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 190 */ 191 @Deprecated 192 public static CharMatcher javaDigit() { 193 return JavaDigit.INSTANCE; 194 } 195 196 /** 197 * Determines whether a character is a BMP letter according to {@linkplain 198 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 199 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 200 * 201 * @deprecated Most letters are supplementary characters; see the class documentation. 202 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 203 */ 204 @Deprecated 205 public static CharMatcher javaLetter() { 206 return JavaLetter.INSTANCE; 207 } 208 209 /** 210 * Determines whether a character is a BMP letter or digit according to {@linkplain 211 * Character#isLetterOrDigit(char) Java's definition}. 212 * 213 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 214 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 215 */ 216 @Deprecated 217 public static CharMatcher javaLetterOrDigit() { 218 return JavaLetterOrDigit.INSTANCE; 219 } 220 221 /** 222 * Determines whether a BMP character is upper case according to {@linkplain 223 * Character#isUpperCase(char) Java's definition}. 224 * 225 * @deprecated Some uppercase characters are supplementary characters; see the class 226 * documentation. 
227 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 228 */ 229 @Deprecated 230 public static CharMatcher javaUpperCase() { 231 return JavaUpperCase.INSTANCE; 232 } 233 234 /** 235 * Determines whether a BMP character is lower case according to {@linkplain 236 * Character#isLowerCase(char) Java's definition}. 237 * 238 * @deprecated Some lowercase characters are supplementary characters; see the class 239 * documentation. 240 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 241 */ 242 @Deprecated 243 public static CharMatcher javaLowerCase() { 244 return JavaLowerCase.INSTANCE; 245 } 246 247 /** 248 * Determines whether a character is an ISO control character as specified by {@link 249 * Character#isISOControl(char)}. 250 * 251 * <p>All ISO control codes are on the BMP and thus supported by this API. 252 * 253 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 254 */ 255 public static CharMatcher javaIsoControl() { 256 return JavaIsoControl.INSTANCE; 257 } 258 259 /** 260 * Determines whether a character is invisible; that is, if its Unicode category is any of 261 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 262 * PRIVATE_USE according to ICU4J. 263 * 264 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 265 * 266 * @deprecated Most invisible characters are supplementary characters; see the class 267 * documentation. 268 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 269 */ 270 @Deprecated 271 public static CharMatcher invisible() { 272 return Invisible.INSTANCE; 273 } 274 275 /** 276 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 277 * errs on the side of returning {@code false} (that is, it tends to assume a character is 278 * double-width). 279 * 280 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 281 * date. 
282 * 283 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 284 * 285 * @deprecated Many such characters are supplementary characters; see the class documentation. 286 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 287 */ 288 @Deprecated 289 public static CharMatcher singleWidth() { 290 return SingleWidth.INSTANCE; 291 } 292 293 // Static factories 294 295 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 296 public static CharMatcher is(final char match) { 297 return new Is(match); 298 } 299 300 /** 301 * Returns a {@code char} matcher that matches any character except the BMP character specified. 302 * 303 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 304 */ 305 public static CharMatcher isNot(final char match) { 306 return new IsNot(match); 307 } 308 309 /** 310 * Returns a {@code char} matcher that matches any BMP character present in the given character 311 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 312 */ 313 public static CharMatcher anyOf(final CharSequence sequence) { 314 switch (sequence.length()) { 315 case 0: 316 return none(); 317 case 1: 318 return is(sequence.charAt(0)); 319 case 2: 320 return isEither(sequence.charAt(0), sequence.charAt(1)); 321 default: 322 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 323 // matcher? 324 return new AnyOf(sequence); 325 } 326 } 327 328 /** 329 * Returns a {@code char} matcher that matches any BMP character not present in the given 330 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 331 */ 332 public static CharMatcher noneOf(CharSequence sequence) { 333 return anyOf(sequence).negate(); 334 } 335 336 /** 337 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 338 * are inclusive). 
For example, to match any lowercase letter of the English alphabet, use {@code 339 * CharMatcher.inRange('a', 'z')}. 340 * 341 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 342 */ 343 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 344 return new InRange(startInclusive, endInclusive); 345 } 346 347 /** 348 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 349 * which operates on primitive {@code char} instances instead. 350 */ 351 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 352 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 353 } 354 355 // Constructors 356 357 /** 358 * Constructor for use by subclasses. When subclassing, you may want to override {@code 359 * toString()} to provide a useful description. 360 */ 361 protected CharMatcher() {} 362 363 // Abstract methods 364 365 /** Determines a true or false value for the given character. */ 366 public abstract boolean matches(char c); 367 368 // Non-static factories 369 370 /** Returns a matcher that matches any character not matched by this matcher. */ 371 // @Override under Java 8 but not under Java 7 372 public CharMatcher negate() { 373 return new Negated(this); 374 } 375 376 /** 377 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 378 */ 379 public CharMatcher and(CharMatcher other) { 380 return new And(this, other); 381 } 382 383 /** 384 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 385 */ 386 public CharMatcher or(CharMatcher other) { 387 return new Or(this, other); 388 } 389 390 /** 391 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 392 * query than the original; your mileage may vary. 
Precomputation takes time and is likely to be 393 * worthwhile only if the precomputed matcher is queried many thousands of times. 394 * 395 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 396 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 397 * worthwhile tradeoff in a browser. 398 */ 399 public CharMatcher precomputed() { 400 return Platform.precomputeCharMatcher(this); 401 } 402 403 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 404 405 /** 406 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 407 * on {@link Platform} so that we can have different behavior in GWT. 408 * 409 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 410 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 411 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 412 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 413 * matcher which is faster to query than the original. 414 */ 415 @J2ktIncompatible 416 @GwtIncompatible // SmallCharMatcher 417 CharMatcher precomputedInternal() { 418 final BitSet table = new BitSet(); 419 setBits(table); 420 int totalCharacters = table.cardinality(); 421 if (totalCharacters * 2 <= DISTINCT_CHARS) { 422 return precomputedPositive(totalCharacters, table, toString()); 423 } else { 424 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 425 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 426 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 427 String suffix = ".negate()"; 428 final String description = toString(); 429 String negatedDescription = 430 description.endsWith(suffix) 431 ? 
description.substring(0, description.length() - suffix.length()) 432 : description + suffix; 433 return new NegatedFastMatcher( 434 precomputedPositive(negatedCharacters, table, negatedDescription)) { 435 @Override 436 public String toString() { 437 return description; 438 } 439 }; 440 } 441 } 442 443 /** 444 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 445 */ 446 @J2ktIncompatible 447 @GwtIncompatible // SmallCharMatcher 448 private static CharMatcher precomputedPositive( 449 int totalCharacters, BitSet table, String description) { 450 switch (totalCharacters) { 451 case 0: 452 return none(); 453 case 1: 454 return is((char) table.nextSetBit(0)); 455 case 2: 456 char c1 = (char) table.nextSetBit(0); 457 char c2 = (char) table.nextSetBit(c1 + 1); 458 return isEither(c1, c2); 459 default: 460 return isSmall(totalCharacters, table.length()) 461 ? SmallCharMatcher.from(table, description) 462 : new BitSetMatcher(table, description); 463 } 464 } 465 466 @J2ktIncompatible 467 @GwtIncompatible // SmallCharMatcher 468 private static boolean isSmall(int totalCharacters, int tableLength) { 469 return totalCharacters <= SmallCharMatcher.MAX_SIZE 470 && tableLength > (totalCharacters * 4 * Character.SIZE); 471 // err on the side of BitSetMatcher 472 } 473 474 /** Sets bits in {@code table} matched by this matcher. */ 475 @J2ktIncompatible 476 @GwtIncompatible // used only from other GwtIncompatible code 477 void setBits(BitSet table) { 478 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 479 if (matches((char) c)) { 480 table.set(c); 481 } 482 } 483 } 484 485 // Text processing routines 486 487 /** 488 * Returns {@code true} if a character sequence contains at least one matching BMP character. 489 * Equivalent to {@code !matchesNoneOf(sequence)}. 
490 * 491 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 492 * character, until this returns {@code true} or the end is reached. 493 * 494 * @param sequence the character sequence to examine, possibly empty 495 * @return {@code true} if this matcher matches at least one character in the sequence 496 * @since 8.0 497 */ 498 public boolean matchesAnyOf(CharSequence sequence) { 499 return !matchesNoneOf(sequence); 500 } 501 502 /** 503 * Returns {@code true} if a character sequence contains only matching BMP characters. 504 * 505 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 506 * character, until this returns {@code false} or the end is reached. 507 * 508 * @param sequence the character sequence to examine, possibly empty 509 * @return {@code true} if this matcher matches every character in the sequence, including when 510 * the sequence is empty 511 */ 512 public boolean matchesAllOf(CharSequence sequence) { 513 for (int i = sequence.length() - 1; i >= 0; i--) { 514 if (!matches(sequence.charAt(i))) { 515 return false; 516 } 517 } 518 return true; 519 } 520 521 /** 522 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 523 * {@code !matchesAnyOf(sequence)}. 524 * 525 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 526 * character, until this returns {@code true} or the end is reached. 527 * 528 * @param sequence the character sequence to examine, possibly empty 529 * @return {@code true} if this matcher matches no characters in the sequence, including when the 530 * sequence is empty 531 */ 532 public boolean matchesNoneOf(CharSequence sequence) { 533 return indexIn(sequence) == -1; 534 } 535 536 /** 537 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 538 * no matching character is present. 
539 * 540 * <p>The default implementation iterates over the sequence in forward order calling {@link 541 * #matches} for each character. 542 * 543 * @param sequence the character sequence to examine from the beginning 544 * @return an index, or {@code -1} if no character matches 545 */ 546 public int indexIn(CharSequence sequence) { 547 return indexIn(sequence, 0); 548 } 549 550 /** 551 * Returns the index of the first matching BMP character in a character sequence, starting from a 552 * given position, or {@code -1} if no character matches after that position. 553 * 554 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 555 * start}, calling {@link #matches} for each character. 556 * 557 * @param sequence the character sequence to examine 558 * @param start the first index to examine; must be nonnegative and no greater than {@code 559 * sequence.length()} 560 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 561 * or {@code -1} if no character matches 562 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 563 * sequence.length()} 564 */ 565 public int indexIn(CharSequence sequence, int start) { 566 int length = sequence.length(); 567 checkPositionIndex(start, length); 568 for (int i = start; i < length; i++) { 569 if (matches(sequence.charAt(i))) { 570 return i; 571 } 572 } 573 return -1; 574 } 575 576 /** 577 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 578 * no matching character is present. 579 * 580 * <p>The default implementation iterates over the sequence in reverse order calling {@link 581 * #matches} for each character. 
582 * 583 * @param sequence the character sequence to examine from the end 584 * @return an index, or {@code -1} if no character matches 585 */ 586 public int lastIndexIn(CharSequence sequence) { 587 for (int i = sequence.length() - 1; i >= 0; i--) { 588 if (matches(sequence.charAt(i))) { 589 return i; 590 } 591 } 592 return -1; 593 } 594 595 /** 596 * Returns the number of matching {@code char}s found in a character sequence. 597 * 598 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 599 */ 600 public int countIn(CharSequence sequence) { 601 int count = 0; 602 for (int i = 0; i < sequence.length(); i++) { 603 if (matches(sequence.charAt(i))) { 604 count++; 605 } 606 } 607 return count; 608 } 609 610 /** 611 * Returns a string containing all non-matching characters of a character sequence, in order. For 612 * example: 613 * 614 * <pre>{@code 615 * CharMatcher.is('a').removeFrom("bazaar") 616 * }</pre> 617 * 618 * ... returns {@code "bzr"}. 619 */ 620 public String removeFrom(CharSequence sequence) { 621 String string = sequence.toString(); 622 int pos = indexIn(string); 623 if (pos == -1) { 624 return string; 625 } 626 627 char[] chars = string.toCharArray(); 628 int spread = 1; 629 630 // This unusual loop comes from extensive benchmarking 631 OUT: 632 while (true) { 633 pos++; 634 while (true) { 635 if (pos == chars.length) { 636 break OUT; 637 } 638 if (matches(chars[pos])) { 639 break; 640 } 641 chars[pos - spread] = chars[pos]; 642 pos++; 643 } 644 spread++; 645 } 646 return new String(chars, 0, pos - spread); 647 } 648 649 /** 650 * Returns a string containing all matching BMP characters of a character sequence, in order. For 651 * example: 652 * 653 * <pre>{@code 654 * CharMatcher.is('a').retainFrom("bazaar") 655 * }</pre> 656 * 657 * ... returns {@code "aaa"}. 
658 */ 659 public String retainFrom(CharSequence sequence) { 660 return negate().removeFrom(sequence); 661 } 662 663 /** 664 * Returns a string copy of the input character sequence, with each matching BMP character 665 * replaced by a given replacement character. For example: 666 * 667 * <pre>{@code 668 * CharMatcher.is('a').replaceFrom("radar", 'o') 669 * }</pre> 670 * 671 * ... returns {@code "rodor"}. 672 * 673 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 674 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 675 * character. 676 * 677 * @param sequence the character sequence to replace matching characters in 678 * @param replacement the character to append to the result string in place of each matching 679 * character in {@code sequence} 680 * @return the new string 681 */ 682 public String replaceFrom(CharSequence sequence, char replacement) { 683 String string = sequence.toString(); 684 int pos = indexIn(string); 685 if (pos == -1) { 686 return string; 687 } 688 char[] chars = string.toCharArray(); 689 chars[pos] = replacement; 690 for (int i = pos + 1; i < chars.length; i++) { 691 if (matches(chars[i])) { 692 chars[i] = replacement; 693 } 694 } 695 return new String(chars); 696 } 697 698 /** 699 * Returns a string copy of the input character sequence, with each matching BMP character 700 * replaced by a given replacement sequence. For example: 701 * 702 * <pre>{@code 703 * CharMatcher.is('a').replaceFrom("yaha", "oo") 704 * }</pre> 705 * 706 * ... returns {@code "yoohoo"}. 707 * 708 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 709 * off calling {@link #replaceFrom(CharSequence, char)} directly. 
710 * 711 * @param sequence the character sequence to replace matching characters in 712 * @param replacement the characters to append to the result string in place of each matching 713 * character in {@code sequence} 714 * @return the new string 715 */ 716 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 717 int replacementLen = replacement.length(); 718 if (replacementLen == 0) { 719 return removeFrom(sequence); 720 } 721 if (replacementLen == 1) { 722 return replaceFrom(sequence, replacement.charAt(0)); 723 } 724 725 String string = sequence.toString(); 726 int pos = indexIn(string); 727 if (pos == -1) { 728 return string; 729 } 730 731 int len = string.length(); 732 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 733 734 int oldpos = 0; 735 do { 736 buf.append(string, oldpos, pos); 737 buf.append(replacement); 738 oldpos = pos + 1; 739 pos = indexIn(string, oldpos); 740 } while (pos != -1); 741 742 buf.append(string, oldpos, len); 743 return buf.toString(); 744 } 745 746 /** 747 * Returns a substring of the input character sequence that omits all matching BMP characters from 748 * the beginning and from the end of the string. For example: 749 * 750 * <pre>{@code 751 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 752 * }</pre> 753 * 754 * ... returns {@code "cat"}. 755 * 756 * <p>Note that: 757 * 758 * <pre>{@code 759 * CharMatcher.inRange('\0', ' ').trimFrom(str) 760 * }</pre> 761 * 762 * ... is equivalent to {@link String#trim()}. 
763 */ 764 public String trimFrom(CharSequence sequence) { 765 int len = sequence.length(); 766 int first; 767 int last; 768 769 for (first = 0; first < len; first++) { 770 if (!matches(sequence.charAt(first))) { 771 break; 772 } 773 } 774 for (last = len - 1; last > first; last--) { 775 if (!matches(sequence.charAt(last))) { 776 break; 777 } 778 } 779 780 return sequence.subSequence(first, last + 1).toString(); 781 } 782 783 /** 784 * Returns a substring of the input character sequence that omits all matching BMP characters from 785 * the beginning of the string. For example: 786 * 787 * <pre>{@code 788 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 789 * }</pre> 790 * 791 * ... returns {@code "catbab"}. 792 */ 793 public String trimLeadingFrom(CharSequence sequence) { 794 int len = sequence.length(); 795 for (int first = 0; first < len; first++) { 796 if (!matches(sequence.charAt(first))) { 797 return sequence.subSequence(first, len).toString(); 798 } 799 } 800 return ""; 801 } 802 803 /** 804 * Returns a substring of the input character sequence that omits all matching BMP characters from 805 * the end of the string. For example: 806 * 807 * <pre>{@code 808 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 809 * }</pre> 810 * 811 * ... returns {@code "abacat"}. 812 */ 813 public String trimTrailingFrom(CharSequence sequence) { 814 int len = sequence.length(); 815 for (int last = len - 1; last >= 0; last--) { 816 if (!matches(sequence.charAt(last))) { 817 return sequence.subSequence(0, last + 1).toString(); 818 } 819 } 820 return ""; 821 } 822 823 /** 824 * Returns a string copy of the input character sequence, with each group of consecutive matching 825 * BMP characters replaced by a single replacement character. For example: 826 * 827 * <pre>{@code 828 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 829 * }</pre> 830 * 831 * ... returns {@code "b-p-r"}. 
832 * 833 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 834 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 835 * character. 836 * 837 * @param sequence the character sequence to replace matching groups of characters in 838 * @param replacement the character to append to the result string in place of each group of 839 * matching characters in {@code sequence} 840 * @return the new string 841 */ 842 public String collapseFrom(CharSequence sequence, char replacement) { 843 // This implementation avoids unnecessary allocation. 844 int len = sequence.length(); 845 for (int i = 0; i < len; i++) { 846 char c = sequence.charAt(i); 847 if (matches(c)) { 848 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 849 // a no-op replacement 850 i++; 851 } else { 852 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 853 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 854 } 855 } 856 } 857 // no replacement needed 858 return sequence.toString(); 859 } 860 861 /** 862 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 863 * groups of matching BMP characters at the start or end of the sequence are removed without 864 * replacement. 865 */ 866 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 867 // This implementation avoids unnecessary allocation. 868 int len = sequence.length(); 869 int first = 0; 870 int last = len - 1; 871 872 while (first < len && matches(sequence.charAt(first))) { 873 first++; 874 } 875 876 while (last > first && matches(sequence.charAt(last))) { 877 last--; 878 } 879 880 return (first == 0 && last == len - 1) 881 ? 
collapseFrom(sequence, replacement) 882 : finishCollapseFrom( 883 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 884 } 885 886 private String finishCollapseFrom( 887 CharSequence sequence, 888 int start, 889 int end, 890 char replacement, 891 StringBuilder builder, 892 boolean inMatchingGroup) { 893 for (int i = start; i < end; i++) { 894 char c = sequence.charAt(i); 895 if (matches(c)) { 896 if (!inMatchingGroup) { 897 builder.append(replacement); 898 inMatchingGroup = true; 899 } 900 } else { 901 builder.append(c); 902 inMatchingGroup = false; 903 } 904 } 905 return builder.toString(); 906 } 907 908 /** 909 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 910 * instead. 911 */ 912 @Deprecated 913 @Override 914 public boolean apply(Character character) { 915 return matches(character); 916 } 917 918 /** 919 * Returns a string representation of this {@code CharMatcher}, such as {@code 920 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 921 */ 922 @Override 923 public String toString() { 924 return super.toString(); 925 } 926 927 /** 928 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 929 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 930 */ 931 private static String showCharacter(char c) { 932 String hex = "0123456789ABCDEF"; 933 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 934 for (int i = 0; i < 4; i++) { 935 tmp[5 - i] = hex.charAt(c & 0xF); 936 c = (char) (c >> 4); 937 } 938 return String.copyValueOf(tmp); 939 } 940 941 // Fast matchers 942 943 /** A matcher for which precomputation will not yield any significant benefit. 
*/ 944 abstract static class FastMatcher extends CharMatcher { 945 946 @Override 947 public final CharMatcher precomputed() { 948 return this; 949 } 950 951 @Override 952 public CharMatcher negate() { 953 return new NegatedFastMatcher(this); 954 } 955 } 956 957 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 958 abstract static class NamedFastMatcher extends FastMatcher { 959 960 private final String description; 961 962 NamedFastMatcher(String description) { 963 this.description = checkNotNull(description); 964 } 965 966 @Override 967 public final String toString() { 968 return description; 969 } 970 } 971 972 /** Negation of a {@link FastMatcher}. */ 973 private static class NegatedFastMatcher extends Negated { 974 975 NegatedFastMatcher(CharMatcher original) { 976 super(original); 977 } 978 979 @Override 980 public final CharMatcher precomputed() { 981 return this; 982 } 983 } 984 985 /** Fast matcher using a {@link BitSet} table of matching characters. */ 986 @J2ktIncompatible 987 @GwtIncompatible // used only from other GwtIncompatible code 988 private static final class BitSetMatcher extends NamedFastMatcher { 989 990 private final BitSet table; 991 992 private BitSetMatcher(BitSet table, String description) { 993 super(description); 994 if (table.length() + Long.SIZE < table.size()) { 995 table = (BitSet) table.clone(); 996 // If only we could actually call BitSet.trimToSize() ourselves... 997 } 998 this.table = table; 999 } 1000 1001 @Override 1002 public boolean matches(char c) { 1003 return table.get(c); 1004 } 1005 1006 @Override 1007 void setBits(BitSet bitSet) { 1008 bitSet.or(table); 1009 } 1010 } 1011 1012 // Static constant implementation classes 1013 1014 /** Implementation of {@link #any()}. 
*/ 1015 private static final class Any extends NamedFastMatcher { 1016 1017 static final CharMatcher INSTANCE = new Any(); 1018 1019 private Any() { 1020 super("CharMatcher.any()"); 1021 } 1022 1023 @Override 1024 public boolean matches(char c) { 1025 return true; 1026 } 1027 1028 @Override 1029 public int indexIn(CharSequence sequence) { 1030 return (sequence.length() == 0) ? -1 : 0; 1031 } 1032 1033 @Override 1034 public int indexIn(CharSequence sequence, int start) { 1035 int length = sequence.length(); 1036 checkPositionIndex(start, length); 1037 return (start == length) ? -1 : start; 1038 } 1039 1040 @Override 1041 public int lastIndexIn(CharSequence sequence) { 1042 return sequence.length() - 1; 1043 } 1044 1045 @Override 1046 public boolean matchesAllOf(CharSequence sequence) { 1047 checkNotNull(sequence); 1048 return true; 1049 } 1050 1051 @Override 1052 public boolean matchesNoneOf(CharSequence sequence) { 1053 return sequence.length() == 0; 1054 } 1055 1056 @Override 1057 public String removeFrom(CharSequence sequence) { 1058 checkNotNull(sequence); 1059 return ""; 1060 } 1061 1062 @Override 1063 public String replaceFrom(CharSequence sequence, char replacement) { 1064 char[] array = new char[sequence.length()]; 1065 Arrays.fill(array, replacement); 1066 return new String(array); 1067 } 1068 1069 @Override 1070 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1071 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1072 for (int i = 0; i < sequence.length(); i++) { 1073 result.append(replacement); 1074 } 1075 return result.toString(); 1076 } 1077 1078 @Override 1079 public String collapseFrom(CharSequence sequence, char replacement) { 1080 return (sequence.length() == 0) ? 
"" : String.valueOf(replacement); 1081 } 1082 1083 @Override 1084 public String trimFrom(CharSequence sequence) { 1085 checkNotNull(sequence); 1086 return ""; 1087 } 1088 1089 @Override 1090 public int countIn(CharSequence sequence) { 1091 return sequence.length(); 1092 } 1093 1094 @Override 1095 public CharMatcher and(CharMatcher other) { 1096 return checkNotNull(other); 1097 } 1098 1099 @Override 1100 public CharMatcher or(CharMatcher other) { 1101 checkNotNull(other); 1102 return this; 1103 } 1104 1105 @Override 1106 public CharMatcher negate() { 1107 return none(); 1108 } 1109 } 1110 1111 /** Implementation of {@link #none()}. */ 1112 private static final class None extends NamedFastMatcher { 1113 1114 static final CharMatcher INSTANCE = new None(); 1115 1116 private None() { 1117 super("CharMatcher.none()"); 1118 } 1119 1120 @Override 1121 public boolean matches(char c) { 1122 return false; 1123 } 1124 1125 @Override 1126 public int indexIn(CharSequence sequence) { 1127 checkNotNull(sequence); 1128 return -1; 1129 } 1130 1131 @Override 1132 public int indexIn(CharSequence sequence, int start) { 1133 int length = sequence.length(); 1134 checkPositionIndex(start, length); 1135 return -1; 1136 } 1137 1138 @Override 1139 public int lastIndexIn(CharSequence sequence) { 1140 checkNotNull(sequence); 1141 return -1; 1142 } 1143 1144 @Override 1145 public boolean matchesAllOf(CharSequence sequence) { 1146 return sequence.length() == 0; 1147 } 1148 1149 @Override 1150 public boolean matchesNoneOf(CharSequence sequence) { 1151 checkNotNull(sequence); 1152 return true; 1153 } 1154 1155 @Override 1156 public String removeFrom(CharSequence sequence) { 1157 return sequence.toString(); 1158 } 1159 1160 @Override 1161 public String replaceFrom(CharSequence sequence, char replacement) { 1162 return sequence.toString(); 1163 } 1164 1165 @Override 1166 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1167 checkNotNull(replacement); 1168 return 
sequence.toString(); 1169 } 1170 1171 @Override 1172 public String collapseFrom(CharSequence sequence, char replacement) { 1173 return sequence.toString(); 1174 } 1175 1176 @Override 1177 public String trimFrom(CharSequence sequence) { 1178 return sequence.toString(); 1179 } 1180 1181 @Override 1182 public String trimLeadingFrom(CharSequence sequence) { 1183 return sequence.toString(); 1184 } 1185 1186 @Override 1187 public String trimTrailingFrom(CharSequence sequence) { 1188 return sequence.toString(); 1189 } 1190 1191 @Override 1192 public int countIn(CharSequence sequence) { 1193 checkNotNull(sequence); 1194 return 0; 1195 } 1196 1197 @Override 1198 public CharMatcher and(CharMatcher other) { 1199 checkNotNull(other); 1200 return this; 1201 } 1202 1203 @Override 1204 public CharMatcher or(CharMatcher other) { 1205 return checkNotNull(other); 1206 } 1207 1208 @Override 1209 public CharMatcher negate() { 1210 return any(); 1211 } 1212 } 1213 1214 /** Implementation of {@link #whitespace()}. */ 1215 @VisibleForTesting 1216 static final class Whitespace extends NamedFastMatcher { 1217 1218 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1219 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1220 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1221 // other use-cases, the reduction in readability isn't worth it. 
1222 static final String TABLE = 1223 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1224 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1225 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1226 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1227 static final int MULTIPLIER = 1682554634; 1228 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1229 1230 static final CharMatcher INSTANCE = new Whitespace(); 1231 1232 Whitespace() { 1233 super("CharMatcher.whitespace()"); 1234 } 1235 1236 @Override 1237 public boolean matches(char c) { 1238 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1239 } 1240 1241 @J2ktIncompatible 1242 @GwtIncompatible // used only from other GwtIncompatible code 1243 @Override 1244 void setBits(BitSet table) { 1245 for (int i = 0; i < TABLE.length(); i++) { 1246 table.set(TABLE.charAt(i)); 1247 } 1248 } 1249 } 1250 1251 /** Implementation of {@link #breakingWhitespace()}. */ 1252 private static final class BreakingWhitespace extends CharMatcher { 1253 1254 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1255 1256 @Override 1257 public boolean matches(char c) { 1258 switch (c) { 1259 case '\t': 1260 case '\n': 1261 case '\013': 1262 case '\f': 1263 case '\r': 1264 case ' ': 1265 case '\u0085': 1266 case '\u1680': 1267 case '\u2028': 1268 case '\u2029': 1269 case '\u205f': 1270 case '\u3000': 1271 return true; 1272 case '\u2007': 1273 return false; 1274 default: 1275 return c >= '\u2000' && c <= '\u200a'; 1276 } 1277 } 1278 1279 @Override 1280 public String toString() { 1281 return "CharMatcher.breakingWhitespace()"; 1282 } 1283 } 1284 1285 /** Implementation of {@link #ascii()}. 
*/ 1286 private static final class Ascii extends NamedFastMatcher { 1287 1288 static final CharMatcher INSTANCE = new Ascii(); 1289 1290 Ascii() { 1291 super("CharMatcher.ascii()"); 1292 } 1293 1294 @Override 1295 public boolean matches(char c) { 1296 return c <= '\u007f'; 1297 } 1298 } 1299 1300 /** Implementation that matches characters that fall within multiple ranges. */ 1301 private static class RangesMatcher extends CharMatcher { 1302 1303 private final String description; 1304 private final char[] rangeStarts; 1305 private final char[] rangeEnds; 1306 1307 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1308 this.description = description; 1309 this.rangeStarts = rangeStarts; 1310 this.rangeEnds = rangeEnds; 1311 checkArgument(rangeStarts.length == rangeEnds.length); 1312 for (int i = 0; i < rangeStarts.length; i++) { 1313 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1314 if (i + 1 < rangeStarts.length) { 1315 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1316 } 1317 } 1318 } 1319 1320 @Override 1321 public boolean matches(char c) { 1322 int index = Arrays.binarySearch(rangeStarts, c); 1323 if (index >= 0) { 1324 return true; 1325 } else { 1326 index = ~index - 1; 1327 return index >= 0 && c <= rangeEnds[index]; 1328 } 1329 } 1330 1331 @Override 1332 public String toString() { 1333 return description; 1334 } 1335 } 1336 1337 /** Implementation of {@link #digit()}. */ 1338 private static final class Digit extends RangesMatcher { 1339 // Plug the following UnicodeSet pattern into 1340 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1341 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1342 // and get the zeroes from there. 1343 1344 // Must be in ascending order. 
1345 private static final String ZEROES = 1346 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1347 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1348 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1349 1350 private static char[] zeroes() { 1351 return ZEROES.toCharArray(); 1352 } 1353 1354 private static char[] nines() { 1355 char[] nines = new char[ZEROES.length()]; 1356 for (int i = 0; i < ZEROES.length(); i++) { 1357 nines[i] = (char) (ZEROES.charAt(i) + 9); 1358 } 1359 return nines; 1360 } 1361 1362 static final CharMatcher INSTANCE = new Digit(); 1363 1364 private Digit() { 1365 super("CharMatcher.digit()", zeroes(), nines()); 1366 } 1367 } 1368 1369 /** Implementation of {@link #javaDigit()}. */ 1370 private static final class JavaDigit extends CharMatcher { 1371 1372 static final CharMatcher INSTANCE = new JavaDigit(); 1373 1374 @Override 1375 public boolean matches(char c) { 1376 return Character.isDigit(c); 1377 } 1378 1379 @Override 1380 public String toString() { 1381 return "CharMatcher.javaDigit()"; 1382 } 1383 } 1384 1385 /** Implementation of {@link #javaLetter()}. */ 1386 private static final class JavaLetter extends CharMatcher { 1387 1388 static final CharMatcher INSTANCE = new JavaLetter(); 1389 1390 @Override 1391 public boolean matches(char c) { 1392 return Character.isLetter(c); 1393 } 1394 1395 @Override 1396 public String toString() { 1397 return "CharMatcher.javaLetter()"; 1398 } 1399 } 1400 1401 /** Implementation of {@link #javaLetterOrDigit()}. 
*/ 1402 private static final class JavaLetterOrDigit extends CharMatcher { 1403 1404 static final CharMatcher INSTANCE = new JavaLetterOrDigit(); 1405 1406 @Override 1407 public boolean matches(char c) { 1408 return Character.isLetterOrDigit(c); 1409 } 1410 1411 @Override 1412 public String toString() { 1413 return "CharMatcher.javaLetterOrDigit()"; 1414 } 1415 } 1416 1417 /** Implementation of {@link #javaUpperCase()}. */ 1418 private static final class JavaUpperCase extends CharMatcher { 1419 1420 static final CharMatcher INSTANCE = new JavaUpperCase(); 1421 1422 @Override 1423 public boolean matches(char c) { 1424 return Character.isUpperCase(c); 1425 } 1426 1427 @Override 1428 public String toString() { 1429 return "CharMatcher.javaUpperCase()"; 1430 } 1431 } 1432 1433 /** Implementation of {@link #javaLowerCase()}. */ 1434 private static final class JavaLowerCase extends CharMatcher { 1435 1436 static final CharMatcher INSTANCE = new JavaLowerCase(); 1437 1438 @Override 1439 public boolean matches(char c) { 1440 return Character.isLowerCase(c); 1441 } 1442 1443 @Override 1444 public String toString() { 1445 return "CharMatcher.javaLowerCase()"; 1446 } 1447 } 1448 1449 /** Implementation of {@link #javaIsoControl()}. */ 1450 private static final class JavaIsoControl extends NamedFastMatcher { 1451 1452 static final CharMatcher INSTANCE = new JavaIsoControl(); 1453 1454 private JavaIsoControl() { 1455 super("CharMatcher.javaIsoControl()"); 1456 } 1457 1458 @Override 1459 public boolean matches(char c) { 1460 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1461 } 1462 } 1463 1464 /** Implementation of {@link #invisible()}. */ 1465 private static final class Invisible extends RangesMatcher { 1466 // Plug the following UnicodeSet pattern into 1467 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1468 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1469 // with the "Abbreviate" option, and get the ranges from there. 
1470 private static final String RANGE_STARTS = 1471 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1472 + "\u3000\ud800\ufeff\ufff9"; 1473 private static final String RANGE_ENDS = // inclusive ends 1474 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1475 + "\u3000\uf8ff\ufeff\ufffb"; 1476 1477 static final CharMatcher INSTANCE = new Invisible(); 1478 1479 private Invisible() { 1480 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1481 } 1482 } 1483 1484 /** Implementation of {@link #singleWidth()}. */ 1485 private static final class SingleWidth extends RangesMatcher { 1486 1487 static final CharMatcher INSTANCE = new SingleWidth(); 1488 1489 private SingleWidth() { 1490 super( 1491 "CharMatcher.singleWidth()", 1492 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1493 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1494 } 1495 } 1496 1497 // Non-static factory implementation classes 1498 1499 /** Implementation of {@link #negate()}. 
*/ 1500 private static class Negated extends CharMatcher { 1501 1502 final CharMatcher original; 1503 1504 Negated(CharMatcher original) { 1505 this.original = checkNotNull(original); 1506 } 1507 1508 @Override 1509 public boolean matches(char c) { 1510 return !original.matches(c); 1511 } 1512 1513 @Override 1514 public boolean matchesAllOf(CharSequence sequence) { 1515 return original.matchesNoneOf(sequence); 1516 } 1517 1518 @Override 1519 public boolean matchesNoneOf(CharSequence sequence) { 1520 return original.matchesAllOf(sequence); 1521 } 1522 1523 @Override 1524 public int countIn(CharSequence sequence) { 1525 return sequence.length() - original.countIn(sequence); 1526 } 1527 1528 @J2ktIncompatible 1529 @GwtIncompatible // used only from other GwtIncompatible code 1530 @Override 1531 void setBits(BitSet table) { 1532 BitSet tmp = new BitSet(); 1533 original.setBits(tmp); 1534 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1535 table.or(tmp); 1536 } 1537 1538 @Override 1539 public CharMatcher negate() { 1540 return original; 1541 } 1542 1543 @Override 1544 public String toString() { 1545 return original + ".negate()"; 1546 } 1547 } 1548 1549 /** Implementation of {@link #and(CharMatcher)}. 
*/ 1550 private static final class And extends CharMatcher { 1551 1552 final CharMatcher first; 1553 final CharMatcher second; 1554 1555 And(CharMatcher a, CharMatcher b) { 1556 first = checkNotNull(a); 1557 second = checkNotNull(b); 1558 } 1559 1560 @Override 1561 public boolean matches(char c) { 1562 return first.matches(c) && second.matches(c); 1563 } 1564 1565 @J2ktIncompatible 1566 @GwtIncompatible // used only from other GwtIncompatible code 1567 @Override 1568 void setBits(BitSet table) { 1569 BitSet tmp1 = new BitSet(); 1570 first.setBits(tmp1); 1571 BitSet tmp2 = new BitSet(); 1572 second.setBits(tmp2); 1573 tmp1.and(tmp2); 1574 table.or(tmp1); 1575 } 1576 1577 @Override 1578 public String toString() { 1579 return "CharMatcher.and(" + first + ", " + second + ")"; 1580 } 1581 } 1582 1583 /** Implementation of {@link #or(CharMatcher)}. */ 1584 private static final class Or extends CharMatcher { 1585 1586 final CharMatcher first; 1587 final CharMatcher second; 1588 1589 Or(CharMatcher a, CharMatcher b) { 1590 first = checkNotNull(a); 1591 second = checkNotNull(b); 1592 } 1593 1594 @J2ktIncompatible 1595 @GwtIncompatible // used only from other GwtIncompatible code 1596 @Override 1597 void setBits(BitSet table) { 1598 first.setBits(table); 1599 second.setBits(table); 1600 } 1601 1602 @Override 1603 public boolean matches(char c) { 1604 return first.matches(c) || second.matches(c); 1605 } 1606 1607 @Override 1608 public String toString() { 1609 return "CharMatcher.or(" + first + ", " + second + ")"; 1610 } 1611 } 1612 1613 // Static factory implementations 1614 1615 /** Implementation of {@link #is(char)}. 
*/ 1616 private static final class Is extends FastMatcher { 1617 1618 private final char match; 1619 1620 Is(char match) { 1621 this.match = match; 1622 } 1623 1624 @Override 1625 public boolean matches(char c) { 1626 return c == match; 1627 } 1628 1629 @Override 1630 public String replaceFrom(CharSequence sequence, char replacement) { 1631 return sequence.toString().replace(match, replacement); 1632 } 1633 1634 @Override 1635 public CharMatcher and(CharMatcher other) { 1636 return other.matches(match) ? this : none(); 1637 } 1638 1639 @Override 1640 public CharMatcher or(CharMatcher other) { 1641 return other.matches(match) ? other : super.or(other); 1642 } 1643 1644 @Override 1645 public CharMatcher negate() { 1646 return isNot(match); 1647 } 1648 1649 @J2ktIncompatible 1650 @GwtIncompatible // used only from other GwtIncompatible code 1651 @Override 1652 void setBits(BitSet table) { 1653 table.set(match); 1654 } 1655 1656 @Override 1657 public String toString() { 1658 return "CharMatcher.is('" + showCharacter(match) + "')"; 1659 } 1660 } 1661 1662 /** Implementation of {@link #isNot(char)}. */ 1663 private static final class IsNot extends FastMatcher { 1664 1665 private final char match; 1666 1667 IsNot(char match) { 1668 this.match = match; 1669 } 1670 1671 @Override 1672 public boolean matches(char c) { 1673 return c != match; 1674 } 1675 1676 @Override 1677 public CharMatcher and(CharMatcher other) { 1678 return other.matches(match) ? super.and(other) : other; 1679 } 1680 1681 @Override 1682 public CharMatcher or(CharMatcher other) { 1683 return other.matches(match) ? 
any() : this; 1684 } 1685 1686 @J2ktIncompatible 1687 @GwtIncompatible // used only from other GwtIncompatible code 1688 @Override 1689 void setBits(BitSet table) { 1690 table.set(0, match); 1691 table.set(match + 1, Character.MAX_VALUE + 1); 1692 } 1693 1694 @Override 1695 public CharMatcher negate() { 1696 return is(match); 1697 } 1698 1699 @Override 1700 public String toString() { 1701 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1702 } 1703 } 1704 1705 private static CharMatcher.IsEither isEither(char c1, char c2) { 1706 return new CharMatcher.IsEither(c1, c2); 1707 } 1708 1709 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1710 private static final class IsEither extends FastMatcher { 1711 1712 private final char match1; 1713 private final char match2; 1714 1715 IsEither(char match1, char match2) { 1716 this.match1 = match1; 1717 this.match2 = match2; 1718 } 1719 1720 @Override 1721 public boolean matches(char c) { 1722 return c == match1 || c == match2; 1723 } 1724 1725 @J2ktIncompatible 1726 @GwtIncompatible // used only from other GwtIncompatible code 1727 @Override 1728 void setBits(BitSet table) { 1729 table.set(match1); 1730 table.set(match2); 1731 } 1732 1733 @Override 1734 public String toString() { 1735 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1736 } 1737 } 1738 1739 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. 
*/
  private static final class AnyOf extends CharMatcher {

    private final char[] chars;

    /** Stores a sorted copy of the given characters so that lookups can binary-search it. */
    public AnyOf(CharSequence chars) {
      char[] sorted = chars.toString().toCharArray();
      Arrays.sort(sorted);
      this.chars = sorted;
    }

    @Override
    public boolean matches(char c) {
      int position = Arrays.binarySearch(chars, c);
      return position >= 0;
    }

    @Override
    @J2ktIncompatible
    @GwtIncompatible // used only from other GwtIncompatible code
    void setBits(BitSet table) {
      for (char member : chars) {
        table.set(member);
      }
    }

    /** Lists the characters in sorted order, each as a Unicode escape. */
    @Override
    public String toString() {
      StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
      for (char member : chars) {
        description.append(showCharacter(member));
      }
      return description.append("\")").toString();
    }
  }

  /** Implementation of {@link #inRange(char, char)}. */
  private static final class InRange extends FastMatcher {

    private final char startInclusive;
    private final char endInclusive;

    /**
     * @throws IllegalArgumentException if {@code endInclusive} is less than {@code startInclusive}
     */
    InRange(char startInclusive, char endInclusive) {
      checkArgument(startInclusive <= endInclusive);
      this.startInclusive = startInclusive;
      this.endInclusive = endInclusive;
    }

    @Override
    public boolean matches(char c) {
      return c >= startInclusive && c <= endInclusive;
    }

    @J2ktIncompatible
    @GwtIncompatible // used only from other GwtIncompatible code
    @Override
    void setBits(BitSet table) {
      // BitSet ranges are half-open, hence the + 1 on the inclusive end.
      table.set(startInclusive, endInclusive + 1);
    }

    @Override
    public String toString() {
      String start = showCharacter(startInclusive);
      String end = showCharacter(endInclusive);
      return "CharMatcher.inRange('" + start + "', '" + end + "')";
    }
  }

  /** Implementation of {@link #forPredicate(Predicate)}. */
  private static final class ForPredicate extends CharMatcher {

    private final Predicate<? super Character> predicate;

    /**
     * @param predicate the predicate consulted for every character; must not be null
     */
    ForPredicate(Predicate<? super Character> predicate) {
      checkNotNull(predicate);
      this.predicate = predicate;
    }

    /** Boxes {@code c} and asks the wrapped predicate. */
    @Override
    public boolean matches(char c) {
      return predicate.apply(c);
    }

    /** Passes the boxed character straight to the wrapped predicate after a null check. */
    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
    @Override
    public boolean apply(Character character) {
      checkNotNull(character);
      return predicate.apply(character);
    }

    @Override
    public String toString() {
      return "CharMatcher.forPredicate(" + predicate + ")";
    }
  }
}