001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 059 * 060 * @author Kevin Bourrillion 061 * @since 1.0 062 */ 063@GwtCompatible(emulated = true) 064public abstract class CharMatcher implements Predicate<Character> { 065 /* 066 * N777777777NO 067 * N7777777777777N 068 * M777777777777777N 069 * $N877777777D77777M 070 * N M77777777ONND777M 071 * MN777777777NN D777 072 * N7ZN777777777NN ~M7778 073 * N777777777777MMNN88777N 074 * N777777777777MNZZZ7777O 075 * DZN7777O77777777777777 076 * N7OONND7777777D77777N 077 * 8$M++++?N???$77777$ 078 * M7++++N+M77777777N 079 * N77O777777777777$ M 080 * DNNM$$$$777777N D 081 * N$N:=N$777N7777M NZ 082 * 77Z::::N777777777 ODZZZ 083 * 77N::::::N77777777M NNZZZ$ 084 * $777:::::::77777777MN ZM8ZZZZZ 085 * 777M::::::Z7777777Z77 N++ZZZZNN 086 * 7777M:::::M7777777$777M $++IZZZZM 087 * M777$:::::N777777$M7777M +++++ZZZDN 088 * NN$::::::7777$$M777777N N+++ZZZZNZ 089 * N::::::N:7$O:77777777 N++++ZZZZN 090 * M::::::::::::N77777777+ +?+++++ZZZM 091 * 8::::::::::::D77777777M O+++++ZZ 092 * ::::::::::::M777777777N O+?D 093 * M:::::::::::M77777777778 77= 094 * D=::::::::::N7777777777N 777 095 * INN===::::::=77777777777N I777N 096 * ?777N========N7777777777787M N7777 097 * 77777$D======N77777777777N777N? N777777 098 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 099 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 100 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 101 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 102 * 7 :N MNN$$$$M$ $$$777$8 8D8I 103 * NMM.:7O 777777778 104 * 7777777MN 105 * M NO .7: 106 * M : M 107 * 8 108 */ 109 110 // Constant matcher factory methods 111 112 /** 113 * Matches any character. 114 * 115 * @since 19.0 (since 1.0 as constant {@code ANY}) 116 */ 117 public static CharMatcher any() { 118 return Any.INSTANCE; 119 } 120 121 /** 122 * Matches no characters. 123 * 124 * @since 19.0 (since 1.0 as constant {@code NONE}) 125 */ 126 public static CharMatcher none() { 127 return None.INSTANCE; 128 } 129 130 /** 131 * Determines whether a character is whitespace according to the latest Unicode standard, as 132 * illustrated <a 133 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 134 * This is not the same definition used by other Java APIs. (See a <a 135 * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.) 136 * 137 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 138 * 139 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 140 * date. 141 * 142 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 143 */ 144 public static CharMatcher whitespace() { 145 return Whitespace.INSTANCE; 146 } 147 148 /** 149 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 150 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 151 * discussion of that term. 152 * 153 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 154 */ 155 public static CharMatcher breakingWhitespace() { 156 return BreakingWhitespace.INSTANCE; 157 } 158 159 /** 160 * Determines whether a character is ASCII, meaning that its code point is less than 128. 161 * 162 * @since 19.0 (since 1.0 as constant {@code ASCII}) 163 */ 164 public static CharMatcher ascii() { 165 return Ascii.INSTANCE; 166 } 167 168 /** 169 * Determines whether a character is a BMP digit according to <a 170 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 171 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 172 * 173 * @deprecated Many digits are supplementary characters; see the class documentation. 174 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 175 */ 176 @Deprecated 177 public static CharMatcher digit() { 178 return Digit.INSTANCE; 179 } 180 181 /** 182 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 183 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 184 * '9')}. 185 * 186 * @deprecated Many digits are supplementary characters; see the class documentation. 187 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 188 */ 189 @Deprecated 190 public static CharMatcher javaDigit() { 191 return JavaDigit.INSTANCE; 192 } 193 194 /** 195 * Determines whether a character is a BMP letter according to {@linkplain 196 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 197 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 198 * 199 * @deprecated Most letters are supplementary characters; see the class documentation. 200 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 201 */ 202 @Deprecated 203 public static CharMatcher javaLetter() { 204 return JavaLetter.INSTANCE; 205 } 206 207 /** 208 * Determines whether a character is a BMP letter or digit according to {@linkplain 209 * Character#isLetterOrDigit(char) Java's definition}. 210 * 211 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 212 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 213 */ 214 @Deprecated 215 public static CharMatcher javaLetterOrDigit() { 216 return JavaLetterOrDigit.INSTANCE; 217 } 218 219 /** 220 * Determines whether a BMP character is upper case according to {@linkplain 221 * Character#isUpperCase(char) Java's definition}. 222 * 223 * @deprecated Some uppercase characters are supplementary characters; see the class 224 * documentation. 225 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 226 */ 227 @Deprecated 228 public static CharMatcher javaUpperCase() { 229 return JavaUpperCase.INSTANCE; 230 } 231 232 /** 233 * Determines whether a BMP character is lower case according to {@linkplain 234 * Character#isLowerCase(char) Java's definition}. 235 * 236 * @deprecated Some lowercase characters are supplementary characters; see the class 237 * documentation. 238 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 239 */ 240 @Deprecated 241 public static CharMatcher javaLowerCase() { 242 return JavaLowerCase.INSTANCE; 243 } 244 245 /** 246 * Determines whether a character is an ISO control character as specified by {@link 247 * Character#isISOControl(char)}. 248 * 249 * <p>All ISO control codes are on the BMP and thus supported by this API. 250 * 251 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 252 */ 253 public static CharMatcher javaIsoControl() { 254 return JavaIsoControl.INSTANCE; 255 } 256 257 /** 258 * Determines whether a character is invisible; that is, if its Unicode category is any of 259 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 260 * PRIVATE_USE according to ICU4J. 261 * 262 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 263 * 264 * @deprecated Most invisible characters are supplementary characters; see the class 265 * documentation. 266 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 267 */ 268 @Deprecated 269 public static CharMatcher invisible() { 270 return Invisible.INSTANCE; 271 } 272 273 /** 274 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 275 * errs on the side of returning {@code false} (that is, it tends to assume a character is 276 * double-width). 277 * 278 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 279 * date. 280 * 281 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 282 * 283 * @deprecated Many such characters are supplementary characters; see the class documentation. 284 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 285 */ 286 @Deprecated 287 public static CharMatcher singleWidth() { 288 return SingleWidth.INSTANCE; 289 } 290 291 // Static factories 292 293 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 294 public static CharMatcher is(final char match) { 295 return new Is(match); 296 } 297 298 /** 299 * Returns a {@code char} matcher that matches any character except the BMP character specified. 300 * 301 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 302 */ 303 public static CharMatcher isNot(final char match) { 304 return new IsNot(match); 305 } 306 307 /** 308 * Returns a {@code char} matcher that matches any BMP character present in the given character 309 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 310 */ 311 public static CharMatcher anyOf(final CharSequence sequence) { 312 switch (sequence.length()) { 313 case 0: 314 return none(); 315 case 1: 316 return is(sequence.charAt(0)); 317 case 2: 318 return isEither(sequence.charAt(0), sequence.charAt(1)); 319 default: 320 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 321 // matcher? 322 return new AnyOf(sequence); 323 } 324 } 325 326 /** 327 * Returns a {@code char} matcher that matches any BMP character not present in the given 328 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 329 */ 330 public static CharMatcher noneOf(CharSequence sequence) { 331 return anyOf(sequence).negate(); 332 } 333 334 /** 335 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 336 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 337 * CharMatcher.inRange('a', 'z')}. 338 * 339 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 340 */ 341 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 342 return new InRange(startInclusive, endInclusive); 343 } 344 345 /** 346 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 347 * which operates on primitive {@code char} instances instead. 348 */ 349 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 350 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 351 } 352 353 // Constructors 354 355 /** 356 * Constructor for use by subclasses. When subclassing, you may want to override {@code 357 * toString()} to provide a useful description. 358 */ 359 protected CharMatcher() {} 360 361 // Abstract methods 362 363 /** Determines a true or false value for the given character. */ 364 public abstract boolean matches(char c); 365 366 // Non-static factories 367 368 /** Returns a matcher that matches any character not matched by this matcher. */ 369 // @Override under Java 8 but not under Java 7 370 public CharMatcher negate() { 371 return new Negated(this); 372 } 373 374 /** 375 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 376 */ 377 public CharMatcher and(CharMatcher other) { 378 return new And(this, other); 379 } 380 381 /** 382 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 383 */ 384 public CharMatcher or(CharMatcher other) { 385 return new Or(this, other); 386 } 387 388 /** 389 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 390 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 391 * worthwhile only if the precomputed matcher is queried many thousands of times. 392 * 393 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 394 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 395 * worthwhile tradeoff in a browser. 396 */ 397 public CharMatcher precomputed() { 398 return Platform.precomputeCharMatcher(this); 399 } 400 401 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 402 403 /** 404 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 405 * on {@link Platform} so that we can have different behavior in GWT. 406 * 407 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 408 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 409 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 410 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 411 * matcher which is faster to query than the original. 412 */ 413 @GwtIncompatible // SmallCharMatcher 414 CharMatcher precomputedInternal() { 415 final BitSet table = new BitSet(); 416 setBits(table); 417 int totalCharacters = table.cardinality(); 418 if (totalCharacters * 2 <= DISTINCT_CHARS) { 419 return precomputedPositive(totalCharacters, table, toString()); 420 } else { 421 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 422 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 423 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 424 String suffix = ".negate()"; 425 final String description = toString(); 426 String negatedDescription = 427 description.endsWith(suffix) 428 ? description.substring(0, description.length() - suffix.length()) 429 : description + suffix; 430 return new NegatedFastMatcher( 431 precomputedPositive(negatedCharacters, table, negatedDescription)) { 432 @Override 433 public String toString() { 434 return description; 435 } 436 }; 437 } 438 } 439 440 /** 441 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 442 */ 443 @GwtIncompatible // SmallCharMatcher 444 private static CharMatcher precomputedPositive( 445 int totalCharacters, BitSet table, String description) { 446 switch (totalCharacters) { 447 case 0: 448 return none(); 449 case 1: 450 return is((char) table.nextSetBit(0)); 451 case 2: 452 char c1 = (char) table.nextSetBit(0); 453 char c2 = (char) table.nextSetBit(c1 + 1); 454 return isEither(c1, c2); 455 default: 456 return isSmall(totalCharacters, table.length()) 457 ? SmallCharMatcher.from(table, description) 458 : new BitSetMatcher(table, description); 459 } 460 } 461 462 @GwtIncompatible // SmallCharMatcher 463 private static boolean isSmall(int totalCharacters, int tableLength) { 464 return totalCharacters <= SmallCharMatcher.MAX_SIZE 465 && tableLength > (totalCharacters * 4 * Character.SIZE); 466 // err on the side of BitSetMatcher 467 } 468 469 /** Sets bits in {@code table} matched by this matcher. */ 470 @GwtIncompatible // used only from other GwtIncompatible code 471 void setBits(BitSet table) { 472 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 473 if (matches((char) c)) { 474 table.set(c); 475 } 476 } 477 } 478 479 // Text processing routines 480 481 /** 482 * Returns {@code true} if a character sequence contains at least one matching BMP character. 483 * Equivalent to {@code !matchesNoneOf(sequence)}. 484 * 485 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 486 * character, until this returns {@code true} or the end is reached. 487 * 488 * @param sequence the character sequence to examine, possibly empty 489 * @return {@code true} if this matcher matches at least one character in the sequence 490 * @since 8.0 491 */ 492 public boolean matchesAnyOf(CharSequence sequence) { 493 return !matchesNoneOf(sequence); 494 } 495 496 /** 497 * Returns {@code true} if a character sequence contains only matching BMP characters. 498 * 499 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 500 * character, until this returns {@code false} or the end is reached. 501 * 502 * @param sequence the character sequence to examine, possibly empty 503 * @return {@code true} if this matcher matches every character in the sequence, including when 504 * the sequence is empty 505 */ 506 public boolean matchesAllOf(CharSequence sequence) { 507 for (int i = sequence.length() - 1; i >= 0; i--) { 508 if (!matches(sequence.charAt(i))) { 509 return false; 510 } 511 } 512 return true; 513 } 514 515 /** 516 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 517 * {@code !matchesAnyOf(sequence)}. 518 * 519 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 520 * character, until this returns {@code true} or the end is reached. 521 * 522 * @param sequence the character sequence to examine, possibly empty 523 * @return {@code true} if this matcher matches no characters in the sequence, including when the 524 * sequence is empty 525 */ 526 public boolean matchesNoneOf(CharSequence sequence) { 527 return indexIn(sequence) == -1; 528 } 529 530 /** 531 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 532 * no matching character is present. 533 * 534 * <p>The default implementation iterates over the sequence in forward order calling {@link 535 * #matches} for each character. 536 * 537 * @param sequence the character sequence to examine from the beginning 538 * @return an index, or {@code -1} if no character matches 539 */ 540 public int indexIn(CharSequence sequence) { 541 return indexIn(sequence, 0); 542 } 543 544 /** 545 * Returns the index of the first matching BMP character in a character sequence, starting from a 546 * given position, or {@code -1} if no character matches after that position. 547 * 548 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 549 * start}, calling {@link #matches} for each character. 550 * 551 * @param sequence the character sequence to examine 552 * @param start the first index to examine; must be nonnegative and no greater than {@code 553 * sequence.length()} 554 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 555 * or {@code -1} if no character matches 556 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 557 * sequence.length()} 558 */ 559 public int indexIn(CharSequence sequence, int start) { 560 int length = sequence.length(); 561 checkPositionIndex(start, length); 562 for (int i = start; i < length; i++) { 563 if (matches(sequence.charAt(i))) { 564 return i; 565 } 566 } 567 return -1; 568 } 569 570 /** 571 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 572 * no matching character is present. 573 * 574 * <p>The default implementation iterates over the sequence in reverse order calling {@link 575 * #matches} for each character. 576 * 577 * @param sequence the character sequence to examine from the end 578 * @return an index, or {@code -1} if no character matches 579 */ 580 public int lastIndexIn(CharSequence sequence) { 581 for (int i = sequence.length() - 1; i >= 0; i--) { 582 if (matches(sequence.charAt(i))) { 583 return i; 584 } 585 } 586 return -1; 587 } 588 589 /** 590 * Returns the number of matching {@code char}s found in a character sequence. 591 * 592 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 593 */ 594 public int countIn(CharSequence sequence) { 595 int count = 0; 596 for (int i = 0; i < sequence.length(); i++) { 597 if (matches(sequence.charAt(i))) { 598 count++; 599 } 600 } 601 return count; 602 } 603 604 /** 605 * Returns a string containing all non-matching characters of a character sequence, in order. For 606 * example: 607 * 608 * <pre>{@code 609 * CharMatcher.is('a').removeFrom("bazaar") 610 * }</pre> 611 * 612 * ... returns {@code "bzr"}. 613 */ 614 public String removeFrom(CharSequence sequence) { 615 String string = sequence.toString(); 616 int pos = indexIn(string); 617 if (pos == -1) { 618 return string; 619 } 620 621 char[] chars = string.toCharArray(); 622 int spread = 1; 623 624 // This unusual loop comes from extensive benchmarking 625 OUT: 626 while (true) { 627 pos++; 628 while (true) { 629 if (pos == chars.length) { 630 break OUT; 631 } 632 if (matches(chars[pos])) { 633 break; 634 } 635 chars[pos - spread] = chars[pos]; 636 pos++; 637 } 638 spread++; 639 } 640 return new String(chars, 0, pos - spread); 641 } 642 643 /** 644 * Returns a string containing all matching BMP characters of a character sequence, in order. For 645 * example: 646 * 647 * <pre>{@code 648 * CharMatcher.is('a').retainFrom("bazaar") 649 * }</pre> 650 * 651 * ... returns {@code "aaa"}. 652 */ 653 public String retainFrom(CharSequence sequence) { 654 return negate().removeFrom(sequence); 655 } 656 657 /** 658 * Returns a string copy of the input character sequence, with each matching BMP character 659 * replaced by a given replacement character. For example: 660 * 661 * <pre>{@code 662 * CharMatcher.is('a').replaceFrom("radar", 'o') 663 * }</pre> 664 * 665 * ... returns {@code "rodor"}. 666 * 667 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 668 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 669 * character. 670 * 671 * @param sequence the character sequence to replace matching characters in 672 * @param replacement the character to append to the result string in place of each matching 673 * character in {@code sequence} 674 * @return the new string 675 */ 676 public String replaceFrom(CharSequence sequence, char replacement) { 677 String string = sequence.toString(); 678 int pos = indexIn(string); 679 if (pos == -1) { 680 return string; 681 } 682 char[] chars = string.toCharArray(); 683 chars[pos] = replacement; 684 for (int i = pos + 1; i < chars.length; i++) { 685 if (matches(chars[i])) { 686 chars[i] = replacement; 687 } 688 } 689 return new String(chars); 690 } 691 692 /** 693 * Returns a string copy of the input character sequence, with each matching BMP character 694 * replaced by a given replacement sequence. For example: 695 * 696 * <pre>{@code 697 * CharMatcher.is('a').replaceFrom("yaha", "oo") 698 * }</pre> 699 * 700 * ... returns {@code "yoohoo"}. 701 * 702 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 703 * off calling {@link #replaceFrom(CharSequence, char)} directly. 704 * 705 * @param sequence the character sequence to replace matching characters in 706 * @param replacement the characters to append to the result string in place of each matching 707 * character in {@code sequence} 708 * @return the new string 709 */ 710 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 711 int replacementLen = replacement.length(); 712 if (replacementLen == 0) { 713 return removeFrom(sequence); 714 } 715 if (replacementLen == 1) { 716 return replaceFrom(sequence, replacement.charAt(0)); 717 } 718 719 String string = sequence.toString(); 720 int pos = indexIn(string); 721 if (pos == -1) { 722 return string; 723 } 724 725 int len = string.length(); 726 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 727 728 int oldpos = 0; 729 do { 730 buf.append(string, oldpos, pos); 731 buf.append(replacement); 732 oldpos = pos + 1; 733 pos = indexIn(string, oldpos); 734 } while (pos != -1); 735 736 buf.append(string, oldpos, len); 737 return buf.toString(); 738 } 739 740 /** 741 * Returns a substring of the input character sequence that omits all matching BMP characters from 742 * the beginning and from the end of the string. For example: 743 * 744 * <pre>{@code 745 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 746 * }</pre> 747 * 748 * ... returns {@code "cat"}. 749 * 750 * <p>Note that: 751 * 752 * <pre>{@code 753 * CharMatcher.inRange('\0', ' ').trimFrom(str) 754 * }</pre> 755 * 756 * ... is equivalent to {@link String#trim()}. 757 */ 758 public String trimFrom(CharSequence sequence) { 759 int len = sequence.length(); 760 int first; 761 int last; 762 763 for (first = 0; first < len; first++) { 764 if (!matches(sequence.charAt(first))) { 765 break; 766 } 767 } 768 for (last = len - 1; last > first; last--) { 769 if (!matches(sequence.charAt(last))) { 770 break; 771 } 772 } 773 774 return sequence.subSequence(first, last + 1).toString(); 775 } 776 777 /** 778 * Returns a substring of the input character sequence that omits all matching BMP characters from 779 * the beginning of the string. For example: 780 * 781 * <pre>{@code 782 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 783 * }</pre> 784 * 785 * ... returns {@code "catbab"}. 786 */ 787 public String trimLeadingFrom(CharSequence sequence) { 788 int len = sequence.length(); 789 for (int first = 0; first < len; first++) { 790 if (!matches(sequence.charAt(first))) { 791 return sequence.subSequence(first, len).toString(); 792 } 793 } 794 return ""; 795 } 796 797 /** 798 * Returns a substring of the input character sequence that omits all matching BMP characters from 799 * the end of the string. For example: 800 * 801 * <pre>{@code 802 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 803 * }</pre> 804 * 805 * ... returns {@code "abacat"}. 806 */ 807 public String trimTrailingFrom(CharSequence sequence) { 808 int len = sequence.length(); 809 for (int last = len - 1; last >= 0; last--) { 810 if (!matches(sequence.charAt(last))) { 811 return sequence.subSequence(0, last + 1).toString(); 812 } 813 } 814 return ""; 815 } 816 817 /** 818 * Returns a string copy of the input character sequence, with each group of consecutive matching 819 * BMP characters replaced by a single replacement character. For example: 820 * 821 * <pre>{@code 822 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 823 * }</pre> 824 * 825 * ... returns {@code "b-p-r"}. 826 * 827 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 828 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 829 * character. 830 * 831 * @param sequence the character sequence to replace matching groups of characters in 832 * @param replacement the character to append to the result string in place of each group of 833 * matching characters in {@code sequence} 834 * @return the new string 835 */ 836 public String collapseFrom(CharSequence sequence, char replacement) { 837 // This implementation avoids unnecessary allocation. 838 int len = sequence.length(); 839 for (int i = 0; i < len; i++) { 840 char c = sequence.charAt(i); 841 if (matches(c)) { 842 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 843 // a no-op replacement 844 i++; 845 } else { 846 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 847 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 848 } 849 } 850 } 851 // no replacement needed 852 return sequence.toString(); 853 } 854 855 /** 856 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 857 * groups of matching BMP characters at the start or end of the sequence are removed without 858 * replacement. 859 */ 860 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 861 // This implementation avoids unnecessary allocation. 862 int len = sequence.length(); 863 int first = 0; 864 int last = len - 1; 865 866 while (first < len && matches(sequence.charAt(first))) { 867 first++; 868 } 869 870 while (last > first && matches(sequence.charAt(last))) { 871 last--; 872 } 873 874 return (first == 0 && last == len - 1) 875 ? collapseFrom(sequence, replacement) 876 : finishCollapseFrom( 877 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 878 } 879 880 private String finishCollapseFrom( 881 CharSequence sequence, 882 int start, 883 int end, 884 char replacement, 885 StringBuilder builder, 886 boolean inMatchingGroup) { 887 for (int i = start; i < end; i++) { 888 char c = sequence.charAt(i); 889 if (matches(c)) { 890 if (!inMatchingGroup) { 891 builder.append(replacement); 892 inMatchingGroup = true; 893 } 894 } else { 895 builder.append(c); 896 inMatchingGroup = false; 897 } 898 } 899 return builder.toString(); 900 } 901 902 /** 903 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 904 * instead. 905 */ 906 @Deprecated 907 @Override 908 public boolean apply(Character character) { 909 return matches(character); 910 } 911 912 /** 913 * Returns a string representation of this {@code CharMatcher}, such as {@code 914 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 915 */ 916 @Override 917 public String toString() { 918 return super.toString(); 919 } 920 921 /** 922 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 923 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 924 */ 925 private static String showCharacter(char c) { 926 String hex = "0123456789ABCDEF"; 927 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 928 for (int i = 0; i < 4; i++) { 929 tmp[5 - i] = hex.charAt(c & 0xF); 930 c = (char) (c >> 4); 931 } 932 return String.copyValueOf(tmp); 933 } 934 935 // Fast matchers 936 937 /** A matcher for which precomputation will not yield any significant benefit. */ 938 abstract static class FastMatcher extends CharMatcher { 939 940 @Override 941 public final CharMatcher precomputed() { 942 return this; 943 } 944 945 @Override 946 public CharMatcher negate() { 947 return new NegatedFastMatcher(this); 948 } 949 } 950 951 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 952 abstract static class NamedFastMatcher extends FastMatcher { 953 954 private final String description; 955 956 NamedFastMatcher(String description) { 957 this.description = checkNotNull(description); 958 } 959 960 @Override 961 public final String toString() { 962 return description; 963 } 964 } 965 966 /** Negation of a {@link FastMatcher}. */ 967 static class NegatedFastMatcher extends Negated { 968 969 NegatedFastMatcher(CharMatcher original) { 970 super(original); 971 } 972 973 @Override 974 public final CharMatcher precomputed() { 975 return this; 976 } 977 } 978 979 /** Fast matcher using a {@link BitSet} table of matching characters. */ 980 @GwtIncompatible // used only from other GwtIncompatible code 981 private static final class BitSetMatcher extends NamedFastMatcher { 982 983 private final BitSet table; 984 985 private BitSetMatcher(BitSet table, String description) { 986 super(description); 987 if (table.length() + Long.SIZE < table.size()) { 988 table = (BitSet) table.clone(); 989 // If only we could actually call BitSet.trimToSize() ourselves... 990 } 991 this.table = table; 992 } 993 994 @Override 995 public boolean matches(char c) { 996 return table.get(c); 997 } 998 999 @Override 1000 void setBits(BitSet bitSet) { 1001 bitSet.or(table); 1002 } 1003 } 1004 1005 // Static constant implementation classes 1006 1007 /** Implementation of {@link #any()}. */ 1008 private static final class Any extends NamedFastMatcher { 1009 1010 static final Any INSTANCE = new Any(); 1011 1012 private Any() { 1013 super("CharMatcher.any()"); 1014 } 1015 1016 @Override 1017 public boolean matches(char c) { 1018 return true; 1019 } 1020 1021 @Override 1022 public int indexIn(CharSequence sequence) { 1023 return (sequence.length() == 0) ? -1 : 0; 1024 } 1025 1026 @Override 1027 public int indexIn(CharSequence sequence, int start) { 1028 int length = sequence.length(); 1029 checkPositionIndex(start, length); 1030 return (start == length) ? -1 : start; 1031 } 1032 1033 @Override 1034 public int lastIndexIn(CharSequence sequence) { 1035 return sequence.length() - 1; 1036 } 1037 1038 @Override 1039 public boolean matchesAllOf(CharSequence sequence) { 1040 checkNotNull(sequence); 1041 return true; 1042 } 1043 1044 @Override 1045 public boolean matchesNoneOf(CharSequence sequence) { 1046 return sequence.length() == 0; 1047 } 1048 1049 @Override 1050 public String removeFrom(CharSequence sequence) { 1051 checkNotNull(sequence); 1052 return ""; 1053 } 1054 1055 @Override 1056 public String replaceFrom(CharSequence sequence, char replacement) { 1057 char[] array = new char[sequence.length()]; 1058 Arrays.fill(array, replacement); 1059 return new String(array); 1060 } 1061 1062 @Override 1063 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1064 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1065 for (int i = 0; i < sequence.length(); i++) { 1066 result.append(replacement); 1067 } 1068 return result.toString(); 1069 } 1070 1071 @Override 1072 public String collapseFrom(CharSequence sequence, char replacement) { 1073 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1074 } 1075 1076 @Override 1077 public String trimFrom(CharSequence sequence) { 1078 checkNotNull(sequence); 1079 return ""; 1080 } 1081 1082 @Override 1083 public int countIn(CharSequence sequence) { 1084 return sequence.length(); 1085 } 1086 1087 @Override 1088 public CharMatcher and(CharMatcher other) { 1089 return checkNotNull(other); 1090 } 1091 1092 @Override 1093 public CharMatcher or(CharMatcher other) { 1094 checkNotNull(other); 1095 return this; 1096 } 1097 1098 @Override 1099 public CharMatcher negate() { 1100 return none(); 1101 } 1102 } 1103 1104 /** Implementation of {@link #none()}. */ 1105 private static final class None extends NamedFastMatcher { 1106 1107 static final None INSTANCE = new None(); 1108 1109 private None() { 1110 super("CharMatcher.none()"); 1111 } 1112 1113 @Override 1114 public boolean matches(char c) { 1115 return false; 1116 } 1117 1118 @Override 1119 public int indexIn(CharSequence sequence) { 1120 checkNotNull(sequence); 1121 return -1; 1122 } 1123 1124 @Override 1125 public int indexIn(CharSequence sequence, int start) { 1126 int length = sequence.length(); 1127 checkPositionIndex(start, length); 1128 return -1; 1129 } 1130 1131 @Override 1132 public int lastIndexIn(CharSequence sequence) { 1133 checkNotNull(sequence); 1134 return -1; 1135 } 1136 1137 @Override 1138 public boolean matchesAllOf(CharSequence sequence) { 1139 return sequence.length() == 0; 1140 } 1141 1142 @Override 1143 public boolean matchesNoneOf(CharSequence sequence) { 1144 checkNotNull(sequence); 1145 return true; 1146 } 1147 1148 @Override 1149 public String removeFrom(CharSequence sequence) { 1150 return sequence.toString(); 1151 } 1152 1153 @Override 1154 public String replaceFrom(CharSequence sequence, char replacement) { 1155 return sequence.toString(); 1156 } 1157 1158 @Override 1159 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1160 checkNotNull(replacement); 1161 return sequence.toString(); 1162 } 1163 1164 @Override 1165 public String collapseFrom(CharSequence sequence, char replacement) { 1166 return sequence.toString(); 1167 } 1168 1169 @Override 1170 public String trimFrom(CharSequence sequence) { 1171 return sequence.toString(); 1172 } 1173 1174 @Override 1175 public String trimLeadingFrom(CharSequence sequence) { 1176 return sequence.toString(); 1177 } 1178 1179 @Override 1180 public String trimTrailingFrom(CharSequence sequence) { 1181 return sequence.toString(); 1182 } 1183 1184 @Override 1185 public int countIn(CharSequence sequence) { 1186 checkNotNull(sequence); 1187 return 0; 1188 } 1189 1190 @Override 1191 public CharMatcher and(CharMatcher other) { 1192 checkNotNull(other); 1193 return this; 1194 } 1195 1196 @Override 1197 public CharMatcher or(CharMatcher other) { 1198 return checkNotNull(other); 1199 } 1200 1201 @Override 1202 public CharMatcher negate() { 1203 return any(); 1204 } 1205 } 1206 1207 /** Implementation of {@link #whitespace()}. */ 1208 @VisibleForTesting 1209 static final class Whitespace extends NamedFastMatcher { 1210 1211 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1212 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1213 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1214 // other use-cases, the reduction in readability isn't worth it. 1215 static final String TABLE = 1216 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1217 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1218 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1219 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1220 static final int MULTIPLIER = 1682554634; 1221 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1222 1223 static final Whitespace INSTANCE = new Whitespace(); 1224 1225 Whitespace() { 1226 super("CharMatcher.whitespace()"); 1227 } 1228 1229 @Override 1230 public boolean matches(char c) { 1231 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1232 } 1233 1234 @GwtIncompatible // used only from other GwtIncompatible code 1235 @Override 1236 void setBits(BitSet table) { 1237 for (int i = 0; i < TABLE.length(); i++) { 1238 table.set(TABLE.charAt(i)); 1239 } 1240 } 1241 } 1242 1243 /** Implementation of {@link #breakingWhitespace()}. */ 1244 private static final class BreakingWhitespace extends CharMatcher { 1245 1246 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1247 1248 @Override 1249 public boolean matches(char c) { 1250 switch (c) { 1251 case '\t': 1252 case '\n': 1253 case '\013': 1254 case '\f': 1255 case '\r': 1256 case ' ': 1257 case '\u0085': 1258 case '\u1680': 1259 case '\u2028': 1260 case '\u2029': 1261 case '\u205f': 1262 case '\u3000': 1263 return true; 1264 case '\u2007': 1265 return false; 1266 default: 1267 return c >= '\u2000' && c <= '\u200a'; 1268 } 1269 } 1270 1271 @Override 1272 public String toString() { 1273 return "CharMatcher.breakingWhitespace()"; 1274 } 1275 } 1276 1277 /** Implementation of {@link #ascii()}. */ 1278 private static final class Ascii extends NamedFastMatcher { 1279 1280 static final Ascii INSTANCE = new Ascii(); 1281 1282 Ascii() { 1283 super("CharMatcher.ascii()"); 1284 } 1285 1286 @Override 1287 public boolean matches(char c) { 1288 return c <= '\u007f'; 1289 } 1290 } 1291 1292 /** Implementation that matches characters that fall within multiple ranges. */ 1293 private static class RangesMatcher extends CharMatcher { 1294 1295 private final String description; 1296 private final char[] rangeStarts; 1297 private final char[] rangeEnds; 1298 1299 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1300 this.description = description; 1301 this.rangeStarts = rangeStarts; 1302 this.rangeEnds = rangeEnds; 1303 checkArgument(rangeStarts.length == rangeEnds.length); 1304 for (int i = 0; i < rangeStarts.length; i++) { 1305 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1306 if (i + 1 < rangeStarts.length) { 1307 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1308 } 1309 } 1310 } 1311 1312 @Override 1313 public boolean matches(char c) { 1314 int index = Arrays.binarySearch(rangeStarts, c); 1315 if (index >= 0) { 1316 return true; 1317 } else { 1318 index = ~index - 1; 1319 return index >= 0 && c <= rangeEnds[index]; 1320 } 1321 } 1322 1323 @Override 1324 public String toString() { 1325 return description; 1326 } 1327 } 1328 1329 /** Implementation of {@link #digit()}. */ 1330 private static final class Digit extends RangesMatcher { 1331 // Plug the following UnicodeSet pattern into 1332 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1333 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1334 // and get the zeroes from there. 1335 1336 // Must be in ascending order. 1337 private static final String ZEROES = 1338 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1339 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1340 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1341 1342 private static char[] zeroes() { 1343 return ZEROES.toCharArray(); 1344 } 1345 1346 private static char[] nines() { 1347 char[] nines = new char[ZEROES.length()]; 1348 for (int i = 0; i < ZEROES.length(); i++) { 1349 nines[i] = (char) (ZEROES.charAt(i) + 9); 1350 } 1351 return nines; 1352 } 1353 1354 static final Digit INSTANCE = new Digit(); 1355 1356 private Digit() { 1357 super("CharMatcher.digit()", zeroes(), nines()); 1358 } 1359 } 1360 1361 /** Implementation of {@link #javaDigit()}. */ 1362 private static final class JavaDigit extends CharMatcher { 1363 1364 static final JavaDigit INSTANCE = new JavaDigit(); 1365 1366 @Override 1367 public boolean matches(char c) { 1368 return Character.isDigit(c); 1369 } 1370 1371 @Override 1372 public String toString() { 1373 return "CharMatcher.javaDigit()"; 1374 } 1375 } 1376 1377 /** Implementation of {@link #javaLetter()}. */ 1378 private static final class JavaLetter extends CharMatcher { 1379 1380 static final JavaLetter INSTANCE = new JavaLetter(); 1381 1382 @Override 1383 public boolean matches(char c) { 1384 return Character.isLetter(c); 1385 } 1386 1387 @Override 1388 public String toString() { 1389 return "CharMatcher.javaLetter()"; 1390 } 1391 } 1392 1393 /** Implementation of {@link #javaLetterOrDigit()}. */ 1394 private static final class JavaLetterOrDigit extends CharMatcher { 1395 1396 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1397 1398 @Override 1399 public boolean matches(char c) { 1400 return Character.isLetterOrDigit(c); 1401 } 1402 1403 @Override 1404 public String toString() { 1405 return "CharMatcher.javaLetterOrDigit()"; 1406 } 1407 } 1408 1409 /** Implementation of {@link #javaUpperCase()}. */ 1410 private static final class JavaUpperCase extends CharMatcher { 1411 1412 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1413 1414 @Override 1415 public boolean matches(char c) { 1416 return Character.isUpperCase(c); 1417 } 1418 1419 @Override 1420 public String toString() { 1421 return "CharMatcher.javaUpperCase()"; 1422 } 1423 } 1424 1425 /** Implementation of {@link #javaLowerCase()}. */ 1426 private static final class JavaLowerCase extends CharMatcher { 1427 1428 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1429 1430 @Override 1431 public boolean matches(char c) { 1432 return Character.isLowerCase(c); 1433 } 1434 1435 @Override 1436 public String toString() { 1437 return "CharMatcher.javaLowerCase()"; 1438 } 1439 } 1440 1441 /** Implementation of {@link #javaIsoControl()}. */ 1442 private static final class JavaIsoControl extends NamedFastMatcher { 1443 1444 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1445 1446 private JavaIsoControl() { 1447 super("CharMatcher.javaIsoControl()"); 1448 } 1449 1450 @Override 1451 public boolean matches(char c) { 1452 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1453 } 1454 } 1455 1456 /** Implementation of {@link #invisible()}. */ 1457 private static final class Invisible extends RangesMatcher { 1458 // Plug the following UnicodeSet pattern into 1459 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1460 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1461 // with the "Abbreviate" option, and get the ranges from there. 1462 private static final String RANGE_STARTS = 1463 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1464 + "\u3000\ud800\ufeff\ufff9"; 1465 private static final String RANGE_ENDS = // inclusive ends 1466 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1467 + "\u3000\uf8ff\ufeff\ufffb"; 1468 1469 static final Invisible INSTANCE = new Invisible(); 1470 1471 private Invisible() { 1472 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1473 } 1474 } 1475 1476 /** Implementation of {@link #singleWidth()}. */ 1477 private static final class SingleWidth extends RangesMatcher { 1478 1479 static final SingleWidth INSTANCE = new SingleWidth(); 1480 1481 private SingleWidth() { 1482 super( 1483 "CharMatcher.singleWidth()", 1484 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1485 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1486 } 1487 } 1488 1489 // Non-static factory implementation classes 1490 1491 /** Implementation of {@link #negate()}. */ 1492 private static class Negated extends CharMatcher { 1493 1494 final CharMatcher original; 1495 1496 Negated(CharMatcher original) { 1497 this.original = checkNotNull(original); 1498 } 1499 1500 @Override 1501 public boolean matches(char c) { 1502 return !original.matches(c); 1503 } 1504 1505 @Override 1506 public boolean matchesAllOf(CharSequence sequence) { 1507 return original.matchesNoneOf(sequence); 1508 } 1509 1510 @Override 1511 public boolean matchesNoneOf(CharSequence sequence) { 1512 return original.matchesAllOf(sequence); 1513 } 1514 1515 @Override 1516 public int countIn(CharSequence sequence) { 1517 return sequence.length() - original.countIn(sequence); 1518 } 1519 1520 @GwtIncompatible // used only from other GwtIncompatible code 1521 @Override 1522 void setBits(BitSet table) { 1523 BitSet tmp = new BitSet(); 1524 original.setBits(tmp); 1525 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1526 table.or(tmp); 1527 } 1528 1529 @Override 1530 public CharMatcher negate() { 1531 return original; 1532 } 1533 1534 @Override 1535 public String toString() { 1536 return original + ".negate()"; 1537 } 1538 } 1539 1540 /** Implementation of {@link #and(CharMatcher)}. */ 1541 private static final class And extends CharMatcher { 1542 1543 final CharMatcher first; 1544 final CharMatcher second; 1545 1546 And(CharMatcher a, CharMatcher b) { 1547 first = checkNotNull(a); 1548 second = checkNotNull(b); 1549 } 1550 1551 @Override 1552 public boolean matches(char c) { 1553 return first.matches(c) && second.matches(c); 1554 } 1555 1556 @GwtIncompatible // used only from other GwtIncompatible code 1557 @Override 1558 void setBits(BitSet table) { 1559 BitSet tmp1 = new BitSet(); 1560 first.setBits(tmp1); 1561 BitSet tmp2 = new BitSet(); 1562 second.setBits(tmp2); 1563 tmp1.and(tmp2); 1564 table.or(tmp1); 1565 } 1566 1567 @Override 1568 public String toString() { 1569 return "CharMatcher.and(" + first + ", " + second + ")"; 1570 } 1571 } 1572 1573 /** Implementation of {@link #or(CharMatcher)}. */ 1574 private static final class Or extends CharMatcher { 1575 1576 final CharMatcher first; 1577 final CharMatcher second; 1578 1579 Or(CharMatcher a, CharMatcher b) { 1580 first = checkNotNull(a); 1581 second = checkNotNull(b); 1582 } 1583 1584 @GwtIncompatible // used only from other GwtIncompatible code 1585 @Override 1586 void setBits(BitSet table) { 1587 first.setBits(table); 1588 second.setBits(table); 1589 } 1590 1591 @Override 1592 public boolean matches(char c) { 1593 return first.matches(c) || second.matches(c); 1594 } 1595 1596 @Override 1597 public String toString() { 1598 return "CharMatcher.or(" + first + ", " + second + ")"; 1599 } 1600 } 1601 1602 // Static factory implementations 1603 1604 /** Implementation of {@link #is(char)}. */ 1605 private static final class Is extends FastMatcher { 1606 1607 private final char match; 1608 1609 Is(char match) { 1610 this.match = match; 1611 } 1612 1613 @Override 1614 public boolean matches(char c) { 1615 return c == match; 1616 } 1617 1618 @Override 1619 public String replaceFrom(CharSequence sequence, char replacement) { 1620 return sequence.toString().replace(match, replacement); 1621 } 1622 1623 @Override 1624 public CharMatcher and(CharMatcher other) { 1625 return other.matches(match) ? this : none(); 1626 } 1627 1628 @Override 1629 public CharMatcher or(CharMatcher other) { 1630 return other.matches(match) ? other : super.or(other); 1631 } 1632 1633 @Override 1634 public CharMatcher negate() { 1635 return isNot(match); 1636 } 1637 1638 @GwtIncompatible // used only from other GwtIncompatible code 1639 @Override 1640 void setBits(BitSet table) { 1641 table.set(match); 1642 } 1643 1644 @Override 1645 public String toString() { 1646 return "CharMatcher.is('" + showCharacter(match) + "')"; 1647 } 1648 } 1649 1650 /** Implementation of {@link #isNot(char)}. */ 1651 private static final class IsNot extends FastMatcher { 1652 1653 private final char match; 1654 1655 IsNot(char match) { 1656 this.match = match; 1657 } 1658 1659 @Override 1660 public boolean matches(char c) { 1661 return c != match; 1662 } 1663 1664 @Override 1665 public CharMatcher and(CharMatcher other) { 1666 return other.matches(match) ? super.and(other) : other; 1667 } 1668 1669 @Override 1670 public CharMatcher or(CharMatcher other) { 1671 return other.matches(match) ? any() : this; 1672 } 1673 1674 @GwtIncompatible // used only from other GwtIncompatible code 1675 @Override 1676 void setBits(BitSet table) { 1677 table.set(0, match); 1678 table.set(match + 1, Character.MAX_VALUE + 1); 1679 } 1680 1681 @Override 1682 public CharMatcher negate() { 1683 return is(match); 1684 } 1685 1686 @Override 1687 public String toString() { 1688 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1689 } 1690 } 1691 1692 private static CharMatcher.IsEither isEither(char c1, char c2) { 1693 return new CharMatcher.IsEither(c1, c2); 1694 } 1695 1696 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1697 private static final class IsEither extends FastMatcher { 1698 1699 private final char match1; 1700 private final char match2; 1701 1702 IsEither(char match1, char match2) { 1703 this.match1 = match1; 1704 this.match2 = match2; 1705 } 1706 1707 @Override 1708 public boolean matches(char c) { 1709 return c == match1 || c == match2; 1710 } 1711 1712 @GwtIncompatible // used only from other GwtIncompatible code 1713 @Override 1714 void setBits(BitSet table) { 1715 table.set(match1); 1716 table.set(match2); 1717 } 1718 1719 @Override 1720 public String toString() { 1721 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1722 } 1723 } 1724 1725 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ 1726 private static final class AnyOf extends CharMatcher { 1727 1728 private final char[] chars; 1729 1730 public AnyOf(CharSequence chars) { 1731 this.chars = chars.toString().toCharArray(); 1732 Arrays.sort(this.chars); 1733 } 1734 1735 @Override 1736 public boolean matches(char c) { 1737 return Arrays.binarySearch(chars, c) >= 0; 1738 } 1739 1740 @Override 1741 @GwtIncompatible // used only from other GwtIncompatible code 1742 void setBits(BitSet table) { 1743 for (char c : chars) { 1744 table.set(c); 1745 } 1746 } 1747 1748 @Override 1749 public String toString() { 1750 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 1751 for (char c : chars) { 1752 description.append(showCharacter(c)); 1753 } 1754 description.append("\")"); 1755 return description.toString(); 1756 } 1757 } 1758 1759 /** Implementation of {@link #inRange(char, char)}. */ 1760 private static final class InRange extends FastMatcher { 1761 1762 private final char startInclusive; 1763 private final char endInclusive; 1764 1765 InRange(char startInclusive, char endInclusive) { 1766 checkArgument(endInclusive >= startInclusive); 1767 this.startInclusive = startInclusive; 1768 this.endInclusive = endInclusive; 1769 } 1770 1771 @Override 1772 public boolean matches(char c) { 1773 return startInclusive <= c && c <= endInclusive; 1774 } 1775 1776 @GwtIncompatible // used only from other GwtIncompatible code 1777 @Override 1778 void setBits(BitSet table) { 1779 table.set(startInclusive, endInclusive + 1); 1780 } 1781 1782 @Override 1783 public String toString() { 1784 return "CharMatcher.inRange('" 1785 + showCharacter(startInclusive) 1786 + "', '" 1787 + showCharacter(endInclusive) 1788 + "')"; 1789 } 1790 } 1791 1792 /** Implementation of {@link #forPredicate(Predicate)}. */ 1793 private static final class ForPredicate extends CharMatcher { 1794 1795 private final Predicate<? super Character> predicate; 1796 1797 ForPredicate(Predicate<? super Character> predicate) { 1798 this.predicate = checkNotNull(predicate); 1799 } 1800 1801 @Override 1802 public boolean matches(char c) { 1803 return predicate.apply(c); 1804 } 1805 1806 @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily 1807 @Override 1808 public boolean apply(Character character) { 1809 return predicate.apply(checkNotNull(character)); 1810 } 1811 1812 @Override 1813 public String toString() { 1814 return "CharMatcher.forPredicate(" + predicate + ")"; 1815 } 1816 } 1817}