001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 059 * 060 * @author Kevin Bourrillion 061 * @since 1.0 062 */ 063@GwtCompatible(emulated = true) 064public abstract class CharMatcher implements Predicate<Character> { 065 /* 066 * N777777777NO 067 * N7777777777777N 068 * M777777777777777N 069 * $N877777777D77777M 070 * N M77777777ONND777M 071 * MN777777777NN D777 072 * N7ZN777777777NN ~M7778 073 * N777777777777MMNN88777N 074 * N777777777777MNZZZ7777O 075 * DZN7777O77777777777777 076 * N7OONND7777777D77777N 077 * 8$M++++?N???$77777$ 078 * M7++++N+M77777777N 079 * N77O777777777777$ M 080 * DNNM$$$$777777N D 081 * N$N:=N$777N7777M NZ 082 * 77Z::::N777777777 ODZZZ 083 * 77N::::::N77777777M NNZZZ$ 084 * $777:::::::77777777MN ZM8ZZZZZ 085 * 777M::::::Z7777777Z77 N++ZZZZNN 086 * 7777M:::::M7777777$777M $++IZZZZM 087 * M777$:::::N777777$M7777M +++++ZZZDN 088 * NN$::::::7777$$M777777N N+++ZZZZNZ 089 * N::::::N:7$O:77777777 N++++ZZZZN 090 * M::::::::::::N77777777+ +?+++++ZZZM 091 * 8::::::::::::D77777777M O+++++ZZ 092 * ::::::::::::M777777777N O+?D 093 * M:::::::::::M77777777778 77= 094 * D=::::::::::N7777777777N 777 095 * INN===::::::=77777777777N I777N 096 * ?777N========N7777777777787M N7777 097 * 77777$D======N77777777777N777N? N777777 098 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 099 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 100 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 101 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 102 * 7 :N MNN$$$$M$ $$$777$8 8D8I 103 * NMM.:7O 777777778 104 * 7777777MN 105 * M NO .7: 106 * M : M 107 * 8 108 */ 109 110 // Constant matcher factory methods 111 112 /** 113 * Matches any character. 114 * 115 * @since 19.0 (since 1.0 as constant {@code ANY}) 116 */ 117 public static CharMatcher any() { 118 return Any.INSTANCE; 119 } 120 121 /** 122 * Matches no characters. 123 * 124 * @since 19.0 (since 1.0 as constant {@code NONE}) 125 */ 126 public static CharMatcher none() { 127 return None.INSTANCE; 128 } 129 130 /** 131 * Determines whether a character is whitespace according to the latest Unicode standard, as 132 * illustrated <a 133 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 134 * This is not the same definition used by other Java APIs. (See a <a 135 * href="https://docs.google.com/spreadsheets/d/1kq4ECwPjHX9B8QUCTPclgsDCXYaj7T-FlT4tB5q3ahk/edit">comparison 136 * of several definitions of "whitespace"</a>.) 137 * 138 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 139 * 140 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 141 * date. 142 * 143 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 144 */ 145 public static CharMatcher whitespace() { 146 return Whitespace.INSTANCE; 147 } 148 149 /** 150 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 151 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 152 * discussion of that term. 153 * 154 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 155 */ 156 public static CharMatcher breakingWhitespace() { 157 return BreakingWhitespace.INSTANCE; 158 } 159 160 /** 161 * Determines whether a character is ASCII, meaning that its code point is less than 128. 162 * 163 * @since 19.0 (since 1.0 as constant {@code ASCII}) 164 */ 165 public static CharMatcher ascii() { 166 return Ascii.INSTANCE; 167 } 168 169 /** 170 * Determines whether a character is a BMP digit according to <a 171 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 172 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 173 * 174 * @deprecated Many digits are supplementary characters; see the class documentation. 175 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 176 */ 177 @Deprecated 178 public static CharMatcher digit() { 179 return Digit.INSTANCE; 180 } 181 182 /** 183 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 184 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 185 * '9')}. 186 * 187 * @deprecated Many digits are supplementary characters; see the class documentation. 188 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 189 */ 190 @Deprecated 191 public static CharMatcher javaDigit() { 192 return JavaDigit.INSTANCE; 193 } 194 195 /** 196 * Determines whether a character is a BMP letter according to {@linkplain 197 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 198 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 199 * 200 * @deprecated Most letters are supplementary characters; see the class documentation. 201 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 202 */ 203 @Deprecated 204 public static CharMatcher javaLetter() { 205 return JavaLetter.INSTANCE; 206 } 207 208 /** 209 * Determines whether a character is a BMP letter or digit according to {@linkplain 210 * Character#isLetterOrDigit(char) Java's definition}. 211 * 212 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 213 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 214 */ 215 @Deprecated 216 public static CharMatcher javaLetterOrDigit() { 217 return JavaLetterOrDigit.INSTANCE; 218 } 219 220 /** 221 * Determines whether a BMP character is upper case according to {@linkplain 222 * Character#isUpperCase(char) Java's definition}. 223 * 224 * @deprecated Some uppercase characters are supplementary characters; see the class 225 * documentation. 226 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 227 */ 228 @Deprecated 229 public static CharMatcher javaUpperCase() { 230 return JavaUpperCase.INSTANCE; 231 } 232 233 /** 234 * Determines whether a BMP character is lower case according to {@linkplain 235 * Character#isLowerCase(char) Java's definition}. 236 * 237 * @deprecated Some lowercase characters are supplementary characters; see the class 238 * documentation. 239 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 240 */ 241 @Deprecated 242 public static CharMatcher javaLowerCase() { 243 return JavaLowerCase.INSTANCE; 244 } 245 246 /** 247 * Determines whether a character is an ISO control character as specified by {@link 248 * Character#isISOControl(char)}. 249 * 250 * <p>All ISO control codes are on the BMP and thus supported by this API. 251 * 252 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 253 */ 254 public static CharMatcher javaIsoControl() { 255 return JavaIsoControl.INSTANCE; 256 } 257 258 /** 259 * Determines whether a character is invisible; that is, if its Unicode category is any of 260 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 261 * PRIVATE_USE according to ICU4J. 262 * 263 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 264 * 265 * @deprecated Most invisible characters are supplementary characters; see the class 266 * documentation. 267 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 268 */ 269 @Deprecated 270 public static CharMatcher invisible() { 271 return Invisible.INSTANCE; 272 } 273 274 /** 275 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 276 * errs on the side of returning {@code false} (that is, it tends to assume a character is 277 * double-width). 278 * 279 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 280 * date. 281 * 282 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 283 * 284 * @deprecated Many such characters are supplementary characters; see the class documentation. 285 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 286 */ 287 @Deprecated 288 public static CharMatcher singleWidth() { 289 return SingleWidth.INSTANCE; 290 } 291 292 // Static factories 293 294 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 295 public static CharMatcher is(final char match) { 296 return new Is(match); 297 } 298 299 /** 300 * Returns a {@code char} matcher that matches any character except the BMP character specified. 301 * 302 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 303 */ 304 public static CharMatcher isNot(final char match) { 305 return new IsNot(match); 306 } 307 308 /** 309 * Returns a {@code char} matcher that matches any BMP character present in the given character 310 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 311 */ 312 public static CharMatcher anyOf(final CharSequence sequence) { 313 switch (sequence.length()) { 314 case 0: 315 return none(); 316 case 1: 317 return is(sequence.charAt(0)); 318 case 2: 319 return isEither(sequence.charAt(0), sequence.charAt(1)); 320 default: 321 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 322 // matcher? 323 return new AnyOf(sequence); 324 } 325 } 326 327 /** 328 * Returns a {@code char} matcher that matches any BMP character not present in the given 329 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 330 */ 331 public static CharMatcher noneOf(CharSequence sequence) { 332 return anyOf(sequence).negate(); 333 } 334 335 /** 336 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 337 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 338 * CharMatcher.inRange('a', 'z')}. 339 * 340 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 341 */ 342 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 343 return new InRange(startInclusive, endInclusive); 344 } 345 346 /** 347 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 348 * which operates on primitive {@code char} instances instead. 349 */ 350 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 351 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 352 } 353 354 // Constructors 355 356 /** 357 * Constructor for use by subclasses. When subclassing, you may want to override {@code 358 * toString()} to provide a useful description. 359 */ 360 protected CharMatcher() {} 361 362 // Abstract methods 363 364 /** Determines a true or false value for the given character. */ 365 public abstract boolean matches(char c); 366 367 // Non-static factories 368 369 /** Returns a matcher that matches any character not matched by this matcher. */ 370 // This is not an override in java7, where Guava's Predicate does not extend the JDK's Predicate. 371 @SuppressWarnings("MissingOverride") 372 public CharMatcher negate() { 373 return new Negated(this); 374 } 375 376 /** 377 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 378 */ 379 public CharMatcher and(CharMatcher other) { 380 return new And(this, other); 381 } 382 383 /** 384 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 385 */ 386 public CharMatcher or(CharMatcher other) { 387 return new Or(this, other); 388 } 389 390 /** 391 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 392 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 393 * worthwhile only if the precomputed matcher is queried many thousands of times. 394 * 395 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 396 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 397 * worthwhile tradeoff in a browser. 398 */ 399 public CharMatcher precomputed() { 400 return Platform.precomputeCharMatcher(this); 401 } 402 403 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 404 405 /** 406 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 407 * on {@link Platform} so that we can have different behavior in GWT. 408 * 409 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 410 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 411 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 412 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 413 * matcher which is faster to query than the original. 414 */ 415 @GwtIncompatible // SmallCharMatcher 416 CharMatcher precomputedInternal() { 417 final BitSet table = new BitSet(); 418 setBits(table); 419 int totalCharacters = table.cardinality(); 420 if (totalCharacters * 2 <= DISTINCT_CHARS) { 421 return precomputedPositive(totalCharacters, table, toString()); 422 } else { 423 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 424 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 425 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 426 String suffix = ".negate()"; 427 final String description = toString(); 428 String negatedDescription = 429 description.endsWith(suffix) 430 ? description.substring(0, description.length() - suffix.length()) 431 : description + suffix; 432 return new NegatedFastMatcher( 433 precomputedPositive(negatedCharacters, table, negatedDescription)) { 434 @Override 435 public String toString() { 436 return description; 437 } 438 }; 439 } 440 } 441 442 /** 443 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 444 */ 445 @GwtIncompatible // SmallCharMatcher 446 private static CharMatcher precomputedPositive( 447 int totalCharacters, BitSet table, String description) { 448 switch (totalCharacters) { 449 case 0: 450 return none(); 451 case 1: 452 return is((char) table.nextSetBit(0)); 453 case 2: 454 char c1 = (char) table.nextSetBit(0); 455 char c2 = (char) table.nextSetBit(c1 + 1); 456 return isEither(c1, c2); 457 default: 458 return isSmall(totalCharacters, table.length()) 459 ? SmallCharMatcher.from(table, description) 460 : new BitSetMatcher(table, description); 461 } 462 } 463 464 @GwtIncompatible // SmallCharMatcher 465 private static boolean isSmall(int totalCharacters, int tableLength) { 466 return totalCharacters <= SmallCharMatcher.MAX_SIZE 467 && tableLength > (totalCharacters * 4 * Character.SIZE); 468 // err on the side of BitSetMatcher 469 } 470 471 /** Sets bits in {@code table} matched by this matcher. */ 472 @GwtIncompatible // used only from other GwtIncompatible code 473 void setBits(BitSet table) { 474 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 475 if (matches((char) c)) { 476 table.set(c); 477 } 478 } 479 } 480 481 // Text processing routines 482 483 /** 484 * Returns {@code true} if a character sequence contains at least one matching BMP character. 485 * Equivalent to {@code !matchesNoneOf(sequence)}. 486 * 487 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 488 * character, until this returns {@code true} or the end is reached. 489 * 490 * @param sequence the character sequence to examine, possibly empty 491 * @return {@code true} if this matcher matches at least one character in the sequence 492 * @since 8.0 493 */ 494 public boolean matchesAnyOf(CharSequence sequence) { 495 return !matchesNoneOf(sequence); 496 } 497 498 /** 499 * Returns {@code true} if a character sequence contains only matching BMP characters. 500 * 501 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 502 * character, until this returns {@code false} or the end is reached. 503 * 504 * @param sequence the character sequence to examine, possibly empty 505 * @return {@code true} if this matcher matches every character in the sequence, including when 506 * the sequence is empty 507 */ 508 public boolean matchesAllOf(CharSequence sequence) { 509 for (int i = sequence.length() - 1; i >= 0; i--) { 510 if (!matches(sequence.charAt(i))) { 511 return false; 512 } 513 } 514 return true; 515 } 516 517 /** 518 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 519 * {@code !matchesAnyOf(sequence)}. 520 * 521 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 522 * character, until this returns {@code true} or the end is reached. 523 * 524 * @param sequence the character sequence to examine, possibly empty 525 * @return {@code true} if this matcher matches no characters in the sequence, including when the 526 * sequence is empty 527 */ 528 public boolean matchesNoneOf(CharSequence sequence) { 529 return indexIn(sequence) == -1; 530 } 531 532 /** 533 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 534 * no matching character is present. 535 * 536 * <p>The default implementation iterates over the sequence in forward order calling {@link 537 * #matches} for each character. 538 * 539 * @param sequence the character sequence to examine from the beginning 540 * @return an index, or {@code -1} if no character matches 541 */ 542 public int indexIn(CharSequence sequence) { 543 return indexIn(sequence, 0); 544 } 545 546 /** 547 * Returns the index of the first matching BMP character in a character sequence, starting from a 548 * given position, or {@code -1} if no character matches after that position. 549 * 550 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 551 * start}, calling {@link #matches} for each character. 552 * 553 * @param sequence the character sequence to examine 554 * @param start the first index to examine; must be nonnegative and no greater than {@code 555 * sequence.length()} 556 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 557 * or {@code -1} if no character matches 558 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 559 * sequence.length()} 560 */ 561 public int indexIn(CharSequence sequence, int start) { 562 int length = sequence.length(); 563 checkPositionIndex(start, length); 564 for (int i = start; i < length; i++) { 565 if (matches(sequence.charAt(i))) { 566 return i; 567 } 568 } 569 return -1; 570 } 571 572 /** 573 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 574 * no matching character is present. 575 * 576 * <p>The default implementation iterates over the sequence in reverse order calling {@link 577 * #matches} for each character. 578 * 579 * @param sequence the character sequence to examine from the end 580 * @return an index, or {@code -1} if no character matches 581 */ 582 public int lastIndexIn(CharSequence sequence) { 583 for (int i = sequence.length() - 1; i >= 0; i--) { 584 if (matches(sequence.charAt(i))) { 585 return i; 586 } 587 } 588 return -1; 589 } 590 591 /** 592 * Returns the number of matching {@code char}s found in a character sequence. 593 * 594 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 595 */ 596 public int countIn(CharSequence sequence) { 597 int count = 0; 598 for (int i = 0; i < sequence.length(); i++) { 599 if (matches(sequence.charAt(i))) { 600 count++; 601 } 602 } 603 return count; 604 } 605 606 /** 607 * Returns a string containing all non-matching characters of a character sequence, in order. For 608 * example: 609 * 610 * <pre>{@code 611 * CharMatcher.is('a').removeFrom("bazaar") 612 * }</pre> 613 * 614 * ... returns {@code "bzr"}. 615 */ 616 public String removeFrom(CharSequence sequence) { 617 String string = sequence.toString(); 618 int pos = indexIn(string); 619 if (pos == -1) { 620 return string; 621 } 622 623 char[] chars = string.toCharArray(); 624 int spread = 1; 625 626 // This unusual loop comes from extensive benchmarking 627 OUT: 628 while (true) { 629 pos++; 630 while (true) { 631 if (pos == chars.length) { 632 break OUT; 633 } 634 if (matches(chars[pos])) { 635 break; 636 } 637 chars[pos - spread] = chars[pos]; 638 pos++; 639 } 640 spread++; 641 } 642 return new String(chars, 0, pos - spread); 643 } 644 645 /** 646 * Returns a string containing all matching BMP characters of a character sequence, in order. For 647 * example: 648 * 649 * <pre>{@code 650 * CharMatcher.is('a').retainFrom("bazaar") 651 * }</pre> 652 * 653 * ... returns {@code "aaa"}. 654 */ 655 public String retainFrom(CharSequence sequence) { 656 return negate().removeFrom(sequence); 657 } 658 659 /** 660 * Returns a string copy of the input character sequence, with each matching BMP character 661 * replaced by a given replacement character. For example: 662 * 663 * <pre>{@code 664 * CharMatcher.is('a').replaceFrom("radar", 'o') 665 * }</pre> 666 * 667 * ... returns {@code "rodor"}. 668 * 669 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 670 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 671 * character. 672 * 673 * @param sequence the character sequence to replace matching characters in 674 * @param replacement the character to append to the result string in place of each matching 675 * character in {@code sequence} 676 * @return the new string 677 */ 678 public String replaceFrom(CharSequence sequence, char replacement) { 679 String string = sequence.toString(); 680 int pos = indexIn(string); 681 if (pos == -1) { 682 return string; 683 } 684 char[] chars = string.toCharArray(); 685 chars[pos] = replacement; 686 for (int i = pos + 1; i < chars.length; i++) { 687 if (matches(chars[i])) { 688 chars[i] = replacement; 689 } 690 } 691 return new String(chars); 692 } 693 694 /** 695 * Returns a string copy of the input character sequence, with each matching BMP character 696 * replaced by a given replacement sequence. For example: 697 * 698 * <pre>{@code 699 * CharMatcher.is('a').replaceFrom("yaha", "oo") 700 * }</pre> 701 * 702 * ... returns {@code "yoohoo"}. 703 * 704 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 705 * off calling {@link #replaceFrom(CharSequence, char)} directly. 706 * 707 * @param sequence the character sequence to replace matching characters in 708 * @param replacement the characters to append to the result string in place of each matching 709 * character in {@code sequence} 710 * @return the new string 711 */ 712 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 713 int replacementLen = replacement.length(); 714 if (replacementLen == 0) { 715 return removeFrom(sequence); 716 } 717 if (replacementLen == 1) { 718 return replaceFrom(sequence, replacement.charAt(0)); 719 } 720 721 String string = sequence.toString(); 722 int pos = indexIn(string); 723 if (pos == -1) { 724 return string; 725 } 726 727 int len = string.length(); 728 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 729 730 int oldpos = 0; 731 do { 732 buf.append(string, oldpos, pos); 733 buf.append(replacement); 734 oldpos = pos + 1; 735 pos = indexIn(string, oldpos); 736 } while (pos != -1); 737 738 buf.append(string, oldpos, len); 739 return buf.toString(); 740 } 741 742 /** 743 * Returns a substring of the input character sequence that omits all matching BMP characters from 744 * the beginning and from the end of the string. For example: 745 * 746 * <pre>{@code 747 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 748 * }</pre> 749 * 750 * ... returns {@code "cat"}. 751 * 752 * <p>Note that: 753 * 754 * <pre>{@code 755 * CharMatcher.inRange('\0', ' ').trimFrom(str) 756 * }</pre> 757 * 758 * ... is equivalent to {@link String#trim()}. 759 */ 760 public String trimFrom(CharSequence sequence) { 761 int len = sequence.length(); 762 int first; 763 int last; 764 765 for (first = 0; first < len; first++) { 766 if (!matches(sequence.charAt(first))) { 767 break; 768 } 769 } 770 for (last = len - 1; last > first; last--) { 771 if (!matches(sequence.charAt(last))) { 772 break; 773 } 774 } 775 776 return sequence.subSequence(first, last + 1).toString(); 777 } 778 779 /** 780 * Returns a substring of the input character sequence that omits all matching BMP characters from 781 * the beginning of the string. For example: 782 * 783 * <pre>{@code 784 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 785 * }</pre> 786 * 787 * ... returns {@code "catbab"}. 788 */ 789 public String trimLeadingFrom(CharSequence sequence) { 790 int len = sequence.length(); 791 for (int first = 0; first < len; first++) { 792 if (!matches(sequence.charAt(first))) { 793 return sequence.subSequence(first, len).toString(); 794 } 795 } 796 return ""; 797 } 798 799 /** 800 * Returns a substring of the input character sequence that omits all matching BMP characters from 801 * the end of the string. For example: 802 * 803 * <pre>{@code 804 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 805 * }</pre> 806 * 807 * ... returns {@code "abacat"}. 808 */ 809 public String trimTrailingFrom(CharSequence sequence) { 810 int len = sequence.length(); 811 for (int last = len - 1; last >= 0; last--) { 812 if (!matches(sequence.charAt(last))) { 813 return sequence.subSequence(0, last + 1).toString(); 814 } 815 } 816 return ""; 817 } 818 819 /** 820 * Returns a string copy of the input character sequence, with each group of consecutive matching 821 * BMP characters replaced by a single replacement character. For example: 822 * 823 * <pre>{@code 824 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 825 * }</pre> 826 * 827 * ... returns {@code "b-p-r"}. 828 * 829 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 830 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 831 * character. 832 * 833 * @param sequence the character sequence to replace matching groups of characters in 834 * @param replacement the character to append to the result string in place of each group of 835 * matching characters in {@code sequence} 836 * @return the new string 837 */ 838 public String collapseFrom(CharSequence sequence, char replacement) { 839 // This implementation avoids unnecessary allocation. 840 int len = sequence.length(); 841 for (int i = 0; i < len; i++) { 842 char c = sequence.charAt(i); 843 if (matches(c)) { 844 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 845 // a no-op replacement 846 i++; 847 } else { 848 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 849 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 850 } 851 } 852 } 853 // no replacement needed 854 return sequence.toString(); 855 } 856 857 /** 858 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 859 * groups of matching BMP characters at the start or end of the sequence are removed without 860 * replacement. 861 */ 862 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 863 // This implementation avoids unnecessary allocation. 864 int len = sequence.length(); 865 int first = 0; 866 int last = len - 1; 867 868 while (first < len && matches(sequence.charAt(first))) { 869 first++; 870 } 871 872 while (last > first && matches(sequence.charAt(last))) { 873 last--; 874 } 875 876 return (first == 0 && last == len - 1) 877 ? collapseFrom(sequence, replacement) 878 : finishCollapseFrom( 879 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 880 } 881 882 private String finishCollapseFrom( 883 CharSequence sequence, 884 int start, 885 int end, 886 char replacement, 887 StringBuilder builder, 888 boolean inMatchingGroup) { 889 for (int i = start; i < end; i++) { 890 char c = sequence.charAt(i); 891 if (matches(c)) { 892 if (!inMatchingGroup) { 893 builder.append(replacement); 894 inMatchingGroup = true; 895 } 896 } else { 897 builder.append(c); 898 inMatchingGroup = false; 899 } 900 } 901 return builder.toString(); 902 } 903 904 /** 905 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 906 * instead. 907 */ 908 @Deprecated 909 @Override 910 public boolean apply(Character character) { 911 return matches(character); 912 } 913 914 /** 915 * Returns a string representation of this {@code CharMatcher}, such as {@code 916 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 917 */ 918 @Override 919 public String toString() { 920 return super.toString(); 921 } 922 923 /** 924 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 925 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 926 */ 927 private static String showCharacter(char c) { 928 String hex = "0123456789ABCDEF"; 929 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 930 for (int i = 0; i < 4; i++) { 931 tmp[5 - i] = hex.charAt(c & 0xF); 932 c = (char) (c >> 4); 933 } 934 return String.copyValueOf(tmp); 935 } 936 937 // Fast matchers 938 939 /** A matcher for which precomputation will not yield any significant benefit. */ 940 abstract static class FastMatcher extends CharMatcher { 941 942 @Override 943 public final CharMatcher precomputed() { 944 return this; 945 } 946 947 @Override 948 public CharMatcher negate() { 949 return new NegatedFastMatcher(this); 950 } 951 } 952 953 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 954 abstract static class NamedFastMatcher extends FastMatcher { 955 956 private final String description; 957 958 NamedFastMatcher(String description) { 959 this.description = checkNotNull(description); 960 } 961 962 @Override 963 public final String toString() { 964 return description; 965 } 966 } 967 968 /** Negation of a {@link FastMatcher}. */ 969 private static class NegatedFastMatcher extends Negated { 970 971 NegatedFastMatcher(CharMatcher original) { 972 super(original); 973 } 974 975 @Override 976 public final CharMatcher precomputed() { 977 return this; 978 } 979 } 980 981 /** Fast matcher using a {@link BitSet} table of matching characters. */ 982 @GwtIncompatible // used only from other GwtIncompatible code 983 private static final class BitSetMatcher extends NamedFastMatcher { 984 985 private final BitSet table; 986 987 private BitSetMatcher(BitSet table, String description) { 988 super(description); 989 if (table.length() + Long.SIZE < table.size()) { 990 table = (BitSet) table.clone(); 991 // If only we could actually call BitSet.trimToSize() ourselves... 992 } 993 this.table = table; 994 } 995 996 @Override 997 public boolean matches(char c) { 998 return table.get(c); 999 } 1000 1001 @Override 1002 void setBits(BitSet bitSet) { 1003 bitSet.or(table); 1004 } 1005 } 1006 1007 // Static constant implementation classes 1008 1009 /** Implementation of {@link #any()}. */ 1010 private static final class Any extends NamedFastMatcher { 1011 1012 static final CharMatcher INSTANCE = new Any(); 1013 1014 private Any() { 1015 super("CharMatcher.any()"); 1016 } 1017 1018 @Override 1019 public boolean matches(char c) { 1020 return true; 1021 } 1022 1023 @Override 1024 public int indexIn(CharSequence sequence) { 1025 return (sequence.length() == 0) ? -1 : 0; 1026 } 1027 1028 @Override 1029 public int indexIn(CharSequence sequence, int start) { 1030 int length = sequence.length(); 1031 checkPositionIndex(start, length); 1032 return (start == length) ? -1 : start; 1033 } 1034 1035 @Override 1036 public int lastIndexIn(CharSequence sequence) { 1037 return sequence.length() - 1; 1038 } 1039 1040 @Override 1041 public boolean matchesAllOf(CharSequence sequence) { 1042 checkNotNull(sequence); 1043 return true; 1044 } 1045 1046 @Override 1047 public boolean matchesNoneOf(CharSequence sequence) { 1048 return sequence.length() == 0; 1049 } 1050 1051 @Override 1052 public String removeFrom(CharSequence sequence) { 1053 checkNotNull(sequence); 1054 return ""; 1055 } 1056 1057 @Override 1058 public String replaceFrom(CharSequence sequence, char replacement) { 1059 char[] array = new char[sequence.length()]; 1060 Arrays.fill(array, replacement); 1061 return new String(array); 1062 } 1063 1064 @Override 1065 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1066 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1067 for (int i = 0; i < sequence.length(); i++) { 1068 result.append(replacement); 1069 } 1070 return result.toString(); 1071 } 1072 1073 @Override 1074 public String collapseFrom(CharSequence sequence, char replacement) { 1075 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1076 } 1077 1078 @Override 1079 public String trimFrom(CharSequence sequence) { 1080 checkNotNull(sequence); 1081 return ""; 1082 } 1083 1084 @Override 1085 public int countIn(CharSequence sequence) { 1086 return sequence.length(); 1087 } 1088 1089 @Override 1090 public CharMatcher and(CharMatcher other) { 1091 return checkNotNull(other); 1092 } 1093 1094 @Override 1095 public CharMatcher or(CharMatcher other) { 1096 checkNotNull(other); 1097 return this; 1098 } 1099 1100 @Override 1101 public CharMatcher negate() { 1102 return none(); 1103 } 1104 } 1105 1106 /** Implementation of {@link #none()}. */ 1107 private static final class None extends NamedFastMatcher { 1108 1109 static final CharMatcher INSTANCE = new None(); 1110 1111 private None() { 1112 super("CharMatcher.none()"); 1113 } 1114 1115 @Override 1116 public boolean matches(char c) { 1117 return false; 1118 } 1119 1120 @Override 1121 public int indexIn(CharSequence sequence) { 1122 checkNotNull(sequence); 1123 return -1; 1124 } 1125 1126 @Override 1127 public int indexIn(CharSequence sequence, int start) { 1128 int length = sequence.length(); 1129 checkPositionIndex(start, length); 1130 return -1; 1131 } 1132 1133 @Override 1134 public int lastIndexIn(CharSequence sequence) { 1135 checkNotNull(sequence); 1136 return -1; 1137 } 1138 1139 @Override 1140 public boolean matchesAllOf(CharSequence sequence) { 1141 return sequence.length() == 0; 1142 } 1143 1144 @Override 1145 public boolean matchesNoneOf(CharSequence sequence) { 1146 checkNotNull(sequence); 1147 return true; 1148 } 1149 1150 @Override 1151 public String removeFrom(CharSequence sequence) { 1152 return sequence.toString(); 1153 } 1154 1155 @Override 1156 public String replaceFrom(CharSequence sequence, char replacement) { 1157 return sequence.toString(); 1158 } 1159 1160 @Override 1161 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1162 checkNotNull(replacement); 1163 return sequence.toString(); 1164 } 1165 1166 @Override 1167 public String collapseFrom(CharSequence sequence, char replacement) { 1168 return sequence.toString(); 1169 } 1170 1171 @Override 1172 public String trimFrom(CharSequence sequence) { 1173 return sequence.toString(); 1174 } 1175 1176 @Override 1177 public String trimLeadingFrom(CharSequence sequence) { 1178 return sequence.toString(); 1179 } 1180 1181 @Override 1182 public String trimTrailingFrom(CharSequence sequence) { 1183 return sequence.toString(); 1184 } 1185 1186 @Override 1187 public int countIn(CharSequence sequence) { 1188 checkNotNull(sequence); 1189 return 0; 1190 } 1191 1192 @Override 1193 public CharMatcher and(CharMatcher other) { 1194 checkNotNull(other); 1195 return this; 1196 } 1197 1198 @Override 1199 public CharMatcher or(CharMatcher other) { 1200 return checkNotNull(other); 1201 } 1202 1203 @Override 1204 public CharMatcher negate() { 1205 return any(); 1206 } 1207 } 1208 1209 /** Implementation of {@link #whitespace()}. */ 1210 @VisibleForTesting 1211 static final class Whitespace extends NamedFastMatcher { 1212 1213 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1214 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1215 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1216 // other use-cases, the reduction in readability isn't worth it. 1217 static final String TABLE = 1218 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1219 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1220 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1221 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1222 static final int MULTIPLIER = 1682554634; 1223 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1224 1225 static final CharMatcher INSTANCE = new Whitespace(); 1226 1227 Whitespace() { 1228 super("CharMatcher.whitespace()"); 1229 } 1230 1231 @Override 1232 public boolean matches(char c) { 1233 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1234 } 1235 1236 @GwtIncompatible // used only from other GwtIncompatible code 1237 @Override 1238 void setBits(BitSet table) { 1239 for (int i = 0; i < TABLE.length(); i++) { 1240 table.set(TABLE.charAt(i)); 1241 } 1242 } 1243 } 1244 1245 /** Implementation of {@link #breakingWhitespace()}. */ 1246 private static final class BreakingWhitespace extends CharMatcher { 1247 1248 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1249 1250 @Override 1251 public boolean matches(char c) { 1252 switch (c) { 1253 case '\t': 1254 case '\n': 1255 case '\013': 1256 case '\f': 1257 case '\r': 1258 case ' ': 1259 case '\u0085': 1260 case '\u1680': 1261 case '\u2028': 1262 case '\u2029': 1263 case '\u205f': 1264 case '\u3000': 1265 return true; 1266 case '\u2007': 1267 return false; 1268 default: 1269 return c >= '\u2000' && c <= '\u200a'; 1270 } 1271 } 1272 1273 @Override 1274 public String toString() { 1275 return "CharMatcher.breakingWhitespace()"; 1276 } 1277 } 1278 1279 /** Implementation of {@link #ascii()}. */ 1280 private static final class Ascii extends NamedFastMatcher { 1281 1282 static final CharMatcher INSTANCE = new Ascii(); 1283 1284 Ascii() { 1285 super("CharMatcher.ascii()"); 1286 } 1287 1288 @Override 1289 public boolean matches(char c) { 1290 return c <= '\u007f'; 1291 } 1292 } 1293 1294 /** Implementation that matches characters that fall within multiple ranges. */ 1295 private static class RangesMatcher extends CharMatcher { 1296 1297 private final String description; 1298 private final char[] rangeStarts; 1299 private final char[] rangeEnds; 1300 1301 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1302 this.description = description; 1303 this.rangeStarts = rangeStarts; 1304 this.rangeEnds = rangeEnds; 1305 checkArgument(rangeStarts.length == rangeEnds.length); 1306 for (int i = 0; i < rangeStarts.length; i++) { 1307 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1308 if (i + 1 < rangeStarts.length) { 1309 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1310 } 1311 } 1312 } 1313 1314 @Override 1315 public boolean matches(char c) { 1316 int index = Arrays.binarySearch(rangeStarts, c); 1317 if (index >= 0) { 1318 return true; 1319 } else { 1320 index = ~index - 1; 1321 return index >= 0 && c <= rangeEnds[index]; 1322 } 1323 } 1324 1325 @Override 1326 public String toString() { 1327 return description; 1328 } 1329 } 1330 1331 /** Implementation of {@link #digit()}. */ 1332 private static final class Digit extends RangesMatcher { 1333 // Plug the following UnicodeSet pattern into 1334 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1335 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1336 // and get the zeroes from there. 1337 1338 // Must be in ascending order. 1339 private static final String ZEROES = 1340 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1341 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1342 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1343 1344 private static char[] zeroes() { 1345 return ZEROES.toCharArray(); 1346 } 1347 1348 private static char[] nines() { 1349 char[] nines = new char[ZEROES.length()]; 1350 for (int i = 0; i < ZEROES.length(); i++) { 1351 nines[i] = (char) (ZEROES.charAt(i) + 9); 1352 } 1353 return nines; 1354 } 1355 1356 static final CharMatcher INSTANCE = new Digit(); 1357 1358 private Digit() { 1359 super("CharMatcher.digit()", zeroes(), nines()); 1360 } 1361 } 1362 1363 /** Implementation of {@link #javaDigit()}. */ 1364 private static final class JavaDigit extends CharMatcher { 1365 1366 static final CharMatcher INSTANCE = new JavaDigit(); 1367 1368 @Override 1369 public boolean matches(char c) { 1370 return Character.isDigit(c); 1371 } 1372 1373 @Override 1374 public String toString() { 1375 return "CharMatcher.javaDigit()"; 1376 } 1377 } 1378 1379 /** Implementation of {@link #javaLetter()}. */ 1380 private static final class JavaLetter extends CharMatcher { 1381 1382 static final CharMatcher INSTANCE = new JavaLetter(); 1383 1384 @Override 1385 public boolean matches(char c) { 1386 return Character.isLetter(c); 1387 } 1388 1389 @Override 1390 public String toString() { 1391 return "CharMatcher.javaLetter()"; 1392 } 1393 } 1394 1395 /** Implementation of {@link #javaLetterOrDigit()}. */ 1396 private static final class JavaLetterOrDigit extends CharMatcher { 1397 1398 static final CharMatcher INSTANCE = new JavaLetterOrDigit(); 1399 1400 @Override 1401 public boolean matches(char c) { 1402 return Character.isLetterOrDigit(c); 1403 } 1404 1405 @Override 1406 public String toString() { 1407 return "CharMatcher.javaLetterOrDigit()"; 1408 } 1409 } 1410 1411 /** Implementation of {@link #javaUpperCase()}. */ 1412 private static final class JavaUpperCase extends CharMatcher { 1413 1414 static final CharMatcher INSTANCE = new JavaUpperCase(); 1415 1416 @Override 1417 public boolean matches(char c) { 1418 return Character.isUpperCase(c); 1419 } 1420 1421 @Override 1422 public String toString() { 1423 return "CharMatcher.javaUpperCase()"; 1424 } 1425 } 1426 1427 /** Implementation of {@link #javaLowerCase()}. */ 1428 private static final class JavaLowerCase extends CharMatcher { 1429 1430 static final CharMatcher INSTANCE = new JavaLowerCase(); 1431 1432 @Override 1433 public boolean matches(char c) { 1434 return Character.isLowerCase(c); 1435 } 1436 1437 @Override 1438 public String toString() { 1439 return "CharMatcher.javaLowerCase()"; 1440 } 1441 } 1442 1443 /** Implementation of {@link #javaIsoControl()}. */ 1444 private static final class JavaIsoControl extends NamedFastMatcher { 1445 1446 static final CharMatcher INSTANCE = new JavaIsoControl(); 1447 1448 private JavaIsoControl() { 1449 super("CharMatcher.javaIsoControl()"); 1450 } 1451 1452 @Override 1453 public boolean matches(char c) { 1454 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1455 } 1456 } 1457 1458 /** Implementation of {@link #invisible()}. */ 1459 private static final class Invisible extends RangesMatcher { 1460 // Plug the following UnicodeSet pattern into 1461 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1462 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1463 // with the "Abbreviate" option, and get the ranges from there. 1464 private static final String RANGE_STARTS = 1465 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1466 + "\u3000\ud800\ufeff\ufff9"; 1467 private static final String RANGE_ENDS = // inclusive ends 1468 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1469 + "\u3000\uf8ff\ufeff\ufffb"; 1470 1471 static final CharMatcher INSTANCE = new Invisible(); 1472 1473 private Invisible() { 1474 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1475 } 1476 } 1477 1478 /** Implementation of {@link #singleWidth()}. */ 1479 private static final class SingleWidth extends RangesMatcher { 1480 1481 static final CharMatcher INSTANCE = new SingleWidth(); 1482 1483 private SingleWidth() { 1484 super( 1485 "CharMatcher.singleWidth()", 1486 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1487 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1488 } 1489 } 1490 1491 // Non-static factory implementation classes 1492 1493 /** Implementation of {@link #negate()}. */ 1494 private static class Negated extends CharMatcher { 1495 1496 final CharMatcher original; 1497 1498 Negated(CharMatcher original) { 1499 this.original = checkNotNull(original); 1500 } 1501 1502 @Override 1503 public boolean matches(char c) { 1504 return !original.matches(c); 1505 } 1506 1507 @Override 1508 public boolean matchesAllOf(CharSequence sequence) { 1509 return original.matchesNoneOf(sequence); 1510 } 1511 1512 @Override 1513 public boolean matchesNoneOf(CharSequence sequence) { 1514 return original.matchesAllOf(sequence); 1515 } 1516 1517 @Override 1518 public int countIn(CharSequence sequence) { 1519 return sequence.length() - original.countIn(sequence); 1520 } 1521 1522 @GwtIncompatible // used only from other GwtIncompatible code 1523 @Override 1524 void setBits(BitSet table) { 1525 BitSet tmp = new BitSet(); 1526 original.setBits(tmp); 1527 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1528 table.or(tmp); 1529 } 1530 1531 @Override 1532 public CharMatcher negate() { 1533 return original; 1534 } 1535 1536 @Override 1537 public String toString() { 1538 return original + ".negate()"; 1539 } 1540 } 1541 1542 /** Implementation of {@link #and(CharMatcher)}. */ 1543 private static final class And extends CharMatcher { 1544 1545 final CharMatcher first; 1546 final CharMatcher second; 1547 1548 And(CharMatcher a, CharMatcher b) { 1549 first = checkNotNull(a); 1550 second = checkNotNull(b); 1551 } 1552 1553 @Override 1554 public boolean matches(char c) { 1555 return first.matches(c) && second.matches(c); 1556 } 1557 1558 @GwtIncompatible // used only from other GwtIncompatible code 1559 @Override 1560 void setBits(BitSet table) { 1561 BitSet tmp1 = new BitSet(); 1562 first.setBits(tmp1); 1563 BitSet tmp2 = new BitSet(); 1564 second.setBits(tmp2); 1565 tmp1.and(tmp2); 1566 table.or(tmp1); 1567 } 1568 1569 @Override 1570 public String toString() { 1571 return "CharMatcher.and(" + first + ", " + second + ")"; 1572 } 1573 } 1574 1575 /** Implementation of {@link #or(CharMatcher)}. */ 1576 private static final class Or extends CharMatcher { 1577 1578 final CharMatcher first; 1579 final CharMatcher second; 1580 1581 Or(CharMatcher a, CharMatcher b) { 1582 first = checkNotNull(a); 1583 second = checkNotNull(b); 1584 } 1585 1586 @GwtIncompatible // used only from other GwtIncompatible code 1587 @Override 1588 void setBits(BitSet table) { 1589 first.setBits(table); 1590 second.setBits(table); 1591 } 1592 1593 @Override 1594 public boolean matches(char c) { 1595 return first.matches(c) || second.matches(c); 1596 } 1597 1598 @Override 1599 public String toString() { 1600 return "CharMatcher.or(" + first + ", " + second + ")"; 1601 } 1602 } 1603 1604 // Static factory implementations 1605 1606 /** Implementation of {@link #is(char)}. */ 1607 private static final class Is extends FastMatcher { 1608 1609 private final char match; 1610 1611 Is(char match) { 1612 this.match = match; 1613 } 1614 1615 @Override 1616 public boolean matches(char c) { 1617 return c == match; 1618 } 1619 1620 @Override 1621 public String replaceFrom(CharSequence sequence, char replacement) { 1622 return sequence.toString().replace(match, replacement); 1623 } 1624 1625 @Override 1626 public CharMatcher and(CharMatcher other) { 1627 return other.matches(match) ? this : none(); 1628 } 1629 1630 @Override 1631 public CharMatcher or(CharMatcher other) { 1632 return other.matches(match) ? other : super.or(other); 1633 } 1634 1635 @Override 1636 public CharMatcher negate() { 1637 return isNot(match); 1638 } 1639 1640 @GwtIncompatible // used only from other GwtIncompatible code 1641 @Override 1642 void setBits(BitSet table) { 1643 table.set(match); 1644 } 1645 1646 @Override 1647 public String toString() { 1648 return "CharMatcher.is('" + showCharacter(match) + "')"; 1649 } 1650 } 1651 1652 /** Implementation of {@link #isNot(char)}. */ 1653 private static final class IsNot extends FastMatcher { 1654 1655 private final char match; 1656 1657 IsNot(char match) { 1658 this.match = match; 1659 } 1660 1661 @Override 1662 public boolean matches(char c) { 1663 return c != match; 1664 } 1665 1666 @Override 1667 public CharMatcher and(CharMatcher other) { 1668 return other.matches(match) ? super.and(other) : other; 1669 } 1670 1671 @Override 1672 public CharMatcher or(CharMatcher other) { 1673 return other.matches(match) ? any() : this; 1674 } 1675 1676 @GwtIncompatible // used only from other GwtIncompatible code 1677 @Override 1678 void setBits(BitSet table) { 1679 table.set(0, match); 1680 table.set(match + 1, Character.MAX_VALUE + 1); 1681 } 1682 1683 @Override 1684 public CharMatcher negate() { 1685 return is(match); 1686 } 1687 1688 @Override 1689 public String toString() { 1690 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1691 } 1692 } 1693 1694 private static CharMatcher.IsEither isEither(char c1, char c2) { 1695 return new CharMatcher.IsEither(c1, c2); 1696 } 1697 1698 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1699 private static final class IsEither extends FastMatcher { 1700 1701 private final char match1; 1702 private final char match2; 1703 1704 IsEither(char match1, char match2) { 1705 this.match1 = match1; 1706 this.match2 = match2; 1707 } 1708 1709 @Override 1710 public boolean matches(char c) { 1711 return c == match1 || c == match2; 1712 } 1713 1714 @GwtIncompatible // used only from other GwtIncompatible code 1715 @Override 1716 void setBits(BitSet table) { 1717 table.set(match1); 1718 table.set(match2); 1719 } 1720 1721 @Override 1722 public String toString() { 1723 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1724 } 1725 } 1726 1727 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */ 1728 private static final class AnyOf extends CharMatcher { 1729 1730 private final char[] chars; 1731 1732 public AnyOf(CharSequence chars) { 1733 this.chars = chars.toString().toCharArray(); 1734 Arrays.sort(this.chars); 1735 } 1736 1737 @Override 1738 public boolean matches(char c) { 1739 return Arrays.binarySearch(chars, c) >= 0; 1740 } 1741 1742 @Override 1743 @GwtIncompatible // used only from other GwtIncompatible code 1744 void setBits(BitSet table) { 1745 for (char c : chars) { 1746 table.set(c); 1747 } 1748 } 1749 1750 @Override 1751 public String toString() { 1752 StringBuilder description = new StringBuilder("CharMatcher.anyOf(\""); 1753 for (char c : chars) { 1754 description.append(showCharacter(c)); 1755 } 1756 description.append("\")"); 1757 return description.toString(); 1758 } 1759 } 1760 1761 /** Implementation of {@link #inRange(char, char)}. */ 1762 private static final class InRange extends FastMatcher { 1763 1764 private final char startInclusive; 1765 private final char endInclusive; 1766 1767 InRange(char startInclusive, char endInclusive) { 1768 checkArgument(endInclusive >= startInclusive); 1769 this.startInclusive = startInclusive; 1770 this.endInclusive = endInclusive; 1771 } 1772 1773 @Override 1774 public boolean matches(char c) { 1775 return startInclusive <= c && c <= endInclusive; 1776 } 1777 1778 @GwtIncompatible // used only from other GwtIncompatible code 1779 @Override 1780 void setBits(BitSet table) { 1781 table.set(startInclusive, endInclusive + 1); 1782 } 1783 1784 @Override 1785 public String toString() { 1786 return "CharMatcher.inRange('" 1787 + showCharacter(startInclusive) 1788 + "', '" 1789 + showCharacter(endInclusive) 1790 + "')"; 1791 } 1792 } 1793 1794 /** Implementation of {@link #forPredicate(Predicate)}. */ 1795 private static final class ForPredicate extends CharMatcher { 1796 1797 private final Predicate<? super Character> predicate; 1798 1799 ForPredicate(Predicate<? super Character> predicate) { 1800 this.predicate = checkNotNull(predicate); 1801 } 1802 1803 @Override 1804 public boolean matches(char c) { 1805 return predicate.apply(c); 1806 } 1807 1808 @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily 1809 @Override 1810 public boolean apply(Character character) { 1811 return predicate.apply(checkNotNull(character)); 1812 } 1813 1814 @Override 1815 public String toString() { 1816 return "CharMatcher.forPredicate(" + predicate + ")"; 1817 } 1818 } 1819}