001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import com.google.errorprone.annotations.InlineMe; 025import com.google.errorprone.annotations.InlineMeValidationDisabled; 026import java.util.Arrays; 027import java.util.BitSet; 028 029/** 030 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 031 * for any {@link Object}. Also offers basic text processing methods based on this function. 032 * Implementations are strongly encouraged to be side-effect-free and immutable. 033 * 034 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 035 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 036 * 037 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 038 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. 
It does not understand 039 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 040 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 041 * assigned characters, including important CJK characters and emoji. 042 * 043 * <p>Supplementary characters are <a 044 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 045 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 046 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 047 * 048 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 049 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 050 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 051 * 052 * <p>Example usages: 053 * 054 * <pre> 055 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 056 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 057 * 058 * <p>See the Guava User Guide article on <a 059 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 060 * </a>. 
061 * 062 * @author Kevin Bourrillion 063 * @since 1.0 064 */ 065@GwtCompatible(emulated = true) 066public abstract class CharMatcher implements Predicate<Character> { 067 /* 068 * N777777777NO 069 * N7777777777777N 070 * M777777777777777N 071 * $N877777777D77777M 072 * N M77777777ONND777M 073 * MN777777777NN D777 074 * N7ZN777777777NN ~M7778 075 * N777777777777MMNN88777N 076 * N777777777777MNZZZ7777O 077 * DZN7777O77777777777777 078 * N7OONND7777777D77777N 079 * 8$M++++?N???$77777$ 080 * M7++++N+M77777777N 081 * N77O777777777777$ M 082 * DNNM$$$$777777N D 083 * N$N:=N$777N7777M NZ 084 * 77Z::::N777777777 ODZZZ 085 * 77N::::::N77777777M NNZZZ$ 086 * $777:::::::77777777MN ZM8ZZZZZ 087 * 777M::::::Z7777777Z77 N++ZZZZNN 088 * 7777M:::::M7777777$777M $++IZZZZM 089 * M777$:::::N777777$M7777M +++++ZZZDN 090 * NN$::::::7777$$M777777N N+++ZZZZNZ 091 * N::::::N:7$O:77777777 N++++ZZZZN 092 * M::::::::::::N77777777+ +?+++++ZZZM 093 * 8::::::::::::D77777777M O+++++ZZ 094 * ::::::::::::M777777777N O+?D 095 * M:::::::::::M77777777778 77= 096 * D=::::::::::N7777777777N 777 097 * INN===::::::=77777777777N I777N 098 * ?777N========N7777777777787M N7777 099 * 77777$D======N77777777777N777N? N777777 100 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 101 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 102 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 103 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 104 * 7 :N MNN$$$$M$ $$$777$8 8D8I 105 * NMM.:7O 777777778 106 * 7777777MN 107 * M NO .7: 108 * M : M 109 * 8 110 */ 111 112 // Constant matcher factory methods 113 114 /** 115 * Matches any character. 116 * 117 * @since 19.0 (since 1.0 as constant {@code ANY}) 118 */ 119 public static CharMatcher any() { 120 return Any.INSTANCE; 121 } 122 123 /** 124 * Matches no characters. 
125 * 126 * @since 19.0 (since 1.0 as constant {@code NONE}) 127 */ 128 public static CharMatcher none() { 129 return None.INSTANCE; 130 } 131 132 /** 133 * Determines whether a character is whitespace according to the latest Unicode standard, as 134 * illustrated <a 135 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 136 * This is not the same definition used by other Java APIs. (See a <a 137 * href="https://docs.google.com/spreadsheets/d/1kq4ECwPjHX9B8QUCTPclgsDCXYaj7T-FlT4tB5q3ahk/edit">comparison 138 * of several definitions of "whitespace"</a>.) 139 * 140 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 141 * 142 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 143 * date. 144 * 145 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 146 */ 147 public static CharMatcher whitespace() { 148 return Whitespace.INSTANCE; 149 } 150 151 /** 152 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 153 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 154 * discussion of that term. 155 * 156 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 157 */ 158 public static CharMatcher breakingWhitespace() { 159 return BreakingWhitespace.INSTANCE; 160 } 161 162 /** 163 * Determines whether a character is ASCII, meaning that its code point is less than 128. 164 * 165 * @since 19.0 (since 1.0 as constant {@code ASCII}) 166 */ 167 public static CharMatcher ascii() { 168 return Ascii.INSTANCE; 169 } 170 171 /** 172 * Determines whether a character is a BMP digit according to <a 173 * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 174 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 
175 * 176 * @deprecated Many digits are supplementary characters; see the class documentation. 177 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 178 */ 179 @Deprecated 180 public static CharMatcher digit() { 181 return Digit.INSTANCE; 182 } 183 184 /** 185 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 186 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 187 * '9')}. 188 * 189 * @deprecated Many digits are supplementary characters; see the class documentation. 190 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 191 */ 192 @Deprecated 193 public static CharMatcher javaDigit() { 194 return JavaDigit.INSTANCE; 195 } 196 197 /** 198 * Determines whether a character is a BMP letter according to {@linkplain 199 * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin 200 * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 201 * 202 * @deprecated Most letters are supplementary characters; see the class documentation. 203 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 204 */ 205 @Deprecated 206 public static CharMatcher javaLetter() { 207 return JavaLetter.INSTANCE; 208 } 209 210 /** 211 * Determines whether a character is a BMP letter or digit according to {@linkplain 212 * Character#isLetterOrDigit(char) Java's definition}. 213 * 214 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 215 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 216 */ 217 @Deprecated 218 public static CharMatcher javaLetterOrDigit() { 219 return JavaLetterOrDigit.INSTANCE; 220 } 221 222 /** 223 * Determines whether a BMP character is upper case according to {@linkplain 224 * Character#isUpperCase(char) Java's definition}. 225 * 226 * @deprecated Some uppercase characters are supplementary characters; see the class 227 * documentation. 
228 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 229 */ 230 @Deprecated 231 public static CharMatcher javaUpperCase() { 232 return JavaUpperCase.INSTANCE; 233 } 234 235 /** 236 * Determines whether a BMP character is lower case according to {@linkplain 237 * Character#isLowerCase(char) Java's definition}. 238 * 239 * @deprecated Some lowercase characters are supplementary characters; see the class 240 * documentation. 241 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 242 */ 243 @Deprecated 244 public static CharMatcher javaLowerCase() { 245 return JavaLowerCase.INSTANCE; 246 } 247 248 /** 249 * Determines whether a character is an ISO control character as specified by {@link 250 * Character#isISOControl(char)}. 251 * 252 * <p>All ISO control codes are on the BMP and thus supported by this API. 253 * 254 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 255 */ 256 public static CharMatcher javaIsoControl() { 257 return JavaIsoControl.INSTANCE; 258 } 259 260 /** 261 * Determines whether a character is invisible; that is, if its Unicode category is any of 262 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 263 * PRIVATE_USE according to ICU4J. 264 * 265 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 266 * 267 * @deprecated Most invisible characters are supplementary characters; see the class 268 * documentation. 269 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 270 */ 271 @Deprecated 272 public static CharMatcher invisible() { 273 return Invisible.INSTANCE; 274 } 275 276 /** 277 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 278 * errs on the side of returning {@code false} (that is, it tends to assume a character is 279 * double-width). 280 * 281 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 282 * date. 
283 * 284 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 285 * 286 * @deprecated Many such characters are supplementary characters; see the class documentation. 287 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 288 */ 289 @Deprecated 290 public static CharMatcher singleWidth() { 291 return SingleWidth.INSTANCE; 292 } 293 294 // Static factories 295 296 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 297 public static CharMatcher is(final char match) { 298 return new Is(match); 299 } 300 301 /** 302 * Returns a {@code char} matcher that matches any character except the BMP character specified. 303 * 304 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 305 */ 306 public static CharMatcher isNot(final char match) { 307 return new IsNot(match); 308 } 309 310 /** 311 * Returns a {@code char} matcher that matches any BMP character present in the given character 312 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 313 */ 314 public static CharMatcher anyOf(final CharSequence sequence) { 315 switch (sequence.length()) { 316 case 0: 317 return none(); 318 case 1: 319 return is(sequence.charAt(0)); 320 case 2: 321 return isEither(sequence.charAt(0), sequence.charAt(1)); 322 default: 323 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 324 // matcher? 325 return new AnyOf(sequence); 326 } 327 } 328 329 /** 330 * Returns a {@code char} matcher that matches any BMP character not present in the given 331 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 332 */ 333 public static CharMatcher noneOf(CharSequence sequence) { 334 return anyOf(sequence).negate(); 335 } 336 337 /** 338 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 339 * are inclusive). 
For example, to match any lowercase letter of the English alphabet, use {@code 340 * CharMatcher.inRange('a', 'z')}. 341 * 342 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 343 */ 344 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 345 return new InRange(startInclusive, endInclusive); 346 } 347 348 /** 349 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 350 * which operates on primitive {@code char} instances instead. 351 */ 352 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 353 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 354 } 355 356 // Constructors 357 358 /** 359 * Constructor for use by subclasses. When subclassing, you may want to override {@code 360 * toString()} to provide a useful description. 361 */ 362 protected CharMatcher() {} 363 364 // Abstract methods 365 366 /** Determines a true or false value for the given character. */ 367 public abstract boolean matches(char c); 368 369 // Non-static factories 370 371 /** Returns a matcher that matches any character not matched by this matcher. */ 372 // This is not an override in java7, where Guava's Predicate does not extend the JDK's Predicate. 373 @SuppressWarnings("MissingOverride") 374 public CharMatcher negate() { 375 return new Negated(this); 376 } 377 378 /** 379 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 380 */ 381 public CharMatcher and(CharMatcher other) { 382 return new And(this, other); 383 } 384 385 /** 386 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 
387 */ 388 public CharMatcher or(CharMatcher other) { 389 return new Or(this, other); 390 } 391 392 /** 393 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 394 * query than the original; your mileage may vary. Precomputation takes time and requires more 395 * memory, so it is only likely to be worthwhile if the precomputed matcher is queried very often. 396 * 397 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 398 * precomputed matcher is faster, but it certainly would consume more memory (which doesn't seem 399 * like a worthwhile tradeoff in a browser). 400 */ 401 public CharMatcher precomputed() { 402 return Platform.precomputeCharMatcher(this); 403 } 404 405 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 406 407 /** 408 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 409 * on {@link Platform} so that we can have different behavior in GWT. 410 * 411 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 412 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 413 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 414 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 415 * matcher which is faster to query than the original. 416 */ 417 @GwtIncompatible // SmallCharMatcher 418 CharMatcher precomputedInternal() { 419 final BitSet table = new BitSet(); 420 setBits(table); 421 int totalCharacters = table.cardinality(); 422 if (totalCharacters * 2 <= DISTINCT_CHARS) { 423 return precomputedPositive(totalCharacters, table, toString()); 424 } else { 425 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 
426 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 427 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 428 String suffix = ".negate()"; 429 final String description = toString(); 430 String negatedDescription = 431 description.endsWith(suffix) 432 ? description.substring(0, description.length() - suffix.length()) 433 : description + suffix; 434 return new NegatedFastMatcher( 435 precomputedPositive(negatedCharacters, table, negatedDescription)) { 436 @Override 437 public String toString() { 438 return description; 439 } 440 }; 441 } 442 } 443 444 /** 445 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 446 */ 447 @GwtIncompatible // SmallCharMatcher 448 private static CharMatcher precomputedPositive( 449 int totalCharacters, BitSet table, String description) { 450 switch (totalCharacters) { 451 case 0: 452 return none(); 453 case 1: 454 return is((char) table.nextSetBit(0)); 455 case 2: 456 char c1 = (char) table.nextSetBit(0); 457 char c2 = (char) table.nextSetBit(c1 + 1); 458 return isEither(c1, c2); 459 default: 460 return isSmall(totalCharacters, table.length()) 461 ? SmallCharMatcher.from(table, description) 462 : new BitSetMatcher(table, description); 463 } 464 } 465 466 @GwtIncompatible // SmallCharMatcher 467 private static boolean isSmall(int totalCharacters, int tableLength) { 468 return totalCharacters <= SmallCharMatcher.MAX_SIZE 469 && tableLength > (totalCharacters * 4 * Character.SIZE); 470 // err on the side of BitSetMatcher 471 } 472 473 /** Sets bits in {@code table} matched by this matcher. */ 474 @GwtIncompatible // used only from other GwtIncompatible code 475 void setBits(BitSet table) { 476 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 477 if (matches((char) c)) { 478 table.set(c); 479 } 480 } 481 } 482 483 // Text processing routines 484 485 /** 486 * Returns {@code true} if a character sequence contains at least one matching BMP character. 
487 * Equivalent to {@code !matchesNoneOf(sequence)}. 488 * 489 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 490 * character, until this returns {@code true} or the end is reached. 491 * 492 * @param sequence the character sequence to examine, possibly empty 493 * @return {@code true} if this matcher matches at least one character in the sequence 494 * @since 8.0 495 */ 496 public boolean matchesAnyOf(CharSequence sequence) { 497 return !matchesNoneOf(sequence); 498 } 499 500 /** 501 * Returns {@code true} if a character sequence contains only matching BMP characters. 502 * 503 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 504 * character, until this returns {@code false} or the end is reached. 505 * 506 * @param sequence the character sequence to examine, possibly empty 507 * @return {@code true} if this matcher matches every character in the sequence, including when 508 * the sequence is empty 509 */ 510 public boolean matchesAllOf(CharSequence sequence) { 511 for (int i = sequence.length() - 1; i >= 0; i--) { 512 if (!matches(sequence.charAt(i))) { 513 return false; 514 } 515 } 516 return true; 517 } 518 519 /** 520 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 521 * {@code !matchesAnyOf(sequence)}. 522 * 523 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 524 * character, until this returns {@code true} or the end is reached. 
525 * 526 * @param sequence the character sequence to examine, possibly empty 527 * @return {@code true} if this matcher matches no characters in the sequence, including when the 528 * sequence is empty 529 */ 530 public boolean matchesNoneOf(CharSequence sequence) { 531 return indexIn(sequence) == -1; 532 } 533 534 /** 535 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 536 * no matching character is present. 537 * 538 * <p>The default implementation iterates over the sequence in forward order calling {@link 539 * #matches} for each character. 540 * 541 * @param sequence the character sequence to examine from the beginning 542 * @return an index, or {@code -1} if no character matches 543 */ 544 public int indexIn(CharSequence sequence) { 545 return indexIn(sequence, 0); 546 } 547 548 /** 549 * Returns the index of the first matching BMP character in a character sequence, starting from a 550 * given position, or {@code -1} if no character matches after that position. 551 * 552 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 553 * start}, calling {@link #matches} for each character. 
554 * 555 * @param sequence the character sequence to examine 556 * @param start the first index to examine; must be nonnegative and no greater than {@code 557 * sequence.length()} 558 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 559 * or {@code -1} if no character matches 560 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 561 * sequence.length()} 562 */ 563 public int indexIn(CharSequence sequence, int start) { 564 int length = sequence.length(); 565 checkPositionIndex(start, length); 566 for (int i = start; i < length; i++) { 567 if (matches(sequence.charAt(i))) { 568 return i; 569 } 570 } 571 return -1; 572 } 573 574 /** 575 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 576 * no matching character is present. 577 * 578 * <p>The default implementation iterates over the sequence in reverse order calling {@link 579 * #matches} for each character. 580 * 581 * @param sequence the character sequence to examine from the end 582 * @return an index, or {@code -1} if no character matches 583 */ 584 public int lastIndexIn(CharSequence sequence) { 585 for (int i = sequence.length() - 1; i >= 0; i--) { 586 if (matches(sequence.charAt(i))) { 587 return i; 588 } 589 } 590 return -1; 591 } 592 593 /** 594 * Returns the number of matching {@code char}s found in a character sequence. 595 * 596 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 597 */ 598 public int countIn(CharSequence sequence) { 599 int count = 0; 600 for (int i = 0; i < sequence.length(); i++) { 601 if (matches(sequence.charAt(i))) { 602 count++; 603 } 604 } 605 return count; 606 } 607 608 /** 609 * Returns a string containing all non-matching characters of a character sequence, in order. For 610 * example: 611 * 612 * <pre>{@code 613 * CharMatcher.is('a').removeFrom("bazaar") 614 * }</pre> 615 * 616 * ... returns {@code "bzr"}. 
617 */ 618 public String removeFrom(CharSequence sequence) { 619 String string = sequence.toString(); 620 int pos = indexIn(string); 621 if (pos == -1) { 622 return string; 623 } 624 625 char[] chars = string.toCharArray(); 626 int spread = 1; 627 628 // This unusual loop comes from extensive benchmarking 629 OUT: 630 while (true) { 631 pos++; 632 while (true) { 633 if (pos == chars.length) { 634 break OUT; 635 } 636 if (matches(chars[pos])) { 637 break; 638 } 639 chars[pos - spread] = chars[pos]; 640 pos++; 641 } 642 spread++; 643 } 644 return new String(chars, 0, pos - spread); 645 } 646 647 /** 648 * Returns a string containing all matching BMP characters of a character sequence, in order. For 649 * example: 650 * 651 * <pre>{@code 652 * CharMatcher.is('a').retainFrom("bazaar") 653 * }</pre> 654 * 655 * ... returns {@code "aaa"}. 656 */ 657 public String retainFrom(CharSequence sequence) { 658 return negate().removeFrom(sequence); 659 } 660 661 /** 662 * Returns a string copy of the input character sequence, with each matching BMP character 663 * replaced by a given replacement character. For example: 664 * 665 * <pre>{@code 666 * CharMatcher.is('a').replaceFrom("radar", 'o') 667 * }</pre> 668 * 669 * ... returns {@code "rodor"}. 670 * 671 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 672 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 673 * character. 
674 * 675 * @param sequence the character sequence to replace matching characters in 676 * @param replacement the character to append to the result string in place of each matching 677 * character in {@code sequence} 678 * @return the new string 679 */ 680 public String replaceFrom(CharSequence sequence, char replacement) { 681 String string = sequence.toString(); 682 int pos = indexIn(string); 683 if (pos == -1) { 684 return string; 685 } 686 char[] chars = string.toCharArray(); 687 chars[pos] = replacement; 688 for (int i = pos + 1; i < chars.length; i++) { 689 if (matches(chars[i])) { 690 chars[i] = replacement; 691 } 692 } 693 return new String(chars); 694 } 695 696 /** 697 * Returns a string copy of the input character sequence, with each matching BMP character 698 * replaced by a given replacement sequence. For example: 699 * 700 * <pre>{@code 701 * CharMatcher.is('a').replaceFrom("yaha", "oo") 702 * }</pre> 703 * 704 * ... returns {@code "yoohoo"}. 705 * 706 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 707 * off calling {@link #replaceFrom(CharSequence, char)} directly. 
708 * 709 * @param sequence the character sequence to replace matching characters in 710 * @param replacement the characters to append to the result string in place of each matching 711 * character in {@code sequence} 712 * @return the new string 713 */ 714 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 715 int replacementLen = replacement.length(); 716 if (replacementLen == 0) { 717 return removeFrom(sequence); 718 } 719 if (replacementLen == 1) { 720 return replaceFrom(sequence, replacement.charAt(0)); 721 } 722 723 String string = sequence.toString(); 724 int pos = indexIn(string); 725 if (pos == -1) { 726 return string; 727 } 728 729 int len = string.length(); 730 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 731 732 int oldpos = 0; 733 do { 734 buf.append(string, oldpos, pos); 735 buf.append(replacement); 736 oldpos = pos + 1; 737 pos = indexIn(string, oldpos); 738 } while (pos != -1); 739 740 buf.append(string, oldpos, len); 741 return buf.toString(); 742 } 743 744 /** 745 * Returns a substring of the input character sequence that omits all matching BMP characters from 746 * the beginning and from the end of the string. For example: 747 * 748 * <pre>{@code 749 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 750 * }</pre> 751 * 752 * ... returns {@code "cat"}. 753 * 754 * <p>Note that: 755 * 756 * <pre>{@code 757 * CharMatcher.inRange('\0', ' ').trimFrom(str) 758 * }</pre> 759 * 760 * ... is equivalent to {@link String#trim()}. 
761 */ 762 public String trimFrom(CharSequence sequence) { 763 int len = sequence.length(); 764 int first; 765 int last; 766 767 for (first = 0; first < len; first++) { 768 if (!matches(sequence.charAt(first))) { 769 break; 770 } 771 } 772 for (last = len - 1; last > first; last--) { 773 if (!matches(sequence.charAt(last))) { 774 break; 775 } 776 } 777 778 return sequence.subSequence(first, last + 1).toString(); 779 } 780 781 /** 782 * Returns a substring of the input character sequence that omits all matching BMP characters from 783 * the beginning of the string. For example: 784 * 785 * <pre>{@code 786 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 787 * }</pre> 788 * 789 * ... returns {@code "catbab"}. 790 */ 791 public String trimLeadingFrom(CharSequence sequence) { 792 int len = sequence.length(); 793 for (int first = 0; first < len; first++) { 794 if (!matches(sequence.charAt(first))) { 795 return sequence.subSequence(first, len).toString(); 796 } 797 } 798 return ""; 799 } 800 801 /** 802 * Returns a substring of the input character sequence that omits all matching BMP characters from 803 * the end of the string. For example: 804 * 805 * <pre>{@code 806 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 807 * }</pre> 808 * 809 * ... returns {@code "abacat"}. 810 */ 811 public String trimTrailingFrom(CharSequence sequence) { 812 int len = sequence.length(); 813 for (int last = len - 1; last >= 0; last--) { 814 if (!matches(sequence.charAt(last))) { 815 return sequence.subSequence(0, last + 1).toString(); 816 } 817 } 818 return ""; 819 } 820 821 /** 822 * Returns a string copy of the input character sequence, with each group of consecutive matching 823 * BMP characters replaced by a single replacement character. For example: 824 * 825 * <pre>{@code 826 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 827 * }</pre> 828 * 829 * ... returns {@code "b-p-r"}. 
830 * 831 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 832 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 833 * character. 834 * 835 * @param sequence the character sequence to replace matching groups of characters in 836 * @param replacement the character to append to the result string in place of each group of 837 * matching characters in {@code sequence} 838 * @return the new string 839 */ 840 public String collapseFrom(CharSequence sequence, char replacement) { 841 // This implementation avoids unnecessary allocation. 842 int len = sequence.length(); 843 for (int i = 0; i < len; i++) { 844 char c = sequence.charAt(i); 845 if (matches(c)) { 846 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 847 // a no-op replacement 848 i++; 849 } else { 850 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 851 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 852 } 853 } 854 } 855 // no replacement needed 856 return sequence.toString(); 857 } 858 859 /** 860 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 861 * groups of matching BMP characters at the start or end of the sequence are removed without 862 * replacement. 863 */ 864 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 865 // This implementation avoids unnecessary allocation. 866 int len = sequence.length(); 867 int first = 0; 868 int last = len - 1; 869 870 while (first < len && matches(sequence.charAt(first))) { 871 first++; 872 } 873 874 while (last > first && matches(sequence.charAt(last))) { 875 last--; 876 } 877 878 return (first == 0 && last == len - 1) 879 ? 
collapseFrom(sequence, replacement) 880 : finishCollapseFrom( 881 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 882 } 883 884 private String finishCollapseFrom( 885 CharSequence sequence, 886 int start, 887 int end, 888 char replacement, 889 StringBuilder builder, 890 boolean inMatchingGroup) { 891 for (int i = start; i < end; i++) { 892 char c = sequence.charAt(i); 893 if (matches(c)) { 894 if (!inMatchingGroup) { 895 builder.append(replacement); 896 inMatchingGroup = true; 897 } 898 } else { 899 builder.append(c); 900 inMatchingGroup = false; 901 } 902 } 903 return builder.toString(); 904 } 905 906 /** 907 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 908 * instead. 909 */ 910 @InlineMe(replacement = "this.matches(character)") 911 @Deprecated 912 @Override 913 /* 914 * We can't compatibly make this `final` now (even after devising a way for `ForPredicate`, which 915 * currently overrides it, to keep the null check that it inserts). 916 */ 917 @InlineMeValidationDisabled( 918 "While apply() is not final, the inlining is still safe because all known overrides of" 919 + " apply() call matches().") 920 public boolean apply(Character character) { 921 return matches(character); 922 } 923 924 /** 925 * Returns a string representation of this {@code CharMatcher}, such as {@code 926 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 927 */ 928 @Override 929 public String toString() { 930 return super.toString(); 931 } 932 933 /** 934 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 935 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 
936 */ 937 private static String showCharacter(char c) { 938 String hex = "0123456789ABCDEF"; 939 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 940 for (int i = 0; i < 4; i++) { 941 tmp[5 - i] = hex.charAt(c & 0xF); 942 c = (char) (c >> 4); 943 } 944 return String.copyValueOf(tmp); 945 } 946 947 // Fast matchers 948 949 /** A matcher for which precomputation will not yield any significant benefit. */ 950 abstract static class FastMatcher extends CharMatcher { 951 952 @Override 953 public final CharMatcher precomputed() { 954 return this; 955 } 956 957 @Override 958 public CharMatcher negate() { 959 return new NegatedFastMatcher(this); 960 } 961 } 962 963 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 964 abstract static class NamedFastMatcher extends FastMatcher { 965 966 private final String description; 967 968 NamedFastMatcher(String description) { 969 this.description = checkNotNull(description); 970 } 971 972 @Override 973 public final String toString() { 974 return description; 975 } 976 } 977 978 /** Negation of a {@link FastMatcher}. */ 979 private static class NegatedFastMatcher extends Negated { 980 981 NegatedFastMatcher(CharMatcher original) { 982 super(original); 983 } 984 985 @Override 986 public final CharMatcher precomputed() { 987 return this; 988 } 989 } 990 991 /** Fast matcher using a {@link BitSet} table of matching characters. */ 992 @GwtIncompatible // used only from other GwtIncompatible code 993 private static final class BitSetMatcher extends NamedFastMatcher { 994 995 private final BitSet table; 996 997 private BitSetMatcher(BitSet table, String description) { 998 super(description); 999 if (table.length() + Long.SIZE < table.size()) { 1000 table = (BitSet) table.clone(); 1001 // If only we could actually call BitSet.trimToSize() ourselves... 
1002 } 1003 this.table = table; 1004 } 1005 1006 @Override 1007 public boolean matches(char c) { 1008 return table.get(c); 1009 } 1010 1011 @Override 1012 void setBits(BitSet bitSet) { 1013 bitSet.or(table); 1014 } 1015 } 1016 1017 // Static constant implementation classes 1018 1019 /** Implementation of {@link #any()}. */ 1020 private static final class Any extends NamedFastMatcher { 1021 1022 static final CharMatcher INSTANCE = new Any(); 1023 1024 private Any() { 1025 super("CharMatcher.any()"); 1026 } 1027 1028 @Override 1029 public boolean matches(char c) { 1030 return true; 1031 } 1032 1033 @Override 1034 public int indexIn(CharSequence sequence) { 1035 return (sequence.length() == 0) ? -1 : 0; 1036 } 1037 1038 @Override 1039 public int indexIn(CharSequence sequence, int start) { 1040 int length = sequence.length(); 1041 checkPositionIndex(start, length); 1042 return (start == length) ? -1 : start; 1043 } 1044 1045 @Override 1046 public int lastIndexIn(CharSequence sequence) { 1047 return sequence.length() - 1; 1048 } 1049 1050 @Override 1051 public boolean matchesAllOf(CharSequence sequence) { 1052 checkNotNull(sequence); 1053 return true; 1054 } 1055 1056 @Override 1057 public boolean matchesNoneOf(CharSequence sequence) { 1058 return sequence.length() == 0; 1059 } 1060 1061 @Override 1062 public String removeFrom(CharSequence sequence) { 1063 checkNotNull(sequence); 1064 return ""; 1065 } 1066 1067 @Override 1068 public String replaceFrom(CharSequence sequence, char replacement) { 1069 char[] array = new char[sequence.length()]; 1070 Arrays.fill(array, replacement); 1071 return new String(array); 1072 } 1073 1074 @Override 1075 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1076 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1077 for (int i = 0; i < sequence.length(); i++) { 1078 result.append(replacement); 1079 } 1080 return result.toString(); 1081 } 1082 1083 @Override 1084 public String 
collapseFrom(CharSequence sequence, char replacement) { 1085 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 1086 } 1087 1088 @Override 1089 public String trimFrom(CharSequence sequence) { 1090 checkNotNull(sequence); 1091 return ""; 1092 } 1093 1094 @Override 1095 public int countIn(CharSequence sequence) { 1096 return sequence.length(); 1097 } 1098 1099 @Override 1100 public CharMatcher and(CharMatcher other) { 1101 return checkNotNull(other); 1102 } 1103 1104 @Override 1105 public CharMatcher or(CharMatcher other) { 1106 checkNotNull(other); 1107 return this; 1108 } 1109 1110 @Override 1111 public CharMatcher negate() { 1112 return none(); 1113 } 1114 } 1115 1116 /** Implementation of {@link #none()}. */ 1117 private static final class None extends NamedFastMatcher { 1118 1119 static final CharMatcher INSTANCE = new None(); 1120 1121 private None() { 1122 super("CharMatcher.none()"); 1123 } 1124 1125 @Override 1126 public boolean matches(char c) { 1127 return false; 1128 } 1129 1130 @Override 1131 public int indexIn(CharSequence sequence) { 1132 checkNotNull(sequence); 1133 return -1; 1134 } 1135 1136 @Override 1137 public int indexIn(CharSequence sequence, int start) { 1138 int length = sequence.length(); 1139 checkPositionIndex(start, length); 1140 return -1; 1141 } 1142 1143 @Override 1144 public int lastIndexIn(CharSequence sequence) { 1145 checkNotNull(sequence); 1146 return -1; 1147 } 1148 1149 @Override 1150 public boolean matchesAllOf(CharSequence sequence) { 1151 return sequence.length() == 0; 1152 } 1153 1154 @Override 1155 public boolean matchesNoneOf(CharSequence sequence) { 1156 checkNotNull(sequence); 1157 return true; 1158 } 1159 1160 @Override 1161 public String removeFrom(CharSequence sequence) { 1162 return sequence.toString(); 1163 } 1164 1165 @Override 1166 public String replaceFrom(CharSequence sequence, char replacement) { 1167 return sequence.toString(); 1168 } 1169 1170 @Override 1171 public String 
replaceFrom(CharSequence sequence, CharSequence replacement) { 1172 checkNotNull(replacement); 1173 return sequence.toString(); 1174 } 1175 1176 @Override 1177 public String collapseFrom(CharSequence sequence, char replacement) { 1178 return sequence.toString(); 1179 } 1180 1181 @Override 1182 public String trimFrom(CharSequence sequence) { 1183 return sequence.toString(); 1184 } 1185 1186 @Override 1187 public String trimLeadingFrom(CharSequence sequence) { 1188 return sequence.toString(); 1189 } 1190 1191 @Override 1192 public String trimTrailingFrom(CharSequence sequence) { 1193 return sequence.toString(); 1194 } 1195 1196 @Override 1197 public int countIn(CharSequence sequence) { 1198 checkNotNull(sequence); 1199 return 0; 1200 } 1201 1202 @Override 1203 public CharMatcher and(CharMatcher other) { 1204 checkNotNull(other); 1205 return this; 1206 } 1207 1208 @Override 1209 public CharMatcher or(CharMatcher other) { 1210 return checkNotNull(other); 1211 } 1212 1213 @Override 1214 public CharMatcher negate() { 1215 return any(); 1216 } 1217 } 1218 1219 /** Implementation of {@link #whitespace()}. */ 1220 @VisibleForTesting 1221 static final class Whitespace extends NamedFastMatcher { 1222 1223 // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function 1224 // whose key property is that it maps 25 characters into the 32-slot table without collision. 1225 // Basically this is an opportunistic fast implementation as opposed to "good code". For most 1226 // other use-cases, the reduction in readability isn't worth it. 
1227 static final String TABLE = 1228 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1229 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1230 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1231 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1232 static final int MULTIPLIER = 1682554634; 1233 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1234 1235 static final CharMatcher INSTANCE = new Whitespace(); 1236 1237 Whitespace() { 1238 super("CharMatcher.whitespace()"); 1239 } 1240 1241 @Override 1242 public boolean matches(char c) { 1243 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1244 } 1245 1246 @GwtIncompatible // used only from other GwtIncompatible code 1247 @Override 1248 void setBits(BitSet table) { 1249 for (int i = 0; i < TABLE.length(); i++) { 1250 table.set(TABLE.charAt(i)); 1251 } 1252 } 1253 } 1254 1255 /** Implementation of {@link #breakingWhitespace()}. */ 1256 private static final class BreakingWhitespace extends CharMatcher { 1257 1258 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1259 1260 @Override 1261 public boolean matches(char c) { 1262 switch (c) { 1263 case '\t': 1264 case '\n': 1265 case '\013': 1266 case '\f': 1267 case '\r': 1268 case ' ': 1269 case '\u0085': 1270 case '\u1680': 1271 case '\u2028': 1272 case '\u2029': 1273 case '\u205f': 1274 case '\u3000': 1275 return true; 1276 case '\u2007': 1277 return false; 1278 default: 1279 return c >= '\u2000' && c <= '\u200a'; 1280 } 1281 } 1282 1283 @Override 1284 public String toString() { 1285 return "CharMatcher.breakingWhitespace()"; 1286 } 1287 } 1288 1289 /** Implementation of {@link #ascii()}. 
*/ 1290 private static final class Ascii extends NamedFastMatcher { 1291 1292 static final CharMatcher INSTANCE = new Ascii(); 1293 1294 Ascii() { 1295 super("CharMatcher.ascii()"); 1296 } 1297 1298 @Override 1299 public boolean matches(char c) { 1300 return c <= '\u007f'; 1301 } 1302 } 1303 1304 /** Implementation that matches characters that fall within multiple ranges. */ 1305 private static class RangesMatcher extends CharMatcher { 1306 1307 private final String description; 1308 private final char[] rangeStarts; 1309 private final char[] rangeEnds; 1310 1311 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1312 this.description = description; 1313 this.rangeStarts = rangeStarts; 1314 this.rangeEnds = rangeEnds; 1315 checkArgument(rangeStarts.length == rangeEnds.length); 1316 for (int i = 0; i < rangeStarts.length; i++) { 1317 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1318 if (i + 1 < rangeStarts.length) { 1319 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1320 } 1321 } 1322 } 1323 1324 @Override 1325 public boolean matches(char c) { 1326 int index = Arrays.binarySearch(rangeStarts, c); 1327 if (index >= 0) { 1328 return true; 1329 } else { 1330 index = ~index - 1; 1331 return index >= 0 && c <= rangeEnds[index]; 1332 } 1333 } 1334 1335 @Override 1336 public String toString() { 1337 return description; 1338 } 1339 } 1340 1341 /** Implementation of {@link #digit()}. */ 1342 private static final class Digit extends RangesMatcher { 1343 // Plug the following UnicodeSet pattern into 1344 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1345 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1346 // and get the zeroes from there. 1347 1348 // Must be in ascending order. 
1349 private static final String ZEROES = 1350 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1351 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1352 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1353 1354 private static char[] zeroes() { 1355 return ZEROES.toCharArray(); 1356 } 1357 1358 private static char[] nines() { 1359 char[] nines = new char[ZEROES.length()]; 1360 for (int i = 0; i < ZEROES.length(); i++) { 1361 nines[i] = (char) (ZEROES.charAt(i) + 9); 1362 } 1363 return nines; 1364 } 1365 1366 static final CharMatcher INSTANCE = new Digit(); 1367 1368 private Digit() { 1369 super("CharMatcher.digit()", zeroes(), nines()); 1370 } 1371 } 1372 1373 /** Implementation of {@link #javaDigit()}. */ 1374 private static final class JavaDigit extends CharMatcher { 1375 1376 static final CharMatcher INSTANCE = new JavaDigit(); 1377 1378 @Override 1379 public boolean matches(char c) { 1380 return Character.isDigit(c); 1381 } 1382 1383 @Override 1384 public String toString() { 1385 return "CharMatcher.javaDigit()"; 1386 } 1387 } 1388 1389 /** Implementation of {@link #javaLetter()}. */ 1390 private static final class JavaLetter extends CharMatcher { 1391 1392 static final CharMatcher INSTANCE = new JavaLetter(); 1393 1394 @Override 1395 public boolean matches(char c) { 1396 return Character.isLetter(c); 1397 } 1398 1399 @Override 1400 public String toString() { 1401 return "CharMatcher.javaLetter()"; 1402 } 1403 } 1404 1405 /** Implementation of {@link #javaLetterOrDigit()}. 
*/ 1406 private static final class JavaLetterOrDigit extends CharMatcher { 1407 1408 static final CharMatcher INSTANCE = new JavaLetterOrDigit(); 1409 1410 @Override 1411 public boolean matches(char c) { 1412 return Character.isLetterOrDigit(c); 1413 } 1414 1415 @Override 1416 public String toString() { 1417 return "CharMatcher.javaLetterOrDigit()"; 1418 } 1419 } 1420 1421 /** Implementation of {@link #javaUpperCase()}. */ 1422 private static final class JavaUpperCase extends CharMatcher { 1423 1424 static final CharMatcher INSTANCE = new JavaUpperCase(); 1425 1426 @Override 1427 public boolean matches(char c) { 1428 return Character.isUpperCase(c); 1429 } 1430 1431 @Override 1432 public String toString() { 1433 return "CharMatcher.javaUpperCase()"; 1434 } 1435 } 1436 1437 /** Implementation of {@link #javaLowerCase()}. */ 1438 private static final class JavaLowerCase extends CharMatcher { 1439 1440 static final CharMatcher INSTANCE = new JavaLowerCase(); 1441 1442 @Override 1443 public boolean matches(char c) { 1444 return Character.isLowerCase(c); 1445 } 1446 1447 @Override 1448 public String toString() { 1449 return "CharMatcher.javaLowerCase()"; 1450 } 1451 } 1452 1453 /** Implementation of {@link #javaIsoControl()}. */ 1454 private static final class JavaIsoControl extends NamedFastMatcher { 1455 1456 static final CharMatcher INSTANCE = new JavaIsoControl(); 1457 1458 private JavaIsoControl() { 1459 super("CharMatcher.javaIsoControl()"); 1460 } 1461 1462 @Override 1463 public boolean matches(char c) { 1464 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1465 } 1466 } 1467 1468 /** Implementation of {@link #invisible()}. */ 1469 private static final class Invisible extends RangesMatcher { 1470 // Plug the following UnicodeSet pattern into 1471 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1472 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1473 // with the "Abbreviate" option, and get the ranges from there. 
1474 private static final String RANGE_STARTS = 1475 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u0890\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1476 + "\u3000\ud800\ufeff\ufff9"; 1477 private static final String RANGE_ENDS = // inclusive ends 1478 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u0891\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1479 + "\u3000\uf8ff\ufeff\ufffb"; 1480 1481 static final CharMatcher INSTANCE = new Invisible(); 1482 1483 private Invisible() { 1484 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1485 } 1486 } 1487 1488 /** Implementation of {@link #singleWidth()}. */ 1489 private static final class SingleWidth extends RangesMatcher { 1490 1491 static final CharMatcher INSTANCE = new SingleWidth(); 1492 1493 private SingleWidth() { 1494 super( 1495 "CharMatcher.singleWidth()", 1496 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1497 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1498 } 1499 } 1500 1501 // Non-static factory implementation classes 1502 1503 /** Implementation of {@link #negate()}. 
*/ 1504 private static class Negated extends CharMatcher { 1505 1506 final CharMatcher original; 1507 1508 Negated(CharMatcher original) { 1509 this.original = checkNotNull(original); 1510 } 1511 1512 @Override 1513 public boolean matches(char c) { 1514 return !original.matches(c); 1515 } 1516 1517 @Override 1518 public boolean matchesAllOf(CharSequence sequence) { 1519 return original.matchesNoneOf(sequence); 1520 } 1521 1522 @Override 1523 public boolean matchesNoneOf(CharSequence sequence) { 1524 return original.matchesAllOf(sequence); 1525 } 1526 1527 @Override 1528 public int countIn(CharSequence sequence) { 1529 return sequence.length() - original.countIn(sequence); 1530 } 1531 1532 @GwtIncompatible // used only from other GwtIncompatible code 1533 @Override 1534 void setBits(BitSet table) { 1535 BitSet tmp = new BitSet(); 1536 original.setBits(tmp); 1537 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1538 table.or(tmp); 1539 } 1540 1541 @Override 1542 public CharMatcher negate() { 1543 return original; 1544 } 1545 1546 @Override 1547 public String toString() { 1548 return original + ".negate()"; 1549 } 1550 } 1551 1552 /** Implementation of {@link #and(CharMatcher)}. 
*/ 1553 private static final class And extends CharMatcher { 1554 1555 final CharMatcher first; 1556 final CharMatcher second; 1557 1558 And(CharMatcher a, CharMatcher b) { 1559 first = checkNotNull(a); 1560 second = checkNotNull(b); 1561 } 1562 1563 @Override 1564 public boolean matches(char c) { 1565 return first.matches(c) && second.matches(c); 1566 } 1567 1568 @GwtIncompatible // used only from other GwtIncompatible code 1569 @Override 1570 void setBits(BitSet table) { 1571 BitSet tmp1 = new BitSet(); 1572 first.setBits(tmp1); 1573 BitSet tmp2 = new BitSet(); 1574 second.setBits(tmp2); 1575 tmp1.and(tmp2); 1576 table.or(tmp1); 1577 } 1578 1579 @Override 1580 public String toString() { 1581 return "CharMatcher.and(" + first + ", " + second + ")"; 1582 } 1583 } 1584 1585 /** Implementation of {@link #or(CharMatcher)}. */ 1586 private static final class Or extends CharMatcher { 1587 1588 final CharMatcher first; 1589 final CharMatcher second; 1590 1591 Or(CharMatcher a, CharMatcher b) { 1592 first = checkNotNull(a); 1593 second = checkNotNull(b); 1594 } 1595 1596 @GwtIncompatible // used only from other GwtIncompatible code 1597 @Override 1598 void setBits(BitSet table) { 1599 first.setBits(table); 1600 second.setBits(table); 1601 } 1602 1603 @Override 1604 public boolean matches(char c) { 1605 return first.matches(c) || second.matches(c); 1606 } 1607 1608 @Override 1609 public String toString() { 1610 return "CharMatcher.or(" + first + ", " + second + ")"; 1611 } 1612 } 1613 1614 // Static factory implementations 1615 1616 /** Implementation of {@link #is(char)}. 
*/ 1617 private static final class Is extends FastMatcher { 1618 1619 private final char match; 1620 1621 Is(char match) { 1622 this.match = match; 1623 } 1624 1625 @Override 1626 public boolean matches(char c) { 1627 return c == match; 1628 } 1629 1630 @Override 1631 public String replaceFrom(CharSequence sequence, char replacement) { 1632 return sequence.toString().replace(match, replacement); 1633 } 1634 1635 @Override 1636 public CharMatcher and(CharMatcher other) { 1637 return other.matches(match) ? this : none(); 1638 } 1639 1640 @Override 1641 public CharMatcher or(CharMatcher other) { 1642 return other.matches(match) ? other : super.or(other); 1643 } 1644 1645 @Override 1646 public CharMatcher negate() { 1647 return isNot(match); 1648 } 1649 1650 @GwtIncompatible // used only from other GwtIncompatible code 1651 @Override 1652 void setBits(BitSet table) { 1653 table.set(match); 1654 } 1655 1656 @Override 1657 public String toString() { 1658 return "CharMatcher.is('" + showCharacter(match) + "')"; 1659 } 1660 } 1661 1662 /** Implementation of {@link #isNot(char)}. */ 1663 private static final class IsNot extends FastMatcher { 1664 1665 private final char match; 1666 1667 IsNot(char match) { 1668 this.match = match; 1669 } 1670 1671 @Override 1672 public boolean matches(char c) { 1673 return c != match; 1674 } 1675 1676 @Override 1677 public CharMatcher and(CharMatcher other) { 1678 return other.matches(match) ? super.and(other) : other; 1679 } 1680 1681 @Override 1682 public CharMatcher or(CharMatcher other) { 1683 return other.matches(match) ? 
any() : this; 1684 } 1685 1686 @GwtIncompatible // used only from other GwtIncompatible code 1687 @Override 1688 void setBits(BitSet table) { 1689 table.set(0, match); 1690 table.set(match + 1, Character.MAX_VALUE + 1); 1691 } 1692 1693 @Override 1694 public CharMatcher negate() { 1695 return is(match); 1696 } 1697 1698 @Override 1699 public String toString() { 1700 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1701 } 1702 } 1703 1704 private static CharMatcher.IsEither isEither(char c1, char c2) { 1705 return new CharMatcher.IsEither(c1, c2); 1706 } 1707 1708 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1709 private static final class IsEither extends FastMatcher { 1710 1711 private final char match1; 1712 private final char match2; 1713 1714 IsEither(char match1, char match2) { 1715 this.match1 = match1; 1716 this.match2 = match2; 1717 } 1718 1719 @Override 1720 public boolean matches(char c) { 1721 return c == match1 || c == match2; 1722 } 1723 1724 @GwtIncompatible // used only from other GwtIncompatible code 1725 @Override 1726 void setBits(BitSet table) { 1727 table.set(match1); 1728 table.set(match2); 1729 } 1730 1731 @Override 1732 public String toString() { 1733 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1734 } 1735 } 1736 1737 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. 
*/
  private static final class AnyOf extends CharMatcher {

    /** The accepted characters, sorted so that {@link #matches} can binary-search them. */
    private final char[] chars;

    public AnyOf(CharSequence chars) {
      char[] sorted = chars.toString().toCharArray();
      Arrays.sort(sorted);
      this.chars = sorted;
    }

    @Override
    public boolean matches(char c) {
      int position = Arrays.binarySearch(chars, c);
      return position >= 0;
    }

    @Override
    @GwtIncompatible // used only from other GwtIncompatible code
    void setBits(BitSet table) {
      for (char accepted : chars) {
        table.set(accepted);
      }
    }

    /** Renders the accepted characters, in sorted order, as Unicode escapes. */
    @Override
    public String toString() {
      StringBuilder text = new StringBuilder("CharMatcher.anyOf(\"");
      for (char accepted : chars) {
        text.append(showCharacter(accepted));
      }
      return text.append("\")").toString();
    }
  }

  /** Implementation of {@link #inRange(char, char)}. */
  private static final class InRange extends FastMatcher {

    private final char startInclusive;
    private final char endInclusive;

    /**
     * @throws IllegalArgumentException if {@code endInclusive} is less than {@code startInclusive}
     */
    InRange(char startInclusive, char endInclusive) {
      checkArgument(startInclusive <= endInclusive);
      this.startInclusive = startInclusive;
      this.endInclusive = endInclusive;
    }

    @Override
    public boolean matches(char c) {
      if (c < startInclusive) {
        return false;
      }
      return c <= endInclusive;
    }

    @GwtIncompatible // used only from other GwtIncompatible code
    @Override
    void setBits(BitSet table) {
      // BitSet.set(from, to) excludes `to`, hence the + 1 for an inclusive upper bound.
      table.set(startInclusive, endInclusive + 1);
    }

    @Override
    public String toString() {
      String from = showCharacter(startInclusive);
      String to = showCharacter(endInclusive);
      return "CharMatcher.inRange('" + from + "', '" + to + "')";
    }
  }

  /** Implementation of {@link #forPredicate(Predicate)}. */
  private static final class ForPredicate extends CharMatcher {

    private final Predicate<? super Character> predicate;

    ForPredicate(Predicate<? super Character> predicate) {
      this.predicate = checkNotNull(predicate);
    }

    /** Passes the (boxed) character to the wrapped predicate. */
    @Override
    public boolean matches(char c) {
      return predicate.apply(c);
    }

    /** Rejects a null argument, then passes it straight to the wrapped predicate. */
    @Deprecated
    @Override
    public boolean apply(Character character) {
      Character nonNull = checkNotNull(character);
      return predicate.apply(nonNull);
    }

    @Override
    public String toString() {
      return "CharMatcher.forPredicate(" + predicate + ")";
    }
  }
}