001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, 036 * <a href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. 
037 * It does not understand 038 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode 039 * code points</a> in the range {@code 0x10000} to {@code 0x10FFFF} 040 * which includes the majority of assigned characters, including important CJK characters and emoji. 041 * 042 * <p>Supplementary characters are 043 * <a href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary"> 044 * encoded into a {@code String} using surrogate pairs</a>, 045 * and a {@code CharMatcher} treats these just as two separate characters. 046 * {@link #countIn} counts each supplementary character as 2 {@code char}s. 047 * 048 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 049 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). 050 * For basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 051 * 052 * <p>Example usages: 053 * 054 * <pre> 055 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 056 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 057 * 058 * <p>See the Guava User Guide article on <a 059 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 060 * </a>. 
061 * 062 * @author Kevin Bourrillion 063 * @since 1.0 064 */ 065@GwtCompatible(emulated = true) 066public abstract class CharMatcher implements Predicate<Character> { 067 /* 068 * N777777777NO 069 * N7777777777777N 070 * M777777777777777N 071 * $N877777777D77777M 072 * N M77777777ONND777M 073 * MN777777777NN D777 074 * N7ZN777777777NN ~M7778 075 * N777777777777MMNN88777N 076 * N777777777777MNZZZ7777O 077 * DZN7777O77777777777777 078 * N7OONND7777777D77777N 079 * 8$M++++?N???$77777$ 080 * M7++++N+M77777777N 081 * N77O777777777777$ M 082 * DNNM$$$$777777N D 083 * N$N:=N$777N7777M NZ 084 * 77Z::::N777777777 ODZZZ 085 * 77N::::::N77777777M NNZZZ$ 086 * $777:::::::77777777MN ZM8ZZZZZ 087 * 777M::::::Z7777777Z77 N++ZZZZNN 088 * 7777M:::::M7777777$777M $++IZZZZM 089 * M777$:::::N777777$M7777M +++++ZZZDN 090 * NN$::::::7777$$M777777N N+++ZZZZNZ 091 * N::::::N:7$O:77777777 N++++ZZZZN 092 * M::::::::::::N77777777+ +?+++++ZZZM 093 * 8::::::::::::D77777777M O+++++ZZ 094 * ::::::::::::M777777777N O+?D 095 * M:::::::::::M77777777778 77= 096 * D=::::::::::N7777777777N 777 097 * INN===::::::=77777777777N I777N 098 * ?777N========N7777777777787M N7777 099 * 77777$D======N77777777777N777N? N777777 100 * I77777$$$N7===M$$77777777$77777777$MMZ77777777N 101 * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON 102 * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND 103 * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N 104 * 7 :N MNN$$$$M$ $$$777$8 8D8I 105 * NMM.:7O 777777778 106 * 7777777MN 107 * M NO .7: 108 * M : M 109 * 8 110 */ 111 112 // Constant matcher factory methods 113 114 /** 115 * Matches any character. 116 * 117 * @since 19.0 (since 1.0 as constant {@code ANY}) 118 */ 119 public static CharMatcher any() { 120 return Any.INSTANCE; 121 } 122 123 /** 124 * Matches no characters. 
125 * 126 * @since 19.0 (since 1.0 as constant {@code NONE}) 127 */ 128 public static CharMatcher none() { 129 return None.INSTANCE; 130 } 131 132 /** 133 * Determines whether a character is whitespace according to the latest Unicode standard, as 134 * illustrated 135 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 136 * This is not the same definition used by other Java APIs. (See a 137 * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of 138 * "whitespace"</a>.) 139 * 140 * <p>All Unicode White_Space characters are on the BMP and thus supported by this API. 141 * 142 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to 143 * date. 144 * 145 * @since 19.0 (since 1.0 as constant {@code WHITESPACE}) 146 */ 147 public static CharMatcher whitespace() { 148 return Whitespace.INSTANCE; 149 } 150 151 /** 152 * Determines whether a character is a breaking whitespace (that is, a whitespace which can be 153 * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a 154 * discussion of that term. 155 * 156 * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE}) 157 */ 158 public static CharMatcher breakingWhitespace() { 159 return BreakingWhitespace.INSTANCE; 160 } 161 162 /** 163 * Determines whether a character is ASCII, meaning that its code point is less than 128. 164 * 165 * @since 19.0 (since 1.0 as constant {@code ASCII}) 166 */ 167 public static CharMatcher ascii() { 168 return Ascii.INSTANCE; 169 } 170 171 /** 172 * Determines whether a character is a BMP digit according to 173 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If 174 * you only care to match ASCII digits, you can use {@code inRange('0', '9')}. 175 * 176 * @deprecated Many digits are supplementary characters; see the class documentation. 
177 * @since 19.0 (since 1.0 as constant {@code DIGIT}) 178 */ 179 @Deprecated 180 public static CharMatcher digit() { 181 return Digit.INSTANCE; 182 } 183 184 /** 185 * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char) 186 * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0', 187 * '9')}. 188 * 189 * @deprecated Many digits are supplementary characters; see the class documentation. 190 * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT}) 191 */ 192 @Deprecated 193 public static CharMatcher javaDigit() { 194 return JavaDigit.INSTANCE; 195 } 196 197 /** 198 * Determines whether a character is a BMP letter according to 199 * {@linkplain Character#isLetter(char) Java's definition}. 200 * If you only care to match letters of the Latin alphabet, you can use 201 * {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 202 * 203 * @deprecated Most letters are supplementary characters; see the class documentation. 204 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER}) 205 */ 206 @Deprecated 207 public static CharMatcher javaLetter() { 208 return JavaLetter.INSTANCE; 209 } 210 211 /** 212 * Determines whether a character is a BMP letter or digit according to 213 * {@linkplain Character#isLetterOrDigit(char) Java's definition}. 214 * 215 * @deprecated Most letters and digits are supplementary characters; see the class documentation. 216 * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}). 217 */ 218 @Deprecated 219 public static CharMatcher javaLetterOrDigit() { 220 return JavaLetterOrDigit.INSTANCE; 221 } 222 223 /** 224 * Determines whether a BMP character is upper case according to 225 * {@linkplain Character#isUpperCase(char) Java's definition}. 226 * 227 * @deprecated Some uppercase characters are supplementary characters; 228 * see the class documentation. 
229 * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE}) 230 */ 231 @Deprecated 232 public static CharMatcher javaUpperCase() { 233 return JavaUpperCase.INSTANCE; 234 } 235 236 /** 237 * Determines whether a BMP character is lower case according to 238 * {@linkplain Character#isLowerCase(char) Java's definition}. 239 * 240 * @deprecated Some lowercase characters are supplementary characters; 241 * see the class documentation. 242 * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE}) 243 */ 244 @Deprecated 245 public static CharMatcher javaLowerCase() { 246 return JavaLowerCase.INSTANCE; 247 } 248 249 /** 250 * Determines whether a character is an ISO control character as specified by 251 * {@link Character#isISOControl(char)}. 252 * 253 * <p>All ISO control codes are on the BMP and thus supported by this API. 254 * 255 * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL}) 256 */ 257 public static CharMatcher javaIsoControl() { 258 return JavaIsoControl.INSTANCE; 259 } 260 261 /** 262 * Determines whether a character is invisible; that is, if its Unicode category is any of 263 * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and 264 * PRIVATE_USE according to ICU4J. 265 * 266 * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU). 267 * 268 * @deprecated Most invisible characters are supplementary characters; 269 * see the class documentation. 270 * @since 19.0 (since 1.0 as constant {@code INVISIBLE}) 271 */ 272 @Deprecated 273 public static CharMatcher invisible() { 274 return Invisible.INSTANCE; 275 } 276 277 /** 278 * Determines whether a character is single-width (not double-width). When in doubt, this matcher 279 * errs on the side of returning {@code false} (that is, it tends to assume a character is 280 * double-width). 281 * 282 * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to 283 * date. 
284 * 285 * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>. 286 * 287 * @deprecated Many such characters are supplementary characters; see the class documentation. 288 * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH}) 289 */ 290 @Deprecated 291 public static CharMatcher singleWidth() { 292 return SingleWidth.INSTANCE; 293 } 294 295 // Legacy constants 296 297 /** 298 * Determines whether a character is whitespace according to the latest Unicode 299 * standard, as illustrated 300 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 301 * This is not the same definition used by other Java APIs. (See a 302 * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of 303 * "whitespace"</a>.) 304 * 305 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant 306 * to keep it up to date. 307 * 308 * @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be 309 * removed in June 2018. 310 */ 311 @com.google.common.annotations.Beta 312 @Deprecated 313 public static final CharMatcher WHITESPACE = whitespace(); 314 315 /** 316 * Determines whether a character is a breaking whitespace (that is, a whitespace 317 * which can be interpreted as a break between words for formatting purposes). See 318 * {@link #whitespace} for a discussion of that term. 319 * 320 * @since 2.0 321 * @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled 322 * to be removed in June 2018. 323 */ 324 @com.google.common.annotations.Beta 325 @Deprecated 326 public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace(); 327 328 /** 329 * Determines whether a character is ASCII, meaning that its code point is less than 330 * 128. 331 * 332 * @deprecated Use {@link #ascii()} instead. This constant is scheduled to be 333 * removed in June 2018. 
334 */ 335 @com.google.common.annotations.Beta 336 @Deprecated 337 public static final CharMatcher ASCII = ascii(); 338 339 /** 340 * Determines whether a character is a digit according to 341 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D"> 342 * Unicode</a>. If you only care to match ASCII digits, you can use 343 * {@code inRange('0', '9')}. 344 * 345 * @deprecated Many digits are supplementary characters; see the class 346 * documentation. If you need to use this, use {@link #digit()} instead. This 347 * . constant is scheduled to be removed in June 2018. 348 */ 349 @com.google.common.annotations.Beta 350 @Deprecated 351 public static final CharMatcher DIGIT = digit(); 352 353 /** 354 * Determines whether a character is a digit according to 355 * {@linkplain Character#isDigit(char) Java's definition}. If you only care to match 356 * ASCII digits, you can use {@code inRange('0', '9')}. 357 * 358 * @deprecated Many digits are supplementary characters; see the class 359 * documentation. If you need to use this, use {@link #javaDigit()} instead. 360 * This constant is scheduled to be removed in June 2018. 361 */ 362 @com.google.common.annotations.Beta 363 @Deprecated 364 public static final CharMatcher JAVA_DIGIT = javaDigit(); 365 366 /** 367 * Determines whether a character is a letter according to 368 * {@linkplain Character#isLetter(char) Java's definition}. If you only care to 369 * match letters of the Latin alphabet, you can use 370 * {@code inRange('a', 'z').or(inRange('A', 'Z'))}. 371 * 372 * @deprecated Most letters are supplementary characters; see the class 373 * documentation. If you need to use this, use {@link #javaLetter()} instead. 374 * This constant is scheduled to be removed in June 2018. 
375 */ 376 @com.google.common.annotations.Beta 377 @Deprecated 378 public static final CharMatcher JAVA_LETTER = javaLetter(); 379 380 /** 381 * Determines whether a character is a letter or digit according to 382 * {@linkplain Character#isLetterOrDigit(char) Java's definition}. 383 * 384 * @deprecated Most letters and digits are supplementary characters; see the class 385 * documentation. If you need to use this, use {@link #javaLetterOrDigit()} 386 * instead. This constant is scheduled to be removed in June 2018. 387 */ 388 @com.google.common.annotations.Beta 389 @Deprecated 390 public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit(); 391 392 /** 393 * Determines whether a character is upper case according to 394 * {@linkplain Character#isUpperCase(char) Java's definition}. 395 * 396 * @deprecated Some uppercase letters are supplementary characters; see the class 397 * documentation. If you need to use this, use {@link #javaUpperCase()} instead. 398 * This constant is scheduled to be removed in June 2018. 399 */ 400 @com.google.common.annotations.Beta 401 @Deprecated 402 public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase(); 403 404 /** 405 * Determines whether a character is lower case according to 406 * {@linkplain Character#isLowerCase(char) Java's definition}. 407 * 408 * @deprecated Some lowercase letters are supplementary characters; see the class 409 * documentation. If you need to use this, use {@link #javaLowerCase()} instead. 410 * This constant is scheduled to be removed in June 2018. 411 */ 412 @com.google.common.annotations.Beta 413 @Deprecated 414 public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase(); 415 416 /** 417 * Determines whether a character is an ISO control character as specified by 418 * {@link Character#isISOControl(char)}. 419 * 420 * @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to 421 * be removed in June 2018. 
422 */ 423 @com.google.common.annotations.Beta 424 @Deprecated 425 public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl(); 426 427 /** 428 * Determines whether a character is invisible; that is, if its Unicode category is 429 * any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, 430 * SURROGATE, and PRIVATE_USE according to ICU4J. 431 * 432 * @deprecated Most invisible characters are supplementary characters; see the class 433 * documentation. If you need to use this, use {@link #invisible()} instead. 434 * This constant is scheduled to be removed in June 2018. 435 */ 436 @com.google.common.annotations.Beta 437 @Deprecated 438 public static final CharMatcher INVISIBLE = invisible(); 439 440 /** 441 * Determines whether a character is single-width (not double-width). When in doubt, 442 * this matcher errs on the side of returning {@code false} (that is, it tends to 443 * assume a character is double-width). 444 * 445 * <p><b>Note:</b> as the reference file evolves, we will modify this constant to 446 * keep it up to date. 447 * 448 * @deprecated Many such characters are supplementary characters; see the class 449 * documentation. If you need to use this, use {@link #singleWidth()} instead. 450 * This constant is scheduled to be removed in June 2018. 451 */ 452 @com.google.common.annotations.Beta 453 @Deprecated 454 public static final CharMatcher SINGLE_WIDTH = singleWidth(); 455 456 /** 457 * Matches any character. 458 * 459 * @deprecated Use {@link #any()} instead. This constant is scheduled to be 460 * removed in June 2018. 461 */ 462 @com.google.common.annotations.Beta 463 @Deprecated 464 public static final CharMatcher ANY = any(); 465 466 /** 467 * Matches no characters. 468 * 469 * @deprecated Use {@link #none()} instead. This constant is scheduled to be 470 * removed in June 2018. 
471 */ 472 @com.google.common.annotations.Beta 473 @Deprecated 474 public static final CharMatcher NONE = none(); 475 476 // Static factories 477 478 /** 479 * Returns a {@code char} matcher that matches only one specified BMP character. 480 */ 481 public static CharMatcher is(final char match) { 482 return new Is(match); 483 } 484 485 /** 486 * Returns a {@code char} matcher that matches any character except the BMP character specified. 487 * 488 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 489 */ 490 public static CharMatcher isNot(final char match) { 491 return new IsNot(match); 492 } 493 494 /** 495 * Returns a {@code char} matcher that matches any BMP character present in the given character 496 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 497 */ 498 public static CharMatcher anyOf(final CharSequence sequence) { 499 switch (sequence.length()) { 500 case 0: 501 return none(); 502 case 1: 503 return is(sequence.charAt(0)); 504 case 2: 505 return isEither(sequence.charAt(0), sequence.charAt(1)); 506 default: 507 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 508 // matcher? 509 return new AnyOf(sequence); 510 } 511 } 512 513 /** 514 * Returns a {@code char} matcher that matches any BMP character not present in the given 515 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 516 */ 517 public static CharMatcher noneOf(CharSequence sequence) { 518 return anyOf(sequence).negate(); 519 } 520 521 /** 522 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 523 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 524 * CharMatcher.inRange('a', 'z')}. 
525 * 526 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 527 */ 528 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 529 return new InRange(startInclusive, endInclusive); 530 } 531 532 /** 533 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 534 * which operates on primitive {@code char} instances instead. 535 */ 536 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 537 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 538 } 539 540 // Constructors 541 542 /** 543 * Constructor for use by subclasses. When subclassing, you may want to override 544 * {@code toString()} to provide a useful description. 545 */ 546 protected CharMatcher() {} 547 548 // Abstract methods 549 550 /** Determines a true or false value for the given character. */ 551 public abstract boolean matches(char c); 552 553 // Non-static factories 554 555 /** 556 * Returns a matcher that matches any character not matched by this matcher. 557 */ 558 // @Override under Java 8 but not under Java 7 559 public CharMatcher negate() { 560 return new Negated(this); 561 } 562 563 /** 564 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 565 */ 566 public CharMatcher and(CharMatcher other) { 567 return new And(this, other); 568 } 569 570 /** 571 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 572 */ 573 public CharMatcher or(CharMatcher other) { 574 return new Or(this, other); 575 } 576 577 /** 578 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 579 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 580 * worthwhile only if the precomputed matcher is queried many thousands of times. 
581 * 582 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 583 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 584 * worthwhile tradeoff in a browser. 585 */ 586 public CharMatcher precomputed() { 587 return Platform.precomputeCharMatcher(this); 588 } 589 590 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 591 592 /** 593 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 594 * on {@link Platform} so that we can have different behavior in GWT. 595 * 596 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 597 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 598 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 599 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 600 * matcher which is faster to query than the original. 601 */ 602 @GwtIncompatible // SmallCharMatcher 603 CharMatcher precomputedInternal() { 604 final BitSet table = new BitSet(); 605 setBits(table); 606 int totalCharacters = table.cardinality(); 607 if (totalCharacters * 2 <= DISTINCT_CHARS) { 608 return precomputedPositive(totalCharacters, table, toString()); 609 } else { 610 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 611 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 612 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 613 String suffix = ".negate()"; 614 final String description = toString(); 615 String negatedDescription = 616 description.endsWith(suffix) 617 ? 
description.substring(0, description.length() - suffix.length()) 618 : description + suffix; 619 return new NegatedFastMatcher( 620 precomputedPositive(negatedCharacters, table, negatedDescription)) { 621 @Override 622 public String toString() { 623 return description; 624 } 625 }; 626 } 627 } 628 629 /** 630 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 631 */ 632 @GwtIncompatible // SmallCharMatcher 633 private static CharMatcher precomputedPositive( 634 int totalCharacters, BitSet table, String description) { 635 switch (totalCharacters) { 636 case 0: 637 return none(); 638 case 1: 639 return is((char) table.nextSetBit(0)); 640 case 2: 641 char c1 = (char) table.nextSetBit(0); 642 char c2 = (char) table.nextSetBit(c1 + 1); 643 return isEither(c1, c2); 644 default: 645 return isSmall(totalCharacters, table.length()) 646 ? SmallCharMatcher.from(table, description) 647 : new BitSetMatcher(table, description); 648 } 649 } 650 651 @GwtIncompatible // SmallCharMatcher 652 private static boolean isSmall(int totalCharacters, int tableLength) { 653 return totalCharacters <= SmallCharMatcher.MAX_SIZE 654 && tableLength > (totalCharacters * 4 * Character.SIZE); 655 // err on the side of BitSetMatcher 656 } 657 658 /** 659 * Sets bits in {@code table} matched by this matcher. 660 */ 661 @GwtIncompatible // used only from other GwtIncompatible code 662 void setBits(BitSet table) { 663 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 664 if (matches((char) c)) { 665 table.set(c); 666 } 667 } 668 } 669 670 // Text processing routines 671 672 /** 673 * Returns {@code true} if a character sequence contains at least one matching BMP character. 674 * Equivalent to {@code !matchesNoneOf(sequence)}. 675 * 676 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 677 * character, until this returns {@code true} or the end is reached. 
678 * 679 * @param sequence the character sequence to examine, possibly empty 680 * @return {@code true} if this matcher matches at least one character in the sequence 681 * @since 8.0 682 */ 683 public boolean matchesAnyOf(CharSequence sequence) { 684 return !matchesNoneOf(sequence); 685 } 686 687 /** 688 * Returns {@code true} if a character sequence contains only matching BMP characters. 689 * 690 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 691 * character, until this returns {@code false} or the end is reached. 692 * 693 * @param sequence the character sequence to examine, possibly empty 694 * @return {@code true} if this matcher matches every character in the sequence, including when 695 * the sequence is empty 696 */ 697 public boolean matchesAllOf(CharSequence sequence) { 698 for (int i = sequence.length() - 1; i >= 0; i--) { 699 if (!matches(sequence.charAt(i))) { 700 return false; 701 } 702 } 703 return true; 704 } 705 706 /** 707 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 708 * {@code !matchesAnyOf(sequence)}. 709 * 710 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 711 * character, until this returns {@code true} or the end is reached. 712 * 713 * @param sequence the character sequence to examine, possibly empty 714 * @return {@code true} if this matcher matches no characters in the sequence, including when 715 * the sequence is empty 716 */ 717 public boolean matchesNoneOf(CharSequence sequence) { 718 return indexIn(sequence) == -1; 719 } 720 721 /** 722 * Returns the index of the first matching BMP character in a character sequence, 723 * or {@code -1} if no matching character is present. 724 * 725 * <p>The default implementation iterates over the sequence in forward order calling 726 * {@link #matches} for each character. 
727 * 728 * @param sequence the character sequence to examine from the beginning 729 * @return an index, or {@code -1} if no character matches 730 */ 731 public int indexIn(CharSequence sequence) { 732 return indexIn(sequence, 0); 733 } 734 735 /** 736 * Returns the index of the first matching BMP character in a character sequence, starting from a 737 * given position, or {@code -1} if no character matches after that position. 738 * 739 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 740 * start}, calling {@link #matches} for each character. 741 * 742 * @param sequence the character sequence to examine 743 * @param start the first index to examine; must be nonnegative and no greater than {@code 744 * sequence.length()} 745 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 746 * or {@code -1} if no character matches 747 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 748 * sequence.length()} 749 */ 750 public int indexIn(CharSequence sequence, int start) { 751 int length = sequence.length(); 752 checkPositionIndex(start, length); 753 for (int i = start; i < length; i++) { 754 if (matches(sequence.charAt(i))) { 755 return i; 756 } 757 } 758 return -1; 759 } 760 761 /** 762 * Returns the index of the last matching BMP character in a character sequence, 763 * or {@code -1} if no matching character is present. 764 * 765 * <p>The default implementation iterates over the sequence in reverse order calling 766 * {@link #matches} for each character. 
767 * 768 * @param sequence the character sequence to examine from the end 769 * @return an index, or {@code -1} if no character matches 770 */ 771 public int lastIndexIn(CharSequence sequence) { 772 for (int i = sequence.length() - 1; i >= 0; i--) { 773 if (matches(sequence.charAt(i))) { 774 return i; 775 } 776 } 777 return -1; 778 } 779 780 /** 781 * Returns the number of matching {@code char}s found in a character sequence. 782 * 783 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 784 */ 785 public int countIn(CharSequence sequence) { 786 int count = 0; 787 for (int i = 0; i < sequence.length(); i++) { 788 if (matches(sequence.charAt(i))) { 789 count++; 790 } 791 } 792 return count; 793 } 794 795 /** 796 * Returns a string containing all non-matching characters of a character sequence, in order. For 797 * example: <pre> {@code 798 * 799 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 800 * 801 * ... returns {@code "bzr"}. 802 */ 803 public String removeFrom(CharSequence sequence) { 804 String string = sequence.toString(); 805 int pos = indexIn(string); 806 if (pos == -1) { 807 return string; 808 } 809 810 char[] chars = string.toCharArray(); 811 int spread = 1; 812 813 // This unusual loop comes from extensive benchmarking 814 OUT: 815 while (true) { 816 pos++; 817 while (true) { 818 if (pos == chars.length) { 819 break OUT; 820 } 821 if (matches(chars[pos])) { 822 break; 823 } 824 chars[pos - spread] = chars[pos]; 825 pos++; 826 } 827 spread++; 828 } 829 return new String(chars, 0, pos - spread); 830 } 831 832 /** 833 * Returns a string containing all matching BMP characters of a character sequence, in order. For 834 * example: <pre> {@code 835 * 836 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 837 * 838 * ... returns {@code "aaa"}. 
839 */ 840 public String retainFrom(CharSequence sequence) { 841 return negate().removeFrom(sequence); 842 } 843 844 /** 845 * Returns a string copy of the input character sequence, with each matching BMP character 846 * replaced by a given replacement character. For example: <pre> {@code 847 * 848 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 849 * 850 * ... returns {@code "rodor"}. 851 * 852 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 853 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 854 * character. 855 * 856 * @param sequence the character sequence to replace matching characters in 857 * @param replacement the character to append to the result string in place of each matching 858 * character in {@code sequence} 859 * @return the new string 860 */ 861 public String replaceFrom(CharSequence sequence, char replacement) { 862 String string = sequence.toString(); 863 int pos = indexIn(string); 864 if (pos == -1) { 865 return string; 866 } 867 char[] chars = string.toCharArray(); 868 chars[pos] = replacement; 869 for (int i = pos + 1; i < chars.length; i++) { 870 if (matches(chars[i])) { 871 chars[i] = replacement; 872 } 873 } 874 return new String(chars); 875 } 876 877 /** 878 * Returns a string copy of the input character sequence, with each matching BMP character 879 * replaced by a given replacement sequence. For example: <pre> {@code 880 * 881 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 882 * 883 * ... returns {@code "yoohoo"}. 884 * 885 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 886 * off calling {@link #replaceFrom(CharSequence, char)} directly. 
887 * 888 * @param sequence the character sequence to replace matching characters in 889 * @param replacement the characters to append to the result string in place of each matching 890 * character in {@code sequence} 891 * @return the new string 892 */ 893 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 894 int replacementLen = replacement.length(); 895 if (replacementLen == 0) { 896 return removeFrom(sequence); 897 } 898 if (replacementLen == 1) { 899 return replaceFrom(sequence, replacement.charAt(0)); 900 } 901 902 String string = sequence.toString(); 903 int pos = indexIn(string); 904 if (pos == -1) { 905 return string; 906 } 907 908 int len = string.length(); 909 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 910 911 int oldpos = 0; 912 do { 913 buf.append(string, oldpos, pos); 914 buf.append(replacement); 915 oldpos = pos + 1; 916 pos = indexIn(string, oldpos); 917 } while (pos != -1); 918 919 buf.append(string, oldpos, len); 920 return buf.toString(); 921 } 922 923 /** 924 * Returns a substring of the input character sequence that omits all matching BMP characters 925 * from the beginning and from the end of the string. For example: <pre> {@code 926 * 927 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 928 * 929 * ... returns {@code "cat"}. 930 * 931 * <p>Note that: <pre> {@code 932 * 933 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 934 * 935 * ... is equivalent to {@link String#trim()}. 
936 */ 937 public String trimFrom(CharSequence sequence) { 938 int len = sequence.length(); 939 int first; 940 int last; 941 942 for (first = 0; first < len; first++) { 943 if (!matches(sequence.charAt(first))) { 944 break; 945 } 946 } 947 for (last = len - 1; last > first; last--) { 948 if (!matches(sequence.charAt(last))) { 949 break; 950 } 951 } 952 953 return sequence.subSequence(first, last + 1).toString(); 954 } 955 956 /** 957 * Returns a substring of the input character sequence that omits all matching BMP characters 958 * from the beginning of the string. For example: <pre> {@code 959 * 960 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 961 * 962 * ... returns {@code "catbab"}. 963 */ 964 public String trimLeadingFrom(CharSequence sequence) { 965 int len = sequence.length(); 966 for (int first = 0; first < len; first++) { 967 if (!matches(sequence.charAt(first))) { 968 return sequence.subSequence(first, len).toString(); 969 } 970 } 971 return ""; 972 } 973 974 /** 975 * Returns a substring of the input character sequence that omits all matching BMP characters 976 * from the end of the string. For example: <pre> {@code 977 * 978 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 979 * 980 * ... returns {@code "abacat"}. 981 */ 982 public String trimTrailingFrom(CharSequence sequence) { 983 int len = sequence.length(); 984 for (int last = len - 1; last >= 0; last--) { 985 if (!matches(sequence.charAt(last))) { 986 return sequence.subSequence(0, last + 1).toString(); 987 } 988 } 989 return ""; 990 } 991 992 /** 993 * Returns a string copy of the input character sequence, with each group of consecutive 994 * matching BMP characters replaced by a single replacement character. For example: 995 * <pre> {@code 996 * 997 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 998 * 999 * ... returns {@code "b-p-r"}. 
1000 * 1001 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1002 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1003 * character. 1004 * 1005 * @param sequence the character sequence to replace matching groups of characters in 1006 * @param replacement the character to append to the result string in place of each group of 1007 * matching characters in {@code sequence} 1008 * @return the new string 1009 */ 1010 public String collapseFrom(CharSequence sequence, char replacement) { 1011 // This implementation avoids unnecessary allocation. 1012 int len = sequence.length(); 1013 for (int i = 0; i < len; i++) { 1014 char c = sequence.charAt(i); 1015 if (matches(c)) { 1016 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1017 // a no-op replacement 1018 i++; 1019 } else { 1020 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 1021 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1022 } 1023 } 1024 } 1025 // no replacement needed 1026 return sequence.toString(); 1027 } 1028 1029 /** 1030 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1031 * groups of matching BMP characters at the start or end of the sequence are removed without 1032 * replacement. 1033 */ 1034 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1035 // This implementation avoids unnecessary allocation. 1036 int len = sequence.length(); 1037 int first = 0; 1038 int last = len - 1; 1039 1040 while (first < len && matches(sequence.charAt(first))) { 1041 first++; 1042 } 1043 1044 while (last > first && matches(sequence.charAt(last))) { 1045 last--; 1046 } 1047 1048 return (first == 0 && last == len - 1) 1049 ? 
collapseFrom(sequence, replacement) 1050 : finishCollapseFrom( 1051 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 1052 } 1053 1054 private String finishCollapseFrom( 1055 CharSequence sequence, 1056 int start, 1057 int end, 1058 char replacement, 1059 StringBuilder builder, 1060 boolean inMatchingGroup) { 1061 for (int i = start; i < end; i++) { 1062 char c = sequence.charAt(i); 1063 if (matches(c)) { 1064 if (!inMatchingGroup) { 1065 builder.append(replacement); 1066 inMatchingGroup = true; 1067 } 1068 } else { 1069 builder.append(c); 1070 inMatchingGroup = false; 1071 } 1072 } 1073 return builder.toString(); 1074 } 1075 1076 /** 1077 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 1078 * instead. 1079 */ 1080 @Deprecated 1081 @Override 1082 public boolean apply(Character character) { 1083 return matches(character); 1084 } 1085 1086 /** 1087 * Returns a string representation of this {@code CharMatcher}, such as 1088 * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1089 */ 1090 @Override 1091 public String toString() { 1092 return super.toString(); 1093 } 1094 1095 /** 1096 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" 1097 * where "12AB" is the four hexadecimal digits representing the 16-bit code unit. 1098 */ 1099 private static String showCharacter(char c) { 1100 String hex = "0123456789ABCDEF"; 1101 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 1102 for (int i = 0; i < 4; i++) { 1103 tmp[5 - i] = hex.charAt(c & 0xF); 1104 c = (char) (c >> 4); 1105 } 1106 return String.copyValueOf(tmp); 1107 } 1108 1109 // Fast matchers 1110 1111 /** A matcher for which precomputation will not yield any significant benefit. 
*/ 1112 abstract static class FastMatcher extends CharMatcher { 1113 1114 @Override 1115 public final CharMatcher precomputed() { 1116 return this; 1117 } 1118 1119 @Override 1120 public CharMatcher negate() { 1121 return new NegatedFastMatcher(this); 1122 } 1123 } 1124 1125 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 1126 abstract static class NamedFastMatcher extends FastMatcher { 1127 1128 private final String description; 1129 1130 NamedFastMatcher(String description) { 1131 this.description = checkNotNull(description); 1132 } 1133 1134 @Override 1135 public final String toString() { 1136 return description; 1137 } 1138 } 1139 1140 /** Negation of a {@link FastMatcher}. */ 1141 static class NegatedFastMatcher extends Negated { 1142 1143 NegatedFastMatcher(CharMatcher original) { 1144 super(original); 1145 } 1146 1147 @Override 1148 public final CharMatcher precomputed() { 1149 return this; 1150 } 1151 } 1152 1153 /** Fast matcher using a {@link BitSet} table of matching characters. */ 1154 @GwtIncompatible // used only from other GwtIncompatible code 1155 private static final class BitSetMatcher extends NamedFastMatcher { 1156 1157 private final BitSet table; 1158 1159 private BitSetMatcher(BitSet table, String description) { 1160 super(description); 1161 if (table.length() + Long.SIZE < table.size()) { 1162 table = (BitSet) table.clone(); 1163 // If only we could actually call BitSet.trimToSize() ourselves... 1164 } 1165 this.table = table; 1166 } 1167 1168 @Override 1169 public boolean matches(char c) { 1170 return table.get(c); 1171 } 1172 1173 @Override 1174 void setBits(BitSet bitSet) { 1175 bitSet.or(table); 1176 } 1177 } 1178 1179 // Static constant implementation classes 1180 1181 /** Implementation of {@link #any()}. 
*/ 1182 private static final class Any extends NamedFastMatcher { 1183 1184 static final Any INSTANCE = new Any(); 1185 1186 private Any() { 1187 super("CharMatcher.any()"); 1188 } 1189 1190 @Override 1191 public boolean matches(char c) { 1192 return true; 1193 } 1194 1195 @Override 1196 public int indexIn(CharSequence sequence) { 1197 return (sequence.length() == 0) ? -1 : 0; 1198 } 1199 1200 @Override 1201 public int indexIn(CharSequence sequence, int start) { 1202 int length = sequence.length(); 1203 checkPositionIndex(start, length); 1204 return (start == length) ? -1 : start; 1205 } 1206 1207 @Override 1208 public int lastIndexIn(CharSequence sequence) { 1209 return sequence.length() - 1; 1210 } 1211 1212 @Override 1213 public boolean matchesAllOf(CharSequence sequence) { 1214 checkNotNull(sequence); 1215 return true; 1216 } 1217 1218 @Override 1219 public boolean matchesNoneOf(CharSequence sequence) { 1220 return sequence.length() == 0; 1221 } 1222 1223 @Override 1224 public String removeFrom(CharSequence sequence) { 1225 checkNotNull(sequence); 1226 return ""; 1227 } 1228 1229 @Override 1230 public String replaceFrom(CharSequence sequence, char replacement) { 1231 char[] array = new char[sequence.length()]; 1232 Arrays.fill(array, replacement); 1233 return new String(array); 1234 } 1235 1236 @Override 1237 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1238 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1239 for (int i = 0; i < sequence.length(); i++) { 1240 result.append(replacement); 1241 } 1242 return result.toString(); 1243 } 1244 1245 @Override 1246 public String collapseFrom(CharSequence sequence, char replacement) { 1247 return (sequence.length() == 0) ? 
"" : String.valueOf(replacement); 1248 } 1249 1250 @Override 1251 public String trimFrom(CharSequence sequence) { 1252 checkNotNull(sequence); 1253 return ""; 1254 } 1255 1256 @Override 1257 public int countIn(CharSequence sequence) { 1258 return sequence.length(); 1259 } 1260 1261 @Override 1262 public CharMatcher and(CharMatcher other) { 1263 return checkNotNull(other); 1264 } 1265 1266 @Override 1267 public CharMatcher or(CharMatcher other) { 1268 checkNotNull(other); 1269 return this; 1270 } 1271 1272 @Override 1273 public CharMatcher negate() { 1274 return none(); 1275 } 1276 } 1277 1278 /** Implementation of {@link #none()}. */ 1279 private static final class None extends NamedFastMatcher { 1280 1281 static final None INSTANCE = new None(); 1282 1283 private None() { 1284 super("CharMatcher.none()"); 1285 } 1286 1287 @Override 1288 public boolean matches(char c) { 1289 return false; 1290 } 1291 1292 @Override 1293 public int indexIn(CharSequence sequence) { 1294 checkNotNull(sequence); 1295 return -1; 1296 } 1297 1298 @Override 1299 public int indexIn(CharSequence sequence, int start) { 1300 int length = sequence.length(); 1301 checkPositionIndex(start, length); 1302 return -1; 1303 } 1304 1305 @Override 1306 public int lastIndexIn(CharSequence sequence) { 1307 checkNotNull(sequence); 1308 return -1; 1309 } 1310 1311 @Override 1312 public boolean matchesAllOf(CharSequence sequence) { 1313 return sequence.length() == 0; 1314 } 1315 1316 @Override 1317 public boolean matchesNoneOf(CharSequence sequence) { 1318 checkNotNull(sequence); 1319 return true; 1320 } 1321 1322 @Override 1323 public String removeFrom(CharSequence sequence) { 1324 return sequence.toString(); 1325 } 1326 1327 @Override 1328 public String replaceFrom(CharSequence sequence, char replacement) { 1329 return sequence.toString(); 1330 } 1331 1332 @Override 1333 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1334 checkNotNull(replacement); 1335 return 
sequence.toString(); 1336 } 1337 1338 @Override 1339 public String collapseFrom(CharSequence sequence, char replacement) { 1340 return sequence.toString(); 1341 } 1342 1343 @Override 1344 public String trimFrom(CharSequence sequence) { 1345 return sequence.toString(); 1346 } 1347 1348 @Override 1349 public String trimLeadingFrom(CharSequence sequence) { 1350 return sequence.toString(); 1351 } 1352 1353 @Override 1354 public String trimTrailingFrom(CharSequence sequence) { 1355 return sequence.toString(); 1356 } 1357 1358 @Override 1359 public int countIn(CharSequence sequence) { 1360 checkNotNull(sequence); 1361 return 0; 1362 } 1363 1364 @Override 1365 public CharMatcher and(CharMatcher other) { 1366 checkNotNull(other); 1367 return this; 1368 } 1369 1370 @Override 1371 public CharMatcher or(CharMatcher other) { 1372 return checkNotNull(other); 1373 } 1374 1375 @Override 1376 public CharMatcher negate() { 1377 return any(); 1378 } 1379 } 1380 1381 /** Implementation of {@link #whitespace()}. 
*/ 1382 @VisibleForTesting 1383 static final class Whitespace extends NamedFastMatcher { 1384 1385 static final String TABLE = 1386 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1387 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1388 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1389 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1390 static final int MULTIPLIER = 1682554634; 1391 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1392 1393 static final Whitespace INSTANCE = new Whitespace(); 1394 1395 Whitespace() { 1396 super("CharMatcher.whitespace()"); 1397 } 1398 1399 @Override 1400 public boolean matches(char c) { 1401 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1402 } 1403 1404 @GwtIncompatible // used only from other GwtIncompatible code 1405 @Override 1406 void setBits(BitSet table) { 1407 for (int i = 0; i < TABLE.length(); i++) { 1408 table.set(TABLE.charAt(i)); 1409 } 1410 } 1411 } 1412 1413 /** Implementation of {@link #breakingWhitespace()}. */ 1414 private static final class BreakingWhitespace extends CharMatcher { 1415 1416 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1417 1418 @Override 1419 public boolean matches(char c) { 1420 switch (c) { 1421 case '\t': 1422 case '\n': 1423 case '\013': 1424 case '\f': 1425 case '\r': 1426 case ' ': 1427 case '\u0085': 1428 case '\u1680': 1429 case '\u2028': 1430 case '\u2029': 1431 case '\u205f': 1432 case '\u3000': 1433 return true; 1434 case '\u2007': 1435 return false; 1436 default: 1437 return c >= '\u2000' && c <= '\u200a'; 1438 } 1439 } 1440 1441 @Override 1442 public String toString() { 1443 return "CharMatcher.breakingWhitespace()"; 1444 } 1445 } 1446 1447 /** Implementation of {@link #ascii()}. 
*/ 1448 private static final class Ascii extends NamedFastMatcher { 1449 1450 static final Ascii INSTANCE = new Ascii(); 1451 1452 Ascii() { 1453 super("CharMatcher.ascii()"); 1454 } 1455 1456 @Override 1457 public boolean matches(char c) { 1458 return c <= '\u007f'; 1459 } 1460 } 1461 1462 /** Implementation that matches characters that fall within multiple ranges. */ 1463 private static class RangesMatcher extends CharMatcher { 1464 1465 private final String description; 1466 private final char[] rangeStarts; 1467 private final char[] rangeEnds; 1468 1469 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1470 this.description = description; 1471 this.rangeStarts = rangeStarts; 1472 this.rangeEnds = rangeEnds; 1473 checkArgument(rangeStarts.length == rangeEnds.length); 1474 for (int i = 0; i < rangeStarts.length; i++) { 1475 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1476 if (i + 1 < rangeStarts.length) { 1477 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1478 } 1479 } 1480 } 1481 1482 @Override 1483 public boolean matches(char c) { 1484 int index = Arrays.binarySearch(rangeStarts, c); 1485 if (index >= 0) { 1486 return true; 1487 } else { 1488 index = ~index - 1; 1489 return index >= 0 && c <= rangeEnds[index]; 1490 } 1491 } 1492 1493 @Override 1494 public String toString() { 1495 return description; 1496 } 1497 } 1498 1499 /** Implementation of {@link #digit()}. */ 1500 private static final class Digit extends RangesMatcher { 1501 // Plug the following UnicodeSet pattern into 1502 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1503 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1504 // and get the zeroes from there. 1505 1506 // Must be in ascending order. 
1507 private static final String ZEROES = 1508 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1509 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1510 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1511 1512 private static char[] zeroes() { 1513 return ZEROES.toCharArray(); 1514 } 1515 1516 private static char[] nines() { 1517 char[] nines = new char[ZEROES.length()]; 1518 for (int i = 0; i < ZEROES.length(); i++) { 1519 nines[i] = (char) (ZEROES.charAt(i) + 9); 1520 } 1521 return nines; 1522 } 1523 1524 static final Digit INSTANCE = new Digit(); 1525 1526 private Digit() { 1527 super("CharMatcher.digit()", zeroes(), nines()); 1528 } 1529 } 1530 1531 /** Implementation of {@link #javaDigit()}. */ 1532 private static final class JavaDigit extends CharMatcher { 1533 1534 static final JavaDigit INSTANCE = new JavaDigit(); 1535 1536 @Override 1537 public boolean matches(char c) { 1538 return Character.isDigit(c); 1539 } 1540 1541 @Override 1542 public String toString() { 1543 return "CharMatcher.javaDigit()"; 1544 } 1545 } 1546 1547 /** Implementation of {@link #javaLetter()}. */ 1548 private static final class JavaLetter extends CharMatcher { 1549 1550 static final JavaLetter INSTANCE = new JavaLetter(); 1551 1552 @Override 1553 public boolean matches(char c) { 1554 return Character.isLetter(c); 1555 } 1556 1557 @Override 1558 public String toString() { 1559 return "CharMatcher.javaLetter()"; 1560 } 1561 } 1562 1563 /** Implementation of {@link #javaLetterOrDigit()}. 
*/ 1564 private static final class JavaLetterOrDigit extends CharMatcher { 1565 1566 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1567 1568 @Override 1569 public boolean matches(char c) { 1570 return Character.isLetterOrDigit(c); 1571 } 1572 1573 @Override 1574 public String toString() { 1575 return "CharMatcher.javaLetterOrDigit()"; 1576 } 1577 } 1578 1579 /** Implementation of {@link #javaUpperCase()}. */ 1580 private static final class JavaUpperCase extends CharMatcher { 1581 1582 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1583 1584 @Override 1585 public boolean matches(char c) { 1586 return Character.isUpperCase(c); 1587 } 1588 1589 @Override 1590 public String toString() { 1591 return "CharMatcher.javaUpperCase()"; 1592 } 1593 } 1594 1595 /** Implementation of {@link #javaLowerCase()}. */ 1596 private static final class JavaLowerCase extends CharMatcher { 1597 1598 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1599 1600 @Override 1601 public boolean matches(char c) { 1602 return Character.isLowerCase(c); 1603 } 1604 1605 @Override 1606 public String toString() { 1607 return "CharMatcher.javaLowerCase()"; 1608 } 1609 } 1610 1611 /** Implementation of {@link #javaIsoControl()}. */ 1612 private static final class JavaIsoControl extends NamedFastMatcher { 1613 1614 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1615 1616 private JavaIsoControl() { 1617 super("CharMatcher.javaIsoControl()"); 1618 } 1619 1620 @Override 1621 public boolean matches(char c) { 1622 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1623 } 1624 } 1625 1626 /** Implementation of {@link #invisible()}. */ 1627 private static final class Invisible extends RangesMatcher { 1628 // Plug the following UnicodeSet pattern into 1629 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1630 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1631 // with the "Abbreviate" option, and get the ranges from there. 
1632 private static final String RANGE_STARTS = 1633 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1634 + "\u3000\ud800\ufeff\ufff9"; 1635 private static final String RANGE_ENDS = // inclusive ends 1636 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1637 + "\u3000\uf8ff\ufeff\ufffb"; 1638 1639 static final Invisible INSTANCE = new Invisible(); 1640 1641 private Invisible() { 1642 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1643 } 1644 } 1645 1646 /** Implementation of {@link #singleWidth()}. */ 1647 private static final class SingleWidth extends RangesMatcher { 1648 1649 static final SingleWidth INSTANCE = new SingleWidth(); 1650 1651 private SingleWidth() { 1652 super( 1653 "CharMatcher.singleWidth()", 1654 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1655 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1656 } 1657 } 1658 1659 // Non-static factory implementation classes 1660 1661 /** Implementation of {@link #negate()}. 
*/ 1662 private static class Negated extends CharMatcher { 1663 1664 final CharMatcher original; 1665 1666 Negated(CharMatcher original) { 1667 this.original = checkNotNull(original); 1668 } 1669 1670 @Override 1671 public boolean matches(char c) { 1672 return !original.matches(c); 1673 } 1674 1675 @Override 1676 public boolean matchesAllOf(CharSequence sequence) { 1677 return original.matchesNoneOf(sequence); 1678 } 1679 1680 @Override 1681 public boolean matchesNoneOf(CharSequence sequence) { 1682 return original.matchesAllOf(sequence); 1683 } 1684 1685 @Override 1686 public int countIn(CharSequence sequence) { 1687 return sequence.length() - original.countIn(sequence); 1688 } 1689 1690 @GwtIncompatible // used only from other GwtIncompatible code 1691 @Override 1692 void setBits(BitSet table) { 1693 BitSet tmp = new BitSet(); 1694 original.setBits(tmp); 1695 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1696 table.or(tmp); 1697 } 1698 1699 @Override 1700 public CharMatcher negate() { 1701 return original; 1702 } 1703 1704 @Override 1705 public String toString() { 1706 return original + ".negate()"; 1707 } 1708 } 1709 1710 /** Implementation of {@link #and(CharMatcher)}. 
*/ 1711 private static final class And extends CharMatcher { 1712 1713 final CharMatcher first; 1714 final CharMatcher second; 1715 1716 And(CharMatcher a, CharMatcher b) { 1717 first = checkNotNull(a); 1718 second = checkNotNull(b); 1719 } 1720 1721 @Override 1722 public boolean matches(char c) { 1723 return first.matches(c) && second.matches(c); 1724 } 1725 1726 @GwtIncompatible // used only from other GwtIncompatible code 1727 @Override 1728 void setBits(BitSet table) { 1729 BitSet tmp1 = new BitSet(); 1730 first.setBits(tmp1); 1731 BitSet tmp2 = new BitSet(); 1732 second.setBits(tmp2); 1733 tmp1.and(tmp2); 1734 table.or(tmp1); 1735 } 1736 1737 @Override 1738 public String toString() { 1739 return "CharMatcher.and(" + first + ", " + second + ")"; 1740 } 1741 } 1742 1743 /** Implementation of {@link #or(CharMatcher)}. */ 1744 private static final class Or extends CharMatcher { 1745 1746 final CharMatcher first; 1747 final CharMatcher second; 1748 1749 Or(CharMatcher a, CharMatcher b) { 1750 first = checkNotNull(a); 1751 second = checkNotNull(b); 1752 } 1753 1754 @GwtIncompatible // used only from other GwtIncompatible code 1755 @Override 1756 void setBits(BitSet table) { 1757 first.setBits(table); 1758 second.setBits(table); 1759 } 1760 1761 @Override 1762 public boolean matches(char c) { 1763 return first.matches(c) || second.matches(c); 1764 } 1765 1766 @Override 1767 public String toString() { 1768 return "CharMatcher.or(" + first + ", " + second + ")"; 1769 } 1770 } 1771 1772 // Static factory implementations 1773 1774 /** Implementation of {@link #is(char)}. 
*/ 1775 private static final class Is extends FastMatcher { 1776 1777 private final char match; 1778 1779 Is(char match) { 1780 this.match = match; 1781 } 1782 1783 @Override 1784 public boolean matches(char c) { 1785 return c == match; 1786 } 1787 1788 @Override 1789 public String replaceFrom(CharSequence sequence, char replacement) { 1790 return sequence.toString().replace(match, replacement); 1791 } 1792 1793 @Override 1794 public CharMatcher and(CharMatcher other) { 1795 return other.matches(match) ? this : none(); 1796 } 1797 1798 @Override 1799 public CharMatcher or(CharMatcher other) { 1800 return other.matches(match) ? other : super.or(other); 1801 } 1802 1803 @Override 1804 public CharMatcher negate() { 1805 return isNot(match); 1806 } 1807 1808 @GwtIncompatible // used only from other GwtIncompatible code 1809 @Override 1810 void setBits(BitSet table) { 1811 table.set(match); 1812 } 1813 1814 @Override 1815 public String toString() { 1816 return "CharMatcher.is('" + showCharacter(match) + "')"; 1817 } 1818 } 1819 1820 /** Implementation of {@link #isNot(char)}. */ 1821 private static final class IsNot extends FastMatcher { 1822 1823 private final char match; 1824 1825 IsNot(char match) { 1826 this.match = match; 1827 } 1828 1829 @Override 1830 public boolean matches(char c) { 1831 return c != match; 1832 } 1833 1834 @Override 1835 public CharMatcher and(CharMatcher other) { 1836 return other.matches(match) ? super.and(other) : other; 1837 } 1838 1839 @Override 1840 public CharMatcher or(CharMatcher other) { 1841 return other.matches(match) ? 
any() : this; 1842 } 1843 1844 @GwtIncompatible // used only from other GwtIncompatible code 1845 @Override 1846 void setBits(BitSet table) { 1847 table.set(0, match); 1848 table.set(match + 1, Character.MAX_VALUE + 1); 1849 } 1850 1851 @Override 1852 public CharMatcher negate() { 1853 return is(match); 1854 } 1855 1856 @Override 1857 public String toString() { 1858 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1859 } 1860 } 1861 1862 private static CharMatcher.IsEither isEither(char c1, char c2) { 1863 return new CharMatcher.IsEither(c1, c2); 1864 } 1865 1866 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1867 private static final class IsEither extends FastMatcher { 1868 1869 private final char match1; 1870 private final char match2; 1871 1872 IsEither(char match1, char match2) { 1873 this.match1 = match1; 1874 this.match2 = match2; 1875 } 1876 1877 @Override 1878 public boolean matches(char c) { 1879 return c == match1 || c == match2; 1880 } 1881 1882 @GwtIncompatible // used only from other GwtIncompatible code 1883 @Override 1884 void setBits(BitSet table) { 1885 table.set(match1); 1886 table.set(match2); 1887 } 1888 1889 @Override 1890 public String toString() { 1891 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1892 } 1893 } 1894 1895 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. 
*/
  private static final class AnyOf extends CharMatcher {

    // Sorted copy of the accepted chars, so matches() can binary-search it.
    private final char[] chars;

    /**
     * @param chars the characters to accept; copied and sorted, the argument is not retained
     */
    public AnyOf(CharSequence chars) {
      this.chars = chars.toString().toCharArray();
      Arrays.sort(this.chars);
    }

    @Override
    public boolean matches(char c) {
      return Arrays.binarySearch(chars, c) >= 0;
    }

    @Override
    @GwtIncompatible // used only from other GwtIncompatible code
    void setBits(BitSet table) {
      for (char c : chars) {
        table.set(c);
      }
    }

    @Override
    public String toString() {
      // Chars are printed in sorted order, each as a Unicode escape.
      StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
      for (char c : chars) {
        description.append(showCharacter(c));
      }
      description.append("\")");
      return description.toString();
    }
  }

  /** Implementation of {@link #inRange(char, char)}. */
  private static final class InRange extends FastMatcher {

    private final char startInclusive;
    private final char endInclusive;

    /**
     * @param startInclusive the first character of the range
     * @param endInclusive the last character of the range
     * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
     */
    InRange(char startInclusive, char endInclusive) {
      checkArgument(endInclusive >= startInclusive);
      this.startInclusive = startInclusive;
      this.endInclusive = endInclusive;
    }

    @Override
    public boolean matches(char c) {
      return startInclusive <= c && c <= endInclusive;
    }

    @GwtIncompatible // used only from other GwtIncompatible code
    @Override
    void setBits(BitSet table) {
      // BitSet.set takes an exclusive upper bound, hence the + 1.
      table.set(startInclusive, endInclusive + 1);
    }

    @Override
    public String toString() {
      return "CharMatcher.inRange('"
          + showCharacter(startInclusive)
          + "', '"
          + showCharacter(endInclusive)
          + "')";
    }
  }

  /** Implementation of {@link #forPredicate(Predicate)}. */
  private static final class ForPredicate extends CharMatcher {

    private final Predicate<? super Character> predicate;

    /**
     * @param predicate the predicate that decides each match; must not be null
     */
    ForPredicate(Predicate<? super Character> predicate) {
      this.predicate = checkNotNull(predicate);
    }

    @Override
    public boolean matches(char c) {
      // The char is boxed to a Character for the predicate call.
      return predicate.apply(c);
    }

    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
    @Override
    public boolean apply(Character character) {
      // Pass the boxed value straight through after the null check.
      return predicate.apply(checkNotNull(character));
    }

    @Override
    public String toString() {
      return "CharMatcher.forPredicate(" + predicate + ")";
    }
  }
}