001/* 002 * Copyright (C) 2008 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.base; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkPositionIndex; 020 021import com.google.common.annotations.GwtCompatible; 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.VisibleForTesting; 024import java.util.Arrays; 025import java.util.BitSet; 026 027/** 028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does 029 * for any {@link Object}. Also offers basic text processing methods based on this function. 030 * Implementations are strongly encouraged to be side-effect-free and immutable. 031 * 032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean 033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}". 034 * 035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a 036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. 
It does not understand 037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code 038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of 039 * assigned characters, including important CJK characters and emoji. 040 * 041 * <p>Supplementary characters are <a 042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded 043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as 044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s. 045 * 046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for 047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For 048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner. 049 * 050 * <p>Example usages: 051 * 052 * <pre> 053 * String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput); 054 * if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre> 055 * 056 * <p>See the Guava User Guide article on <a 057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher} 058 * </a>. 
 *
 * @author Kevin Bourrillion
 * @since 1.0
 */
@GwtCompatible(emulated = true)
public abstract class CharMatcher implements Predicate<Character> {
  /*
   * N777777777NO
   * N7777777777777N
   * M777777777777777N
   * $N877777777D77777M
   * N M77777777ONND777M
   * MN777777777NN D777
   * N7ZN777777777NN ~M7778
   * N777777777777MMNN88777N
   * N777777777777MNZZZ7777O
   * DZN7777O77777777777777
   * N7OONND7777777D77777N
   * 8$M++++?N???$77777$
   * M7++++N+M77777777N
   * N77O777777777777$ M
   * DNNM$$$$777777N D
   * N$N:=N$777N7777M NZ
   * 77Z::::N777777777 ODZZZ
   * 77N::::::N77777777M NNZZZ$
   * $777:::::::77777777MN ZM8ZZZZZ
   * 777M::::::Z7777777Z77 N++ZZZZNN
   * 7777M:::::M7777777$777M $++IZZZZM
   * M777$:::::N777777$M7777M +++++ZZZDN
   * NN$::::::7777$$M777777N N+++ZZZZNZ
   * N::::::N:7$O:77777777 N++++ZZZZN
   * M::::::::::::N77777777+ +?+++++ZZZM
   * 8::::::::::::D77777777M O+++++ZZ
   * ::::::::::::M777777777N O+?D
   * M:::::::::::M77777777778 77=
   * D=::::::::::N7777777777N 777
   * INN===::::::=77777777777N I777N
   * ?777N========N7777777777787M N7777
   * 77777$D======N77777777777N777N? N777777
   * I77777$$$N7===M$$77777777$77777777$MMZ77777777N
   * $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
   * M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND
   * O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N
   * 7 :N MNN$$$$M$ $$$777$8 8D8I
   * NMM.:7O 777777778
   * 7777777MN
   * M NO .7:
   * M : M
   * 8
   */

  // Constant matcher factory methods

  /**
   * Matches any character.
   *
   * @return the shared {@code Any.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code ANY})
   */
  public static CharMatcher any() {
    return Any.INSTANCE;
  }

  /**
   * Matches no characters.
   *
   * @return the shared {@code None.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code NONE})
   */
  public static CharMatcher none() {
    return None.INSTANCE;
  }

  /**
   * Determines whether a character is whitespace according to the latest Unicode standard, as
   * illustrated <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a <a
   * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.)
   *
   * <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
   * date.
   *
   * @return the shared {@code Whitespace.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code WHITESPACE})
   */
  public static CharMatcher whitespace() {
    return Whitespace.INSTANCE;
  }

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
   * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
   * discussion of that term.
   *
   * @return the shared {@code BreakingWhitespace.INSTANCE} matcher
   * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
   */
  public static CharMatcher breakingWhitespace() {
    return BreakingWhitespace.INSTANCE;
  }

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than 128.
   *
   * @return the shared {@code Ascii.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code ASCII})
   */
  public static CharMatcher ascii() {
    return Ascii.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP digit according to <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If
   * you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class documentation.
   * @return the shared {@code Digit.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code DIGIT})
   */
  @Deprecated
  public static CharMatcher digit() {
    return Digit.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
   * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
   * '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class documentation.
   * @return the shared {@code JavaDigit.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
   */
  @Deprecated
  public static CharMatcher javaDigit() {
    return JavaDigit.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP letter according to {@linkplain
   * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin
   * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @deprecated Most letters are supplementary characters; see the class documentation.
   * @return the shared {@code JavaLetter.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
   */
  @Deprecated
  public static CharMatcher javaLetter() {
    return JavaLetter.INSTANCE;
  }

  /**
   * Determines whether a character is a BMP letter or digit according to {@linkplain
   * Character#isLetterOrDigit(char) Java's definition}.
   *
   * @deprecated Most letters and digits are supplementary characters; see the class documentation.
   * @return the shared {@code JavaLetterOrDigit.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT})
   */
  @Deprecated
  public static CharMatcher javaLetterOrDigit() {
    return JavaLetterOrDigit.INSTANCE;
  }

  /**
   * Determines whether a BMP character is upper case according to {@linkplain
   * Character#isUpperCase(char) Java's definition}.
   *
   * @deprecated Some uppercase characters are supplementary characters; see the class
   *     documentation.
   * @return the shared {@code JavaUpperCase.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
   */
  @Deprecated
  public static CharMatcher javaUpperCase() {
    return JavaUpperCase.INSTANCE;
  }

  /**
   * Determines whether a BMP character is lower case according to {@linkplain
   * Character#isLowerCase(char) Java's definition}.
   *
   * @deprecated Some lowercase characters are supplementary characters; see the class
   *     documentation.
   * @return the shared {@code JavaLowerCase.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
   */
  @Deprecated
  public static CharMatcher javaLowerCase() {
    return JavaLowerCase.INSTANCE;
  }

  /**
   * Determines whether a character is an ISO control character as specified by {@link
   * Character#isISOControl(char)}.
   *
   * <p>All ISO control codes are on the BMP and thus supported by this API.
   *
   * @return the shared {@code JavaIsoControl.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
   */
  public static CharMatcher javaIsoControl() {
    return JavaIsoControl.INSTANCE;
  }

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is any of
   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
   * PRIVATE_USE according to ICU4J.
   *
   * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
   *
   * @deprecated Most invisible characters are supplementary characters; see the class
   *     documentation.
   * @return the shared {@code Invisible.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code INVISIBLE})
   */
  @Deprecated
  public static CharMatcher invisible() {
    return Invisible.INSTANCE;
  }

  /**
   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
   * errs on the side of returning {@code false} (that is, it tends to assume a character is
   * double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
   * date.
   *
   * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
   *
   * @deprecated Many such characters are supplementary characters; see the class documentation.
   * @return the shared {@code SingleWidth.INSTANCE} matcher
   * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
   */
  @Deprecated
  public static CharMatcher singleWidth() {
    return SingleWidth.INSTANCE;
  }

  // Legacy constants

  /**
   * Determines whether a character is whitespace according to the latest Unicode
   * standard, as illustrated
   * <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
   * "whitespace"</a>.)
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant
   * to keep it up to date.
   *
   * @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher WHITESPACE = whitespace();

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace
   * which can be interpreted as a break between words for formatting purposes). See
   * {@link #whitespace()} for a discussion of that term.
   *
   * @since 2.0
   * @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled
   *     to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than
   * 128.
   *
   * @deprecated Use {@link #ascii()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ASCII = ascii();

  /**
   * Determines whether a character is a BMP digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">
   * Unicode</a>. If you only care to match ASCII digits, you can use
   * {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #digit()} instead. This
   *     constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher DIGIT = digit();

  /**
   * Determines whether a character is a BMP digit according to
   * {@linkplain Character#isDigit(char) Java's definition}. If you only care to match
   * ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaDigit()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_DIGIT = javaDigit();

  /**
   * Determines whether a character is a letter according to
   * {@linkplain Character#isLetter(char) Java's definition}. If you only care to
   * match letters of the Latin alphabet, you can use
   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @deprecated Most letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetter()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER = javaLetter();

  /**
   * Determines whether a character is a BMP letter or digit according to
   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
   *
   * @deprecated Most letters and digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetterOrDigit()}
   *     instead. This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit();

  /**
   * Determines whether a BMP character is upper case according to
   * {@linkplain Character#isUpperCase(char) Java's definition}.
   *
   * @deprecated Some uppercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaUpperCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase();

  /**
   * Determines whether a BMP character is lower case according to
   * {@linkplain Character#isLowerCase(char) Java's definition}.
   *
   * @deprecated Some lowercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLowerCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase();

  /**
   * Determines whether a character is an ISO control character as specified by
   * {@link Character#isISOControl(char)}.
   *
   * @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to
   *     be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl();

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is
   * any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
   * SURROGATE, and PRIVATE_USE according to ICU4J.
   *
   * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
   *
   * @deprecated Most invisible characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #invisible()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher INVISIBLE = invisible();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt,
   * this matcher errs on the side of returning {@code false} (that is, it tends to
   * assume a character is double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to
   * keep it up to date.
   *
   * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
   *
   * @deprecated Many such characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #singleWidth()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher SINGLE_WIDTH = singleWidth();

  /**
   * Matches any character.
   *
   * @deprecated Use {@link #any()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ANY = any();

  /**
   * Matches no characters.
   *
   * @deprecated Use {@link #none()} instead. This constant is scheduled to be
   *     removed in June 2018.
468 */ 469 @com.google.common.annotations.Beta 470 @Deprecated 471 public static final CharMatcher NONE = none(); 472 473 // Static factories 474 475 /** Returns a {@code char} matcher that matches only one specified BMP character. */ 476 public static CharMatcher is(final char match) { 477 return new Is(match); 478 } 479 480 /** 481 * Returns a {@code char} matcher that matches any character except the BMP character specified. 482 * 483 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 484 */ 485 public static CharMatcher isNot(final char match) { 486 return new IsNot(match); 487 } 488 489 /** 490 * Returns a {@code char} matcher that matches any BMP character present in the given character 491 * sequence. Returns a bogus matcher if the sequence contains supplementary characters. 492 */ 493 public static CharMatcher anyOf(final CharSequence sequence) { 494 switch (sequence.length()) { 495 case 0: 496 return none(); 497 case 1: 498 return is(sequence.charAt(0)); 499 case 2: 500 return isEither(sequence.charAt(0), sequence.charAt(1)); 501 default: 502 // TODO(lowasser): is it potentially worth just going ahead and building a precomputed 503 // matcher? 504 return new AnyOf(sequence); 505 } 506 } 507 508 /** 509 * Returns a {@code char} matcher that matches any BMP character not present in the given 510 * character sequence. Returns a bogus matcher if the sequence contains supplementary characters. 511 */ 512 public static CharMatcher noneOf(CharSequence sequence) { 513 return anyOf(sequence).negate(); 514 } 515 516 /** 517 * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints 518 * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code 519 * CharMatcher.inRange('a', 'z')}. 
520 * 521 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 522 */ 523 public static CharMatcher inRange(final char startInclusive, final char endInclusive) { 524 return new InRange(startInclusive, endInclusive); 525 } 526 527 /** 528 * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but 529 * which operates on primitive {@code char} instances instead. 530 */ 531 public static CharMatcher forPredicate(final Predicate<? super Character> predicate) { 532 return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate); 533 } 534 535 // Constructors 536 537 /** 538 * Constructor for use by subclasses. When subclassing, you may want to override {@code 539 * toString()} to provide a useful description. 540 */ 541 protected CharMatcher() {} 542 543 // Abstract methods 544 545 /** Determines a true or false value for the given character. */ 546 public abstract boolean matches(char c); 547 548 // Non-static factories 549 550 /** Returns a matcher that matches any character not matched by this matcher. */ 551 // @Override under Java 8 but not under Java 7 552 public CharMatcher negate() { 553 return new Negated(this); 554 } 555 556 /** 557 * Returns a matcher that matches any character matched by both this matcher and {@code other}. 558 */ 559 public CharMatcher and(CharMatcher other) { 560 return new And(this, other); 561 } 562 563 /** 564 * Returns a matcher that matches any character matched by either this matcher or {@code other}. 565 */ 566 public CharMatcher or(CharMatcher other) { 567 return new Or(this, other); 568 } 569 570 /** 571 * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to 572 * query than the original; your mileage may vary. Precomputation takes time and is likely to be 573 * worthwhile only if the precomputed matcher is queried many thousands of times. 
574 * 575 * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a 576 * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a 577 * worthwhile tradeoff in a browser. 578 */ 579 public CharMatcher precomputed() { 580 return Platform.precomputeCharMatcher(this); 581 } 582 583 private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1; 584 585 /** 586 * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method 587 * on {@link Platform} so that we can have different behavior in GWT. 588 * 589 * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the 590 * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables 591 * for matchers that only match a few characters, and so on. In the worst-case scenario, it 592 * constructs an eight-kilobyte bit array and queries that. In many situations this produces a 593 * matcher which is faster to query than the original. 594 */ 595 @GwtIncompatible // SmallCharMatcher 596 CharMatcher precomputedInternal() { 597 final BitSet table = new BitSet(); 598 setBits(table); 599 int totalCharacters = table.cardinality(); 600 if (totalCharacters * 2 <= DISTINCT_CHARS) { 601 return precomputedPositive(totalCharacters, table, toString()); 602 } else { 603 // TODO(lowasser): is it worth it to worry about the last character of large matchers? 604 table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 605 int negatedCharacters = DISTINCT_CHARS - totalCharacters; 606 String suffix = ".negate()"; 607 final String description = toString(); 608 String negatedDescription = 609 description.endsWith(suffix) 610 ? 
description.substring(0, description.length() - suffix.length()) 611 : description + suffix; 612 return new NegatedFastMatcher( 613 precomputedPositive(negatedCharacters, table, negatedDescription)) { 614 @Override 615 public String toString() { 616 return description; 617 } 618 }; 619 } 620 } 621 622 /** 623 * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper. 624 */ 625 @GwtIncompatible // SmallCharMatcher 626 private static CharMatcher precomputedPositive( 627 int totalCharacters, BitSet table, String description) { 628 switch (totalCharacters) { 629 case 0: 630 return none(); 631 case 1: 632 return is((char) table.nextSetBit(0)); 633 case 2: 634 char c1 = (char) table.nextSetBit(0); 635 char c2 = (char) table.nextSetBit(c1 + 1); 636 return isEither(c1, c2); 637 default: 638 return isSmall(totalCharacters, table.length()) 639 ? SmallCharMatcher.from(table, description) 640 : new BitSetMatcher(table, description); 641 } 642 } 643 644 @GwtIncompatible // SmallCharMatcher 645 private static boolean isSmall(int totalCharacters, int tableLength) { 646 return totalCharacters <= SmallCharMatcher.MAX_SIZE 647 && tableLength > (totalCharacters * 4 * Character.SIZE); 648 // err on the side of BitSetMatcher 649 } 650 651 /** Sets bits in {@code table} matched by this matcher. */ 652 @GwtIncompatible // used only from other GwtIncompatible code 653 void setBits(BitSet table) { 654 for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) { 655 if (matches((char) c)) { 656 table.set(c); 657 } 658 } 659 } 660 661 // Text processing routines 662 663 /** 664 * Returns {@code true} if a character sequence contains at least one matching BMP character. 665 * Equivalent to {@code !matchesNoneOf(sequence)}. 666 * 667 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 668 * character, until this returns {@code true} or the end is reached. 
669 * 670 * @param sequence the character sequence to examine, possibly empty 671 * @return {@code true} if this matcher matches at least one character in the sequence 672 * @since 8.0 673 */ 674 public boolean matchesAnyOf(CharSequence sequence) { 675 return !matchesNoneOf(sequence); 676 } 677 678 /** 679 * Returns {@code true} if a character sequence contains only matching BMP characters. 680 * 681 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 682 * character, until this returns {@code false} or the end is reached. 683 * 684 * @param sequence the character sequence to examine, possibly empty 685 * @return {@code true} if this matcher matches every character in the sequence, including when 686 * the sequence is empty 687 */ 688 public boolean matchesAllOf(CharSequence sequence) { 689 for (int i = sequence.length() - 1; i >= 0; i--) { 690 if (!matches(sequence.charAt(i))) { 691 return false; 692 } 693 } 694 return true; 695 } 696 697 /** 698 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to 699 * {@code !matchesAnyOf(sequence)}. 700 * 701 * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each 702 * character, until this returns {@code true} or the end is reached. 703 * 704 * @param sequence the character sequence to examine, possibly empty 705 * @return {@code true} if this matcher matches no characters in the sequence, including when the 706 * sequence is empty 707 */ 708 public boolean matchesNoneOf(CharSequence sequence) { 709 return indexIn(sequence) == -1; 710 } 711 712 /** 713 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if 714 * no matching character is present. 715 * 716 * <p>The default implementation iterates over the sequence in forward order calling {@link 717 * #matches} for each character. 
718 * 719 * @param sequence the character sequence to examine from the beginning 720 * @return an index, or {@code -1} if no character matches 721 */ 722 public int indexIn(CharSequence sequence) { 723 return indexIn(sequence, 0); 724 } 725 726 /** 727 * Returns the index of the first matching BMP character in a character sequence, starting from a 728 * given position, or {@code -1} if no character matches after that position. 729 * 730 * <p>The default implementation iterates over the sequence in forward order, beginning at {@code 731 * start}, calling {@link #matches} for each character. 732 * 733 * @param sequence the character sequence to examine 734 * @param start the first index to examine; must be nonnegative and no greater than {@code 735 * sequence.length()} 736 * @return the index of the first matching character, guaranteed to be no less than {@code start}, 737 * or {@code -1} if no character matches 738 * @throws IndexOutOfBoundsException if start is negative or greater than {@code 739 * sequence.length()} 740 */ 741 public int indexIn(CharSequence sequence, int start) { 742 int length = sequence.length(); 743 checkPositionIndex(start, length); 744 for (int i = start; i < length; i++) { 745 if (matches(sequence.charAt(i))) { 746 return i; 747 } 748 } 749 return -1; 750 } 751 752 /** 753 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if 754 * no matching character is present. 755 * 756 * <p>The default implementation iterates over the sequence in reverse order calling {@link 757 * #matches} for each character. 
758 * 759 * @param sequence the character sequence to examine from the end 760 * @return an index, or {@code -1} if no character matches 761 */ 762 public int lastIndexIn(CharSequence sequence) { 763 for (int i = sequence.length() - 1; i >= 0; i--) { 764 if (matches(sequence.charAt(i))) { 765 return i; 766 } 767 } 768 return -1; 769 } 770 771 /** 772 * Returns the number of matching {@code char}s found in a character sequence. 773 * 774 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}(). 775 */ 776 public int countIn(CharSequence sequence) { 777 int count = 0; 778 for (int i = 0; i < sequence.length(); i++) { 779 if (matches(sequence.charAt(i))) { 780 count++; 781 } 782 } 783 return count; 784 } 785 786 /** 787 * Returns a string containing all non-matching characters of a character sequence, in order. For 788 * example: 789 * 790 * <pre>{@code 791 * CharMatcher.is('a').removeFrom("bazaar") 792 * }</pre> 793 * 794 * ... returns {@code "bzr"}. 795 */ 796 public String removeFrom(CharSequence sequence) { 797 String string = sequence.toString(); 798 int pos = indexIn(string); 799 if (pos == -1) { 800 return string; 801 } 802 803 char[] chars = string.toCharArray(); 804 int spread = 1; 805 806 // This unusual loop comes from extensive benchmarking 807 OUT: 808 while (true) { 809 pos++; 810 while (true) { 811 if (pos == chars.length) { 812 break OUT; 813 } 814 if (matches(chars[pos])) { 815 break; 816 } 817 chars[pos - spread] = chars[pos]; 818 pos++; 819 } 820 spread++; 821 } 822 return new String(chars, 0, pos - spread); 823 } 824 825 /** 826 * Returns a string containing all matching BMP characters of a character sequence, in order. For 827 * example: 828 * 829 * <pre>{@code 830 * CharMatcher.is('a').retainFrom("bazaar") 831 * }</pre> 832 * 833 * ... returns {@code "aaa"}. 
834 */ 835 public String retainFrom(CharSequence sequence) { 836 return negate().removeFrom(sequence); 837 } 838 839 /** 840 * Returns a string copy of the input character sequence, with each matching BMP character 841 * replaced by a given replacement character. For example: 842 * 843 * <pre>{@code 844 * CharMatcher.is('a').replaceFrom("radar", 'o') 845 * }</pre> 846 * 847 * ... returns {@code "rodor"}. 848 * 849 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 850 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 851 * character. 852 * 853 * @param sequence the character sequence to replace matching characters in 854 * @param replacement the character to append to the result string in place of each matching 855 * character in {@code sequence} 856 * @return the new string 857 */ 858 public String replaceFrom(CharSequence sequence, char replacement) { 859 String string = sequence.toString(); 860 int pos = indexIn(string); 861 if (pos == -1) { 862 return string; 863 } 864 char[] chars = string.toCharArray(); 865 chars[pos] = replacement; 866 for (int i = pos + 1; i < chars.length; i++) { 867 if (matches(chars[i])) { 868 chars[i] = replacement; 869 } 870 } 871 return new String(chars); 872 } 873 874 /** 875 * Returns a string copy of the input character sequence, with each matching BMP character 876 * replaced by a given replacement sequence. For example: 877 * 878 * <pre>{@code 879 * CharMatcher.is('a').replaceFrom("yaha", "oo") 880 * }</pre> 881 * 882 * ... returns {@code "yoohoo"}. 883 * 884 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better 885 * off calling {@link #replaceFrom(CharSequence, char)} directly. 
886 * 887 * @param sequence the character sequence to replace matching characters in 888 * @param replacement the characters to append to the result string in place of each matching 889 * character in {@code sequence} 890 * @return the new string 891 */ 892 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 893 int replacementLen = replacement.length(); 894 if (replacementLen == 0) { 895 return removeFrom(sequence); 896 } 897 if (replacementLen == 1) { 898 return replaceFrom(sequence, replacement.charAt(0)); 899 } 900 901 String string = sequence.toString(); 902 int pos = indexIn(string); 903 if (pos == -1) { 904 return string; 905 } 906 907 int len = string.length(); 908 StringBuilder buf = new StringBuilder((len * 3 / 2) + 16); 909 910 int oldpos = 0; 911 do { 912 buf.append(string, oldpos, pos); 913 buf.append(replacement); 914 oldpos = pos + 1; 915 pos = indexIn(string, oldpos); 916 } while (pos != -1); 917 918 buf.append(string, oldpos, len); 919 return buf.toString(); 920 } 921 922 /** 923 * Returns a substring of the input character sequence that omits all matching BMP characters from 924 * the beginning and from the end of the string. For example: 925 * 926 * <pre>{@code 927 * CharMatcher.anyOf("ab").trimFrom("abacatbab") 928 * }</pre> 929 * 930 * ... returns {@code "cat"}. 931 * 932 * <p>Note that: 933 * 934 * <pre>{@code 935 * CharMatcher.inRange('\0', ' ').trimFrom(str) 936 * }</pre> 937 * 938 * ... is equivalent to {@link String#trim()}. 
939 */ 940 public String trimFrom(CharSequence sequence) { 941 int len = sequence.length(); 942 int first; 943 int last; 944 945 for (first = 0; first < len; first++) { 946 if (!matches(sequence.charAt(first))) { 947 break; 948 } 949 } 950 for (last = len - 1; last > first; last--) { 951 if (!matches(sequence.charAt(last))) { 952 break; 953 } 954 } 955 956 return sequence.subSequence(first, last + 1).toString(); 957 } 958 959 /** 960 * Returns a substring of the input character sequence that omits all matching BMP characters from 961 * the beginning of the string. For example: 962 * 963 * <pre>{@code 964 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab") 965 * }</pre> 966 * 967 * ... returns {@code "catbab"}. 968 */ 969 public String trimLeadingFrom(CharSequence sequence) { 970 int len = sequence.length(); 971 for (int first = 0; first < len; first++) { 972 if (!matches(sequence.charAt(first))) { 973 return sequence.subSequence(first, len).toString(); 974 } 975 } 976 return ""; 977 } 978 979 /** 980 * Returns a substring of the input character sequence that omits all matching BMP characters from 981 * the end of the string. For example: 982 * 983 * <pre>{@code 984 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab") 985 * }</pre> 986 * 987 * ... returns {@code "abacat"}. 988 */ 989 public String trimTrailingFrom(CharSequence sequence) { 990 int len = sequence.length(); 991 for (int last = len - 1; last >= 0; last--) { 992 if (!matches(sequence.charAt(last))) { 993 return sequence.subSequence(0, last + 1).toString(); 994 } 995 } 996 return ""; 997 } 998 999 /** 1000 * Returns a string copy of the input character sequence, with each group of consecutive matching 1001 * BMP characters replaced by a single replacement character. For example: 1002 * 1003 * <pre>{@code 1004 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-') 1005 * }</pre> 1006 * 1007 * ... returns {@code "b-p-r"}. 
1008 * 1009 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching 1010 * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each 1011 * character. 1012 * 1013 * @param sequence the character sequence to replace matching groups of characters in 1014 * @param replacement the character to append to the result string in place of each group of 1015 * matching characters in {@code sequence} 1016 * @return the new string 1017 */ 1018 public String collapseFrom(CharSequence sequence, char replacement) { 1019 // This implementation avoids unnecessary allocation. 1020 int len = sequence.length(); 1021 for (int i = 0; i < len; i++) { 1022 char c = sequence.charAt(i); 1023 if (matches(c)) { 1024 if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) { 1025 // a no-op replacement 1026 i++; 1027 } else { 1028 StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement); 1029 return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true); 1030 } 1031 } 1032 } 1033 // no replacement needed 1034 return sequence.toString(); 1035 } 1036 1037 /** 1038 * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that 1039 * groups of matching BMP characters at the start or end of the sequence are removed without 1040 * replacement. 1041 */ 1042 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1043 // This implementation avoids unnecessary allocation. 1044 int len = sequence.length(); 1045 int first = 0; 1046 int last = len - 1; 1047 1048 while (first < len && matches(sequence.charAt(first))) { 1049 first++; 1050 } 1051 1052 while (last > first && matches(sequence.charAt(last))) { 1053 last--; 1054 } 1055 1056 return (first == 0 && last == len - 1) 1057 ? 
collapseFrom(sequence, replacement) 1058 : finishCollapseFrom( 1059 sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false); 1060 } 1061 1062 private String finishCollapseFrom( 1063 CharSequence sequence, 1064 int start, 1065 int end, 1066 char replacement, 1067 StringBuilder builder, 1068 boolean inMatchingGroup) { 1069 for (int i = start; i < end; i++) { 1070 char c = sequence.charAt(i); 1071 if (matches(c)) { 1072 if (!inMatchingGroup) { 1073 builder.append(replacement); 1074 inMatchingGroup = true; 1075 } 1076 } else { 1077 builder.append(c); 1078 inMatchingGroup = false; 1079 } 1080 } 1081 return builder.toString(); 1082 } 1083 1084 /** 1085 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches} 1086 * instead. 1087 */ 1088 @Deprecated 1089 @Override 1090 public boolean apply(Character character) { 1091 return matches(character); 1092 } 1093 1094 /** 1095 * Returns a string representation of this {@code CharMatcher}, such as {@code 1096 * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}. 1097 */ 1098 @Override 1099 public String toString() { 1100 return super.toString(); 1101 } 1102 1103 /** 1104 * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where 1105 * "12AB" is the four hexadecimal digits representing the 16-bit code unit. 1106 */ 1107 private static String showCharacter(char c) { 1108 String hex = "0123456789ABCDEF"; 1109 char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'}; 1110 for (int i = 0; i < 4; i++) { 1111 tmp[5 - i] = hex.charAt(c & 0xF); 1112 c = (char) (c >> 4); 1113 } 1114 return String.copyValueOf(tmp); 1115 } 1116 1117 // Fast matchers 1118 1119 /** A matcher for which precomputation will not yield any significant benefit. 
*/ 1120 abstract static class FastMatcher extends CharMatcher { 1121 1122 @Override 1123 public final CharMatcher precomputed() { 1124 return this; 1125 } 1126 1127 @Override 1128 public CharMatcher negate() { 1129 return new NegatedFastMatcher(this); 1130 } 1131 } 1132 1133 /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */ 1134 abstract static class NamedFastMatcher extends FastMatcher { 1135 1136 private final String description; 1137 1138 NamedFastMatcher(String description) { 1139 this.description = checkNotNull(description); 1140 } 1141 1142 @Override 1143 public final String toString() { 1144 return description; 1145 } 1146 } 1147 1148 /** Negation of a {@link FastMatcher}. */ 1149 static class NegatedFastMatcher extends Negated { 1150 1151 NegatedFastMatcher(CharMatcher original) { 1152 super(original); 1153 } 1154 1155 @Override 1156 public final CharMatcher precomputed() { 1157 return this; 1158 } 1159 } 1160 1161 /** Fast matcher using a {@link BitSet} table of matching characters. */ 1162 @GwtIncompatible // used only from other GwtIncompatible code 1163 private static final class BitSetMatcher extends NamedFastMatcher { 1164 1165 private final BitSet table; 1166 1167 private BitSetMatcher(BitSet table, String description) { 1168 super(description); 1169 if (table.length() + Long.SIZE < table.size()) { 1170 table = (BitSet) table.clone(); 1171 // If only we could actually call BitSet.trimToSize() ourselves... 1172 } 1173 this.table = table; 1174 } 1175 1176 @Override 1177 public boolean matches(char c) { 1178 return table.get(c); 1179 } 1180 1181 @Override 1182 void setBits(BitSet bitSet) { 1183 bitSet.or(table); 1184 } 1185 } 1186 1187 // Static constant implementation classes 1188 1189 /** Implementation of {@link #any()}. 
*/ 1190 private static final class Any extends NamedFastMatcher { 1191 1192 static final Any INSTANCE = new Any(); 1193 1194 private Any() { 1195 super("CharMatcher.any()"); 1196 } 1197 1198 @Override 1199 public boolean matches(char c) { 1200 return true; 1201 } 1202 1203 @Override 1204 public int indexIn(CharSequence sequence) { 1205 return (sequence.length() == 0) ? -1 : 0; 1206 } 1207 1208 @Override 1209 public int indexIn(CharSequence sequence, int start) { 1210 int length = sequence.length(); 1211 checkPositionIndex(start, length); 1212 return (start == length) ? -1 : start; 1213 } 1214 1215 @Override 1216 public int lastIndexIn(CharSequence sequence) { 1217 return sequence.length() - 1; 1218 } 1219 1220 @Override 1221 public boolean matchesAllOf(CharSequence sequence) { 1222 checkNotNull(sequence); 1223 return true; 1224 } 1225 1226 @Override 1227 public boolean matchesNoneOf(CharSequence sequence) { 1228 return sequence.length() == 0; 1229 } 1230 1231 @Override 1232 public String removeFrom(CharSequence sequence) { 1233 checkNotNull(sequence); 1234 return ""; 1235 } 1236 1237 @Override 1238 public String replaceFrom(CharSequence sequence, char replacement) { 1239 char[] array = new char[sequence.length()]; 1240 Arrays.fill(array, replacement); 1241 return new String(array); 1242 } 1243 1244 @Override 1245 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1246 StringBuilder result = new StringBuilder(sequence.length() * replacement.length()); 1247 for (int i = 0; i < sequence.length(); i++) { 1248 result.append(replacement); 1249 } 1250 return result.toString(); 1251 } 1252 1253 @Override 1254 public String collapseFrom(CharSequence sequence, char replacement) { 1255 return (sequence.length() == 0) ? 
"" : String.valueOf(replacement); 1256 } 1257 1258 @Override 1259 public String trimFrom(CharSequence sequence) { 1260 checkNotNull(sequence); 1261 return ""; 1262 } 1263 1264 @Override 1265 public int countIn(CharSequence sequence) { 1266 return sequence.length(); 1267 } 1268 1269 @Override 1270 public CharMatcher and(CharMatcher other) { 1271 return checkNotNull(other); 1272 } 1273 1274 @Override 1275 public CharMatcher or(CharMatcher other) { 1276 checkNotNull(other); 1277 return this; 1278 } 1279 1280 @Override 1281 public CharMatcher negate() { 1282 return none(); 1283 } 1284 } 1285 1286 /** Implementation of {@link #none()}. */ 1287 private static final class None extends NamedFastMatcher { 1288 1289 static final None INSTANCE = new None(); 1290 1291 private None() { 1292 super("CharMatcher.none()"); 1293 } 1294 1295 @Override 1296 public boolean matches(char c) { 1297 return false; 1298 } 1299 1300 @Override 1301 public int indexIn(CharSequence sequence) { 1302 checkNotNull(sequence); 1303 return -1; 1304 } 1305 1306 @Override 1307 public int indexIn(CharSequence sequence, int start) { 1308 int length = sequence.length(); 1309 checkPositionIndex(start, length); 1310 return -1; 1311 } 1312 1313 @Override 1314 public int lastIndexIn(CharSequence sequence) { 1315 checkNotNull(sequence); 1316 return -1; 1317 } 1318 1319 @Override 1320 public boolean matchesAllOf(CharSequence sequence) { 1321 return sequence.length() == 0; 1322 } 1323 1324 @Override 1325 public boolean matchesNoneOf(CharSequence sequence) { 1326 checkNotNull(sequence); 1327 return true; 1328 } 1329 1330 @Override 1331 public String removeFrom(CharSequence sequence) { 1332 return sequence.toString(); 1333 } 1334 1335 @Override 1336 public String replaceFrom(CharSequence sequence, char replacement) { 1337 return sequence.toString(); 1338 } 1339 1340 @Override 1341 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 1342 checkNotNull(replacement); 1343 return 
sequence.toString(); 1344 } 1345 1346 @Override 1347 public String collapseFrom(CharSequence sequence, char replacement) { 1348 return sequence.toString(); 1349 } 1350 1351 @Override 1352 public String trimFrom(CharSequence sequence) { 1353 return sequence.toString(); 1354 } 1355 1356 @Override 1357 public String trimLeadingFrom(CharSequence sequence) { 1358 return sequence.toString(); 1359 } 1360 1361 @Override 1362 public String trimTrailingFrom(CharSequence sequence) { 1363 return sequence.toString(); 1364 } 1365 1366 @Override 1367 public int countIn(CharSequence sequence) { 1368 checkNotNull(sequence); 1369 return 0; 1370 } 1371 1372 @Override 1373 public CharMatcher and(CharMatcher other) { 1374 checkNotNull(other); 1375 return this; 1376 } 1377 1378 @Override 1379 public CharMatcher or(CharMatcher other) { 1380 return checkNotNull(other); 1381 } 1382 1383 @Override 1384 public CharMatcher negate() { 1385 return any(); 1386 } 1387 } 1388 1389 /** Implementation of {@link #whitespace()}. 
*/ 1390 @VisibleForTesting 1391 static final class Whitespace extends NamedFastMatcher { 1392 1393 static final String TABLE = 1394 "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000" 1395 + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680" 1396 + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009" 1397 + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000"; 1398 static final int MULTIPLIER = 1682554634; 1399 static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1); 1400 1401 static final Whitespace INSTANCE = new Whitespace(); 1402 1403 Whitespace() { 1404 super("CharMatcher.whitespace()"); 1405 } 1406 1407 @Override 1408 public boolean matches(char c) { 1409 return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c; 1410 } 1411 1412 @GwtIncompatible // used only from other GwtIncompatible code 1413 @Override 1414 void setBits(BitSet table) { 1415 for (int i = 0; i < TABLE.length(); i++) { 1416 table.set(TABLE.charAt(i)); 1417 } 1418 } 1419 } 1420 1421 /** Implementation of {@link #breakingWhitespace()}. */ 1422 private static final class BreakingWhitespace extends CharMatcher { 1423 1424 static final CharMatcher INSTANCE = new BreakingWhitespace(); 1425 1426 @Override 1427 public boolean matches(char c) { 1428 switch (c) { 1429 case '\t': 1430 case '\n': 1431 case '\013': 1432 case '\f': 1433 case '\r': 1434 case ' ': 1435 case '\u0085': 1436 case '\u1680': 1437 case '\u2028': 1438 case '\u2029': 1439 case '\u205f': 1440 case '\u3000': 1441 return true; 1442 case '\u2007': 1443 return false; 1444 default: 1445 return c >= '\u2000' && c <= '\u200a'; 1446 } 1447 } 1448 1449 @Override 1450 public String toString() { 1451 return "CharMatcher.breakingWhitespace()"; 1452 } 1453 } 1454 1455 /** Implementation of {@link #ascii()}. 
*/ 1456 private static final class Ascii extends NamedFastMatcher { 1457 1458 static final Ascii INSTANCE = new Ascii(); 1459 1460 Ascii() { 1461 super("CharMatcher.ascii()"); 1462 } 1463 1464 @Override 1465 public boolean matches(char c) { 1466 return c <= '\u007f'; 1467 } 1468 } 1469 1470 /** Implementation that matches characters that fall within multiple ranges. */ 1471 private static class RangesMatcher extends CharMatcher { 1472 1473 private final String description; 1474 private final char[] rangeStarts; 1475 private final char[] rangeEnds; 1476 1477 RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) { 1478 this.description = description; 1479 this.rangeStarts = rangeStarts; 1480 this.rangeEnds = rangeEnds; 1481 checkArgument(rangeStarts.length == rangeEnds.length); 1482 for (int i = 0; i < rangeStarts.length; i++) { 1483 checkArgument(rangeStarts[i] <= rangeEnds[i]); 1484 if (i + 1 < rangeStarts.length) { 1485 checkArgument(rangeEnds[i] < rangeStarts[i + 1]); 1486 } 1487 } 1488 } 1489 1490 @Override 1491 public boolean matches(char c) { 1492 int index = Arrays.binarySearch(rangeStarts, c); 1493 if (index >= 0) { 1494 return true; 1495 } else { 1496 index = ~index - 1; 1497 return index >= 0 && c <= rangeEnds[index]; 1498 } 1499 } 1500 1501 @Override 1502 public String toString() { 1503 return description; 1504 } 1505 } 1506 1507 /** Implementation of {@link #digit()}. */ 1508 private static final class Digit extends RangesMatcher { 1509 // Plug the following UnicodeSet pattern into 1510 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1511 // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]] 1512 // and get the zeroes from there. 1513 1514 // Must be in ascending order. 
1515 private static final String ZEROES = 1516 "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6" 1517 + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0" 1518 + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10"; 1519 1520 private static char[] zeroes() { 1521 return ZEROES.toCharArray(); 1522 } 1523 1524 private static char[] nines() { 1525 char[] nines = new char[ZEROES.length()]; 1526 for (int i = 0; i < ZEROES.length(); i++) { 1527 nines[i] = (char) (ZEROES.charAt(i) + 9); 1528 } 1529 return nines; 1530 } 1531 1532 static final Digit INSTANCE = new Digit(); 1533 1534 private Digit() { 1535 super("CharMatcher.digit()", zeroes(), nines()); 1536 } 1537 } 1538 1539 /** Implementation of {@link #javaDigit()}. */ 1540 private static final class JavaDigit extends CharMatcher { 1541 1542 static final JavaDigit INSTANCE = new JavaDigit(); 1543 1544 @Override 1545 public boolean matches(char c) { 1546 return Character.isDigit(c); 1547 } 1548 1549 @Override 1550 public String toString() { 1551 return "CharMatcher.javaDigit()"; 1552 } 1553 } 1554 1555 /** Implementation of {@link #javaLetter()}. */ 1556 private static final class JavaLetter extends CharMatcher { 1557 1558 static final JavaLetter INSTANCE = new JavaLetter(); 1559 1560 @Override 1561 public boolean matches(char c) { 1562 return Character.isLetter(c); 1563 } 1564 1565 @Override 1566 public String toString() { 1567 return "CharMatcher.javaLetter()"; 1568 } 1569 } 1570 1571 /** Implementation of {@link #javaLetterOrDigit()}. 
*/ 1572 private static final class JavaLetterOrDigit extends CharMatcher { 1573 1574 static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit(); 1575 1576 @Override 1577 public boolean matches(char c) { 1578 return Character.isLetterOrDigit(c); 1579 } 1580 1581 @Override 1582 public String toString() { 1583 return "CharMatcher.javaLetterOrDigit()"; 1584 } 1585 } 1586 1587 /** Implementation of {@link #javaUpperCase()}. */ 1588 private static final class JavaUpperCase extends CharMatcher { 1589 1590 static final JavaUpperCase INSTANCE = new JavaUpperCase(); 1591 1592 @Override 1593 public boolean matches(char c) { 1594 return Character.isUpperCase(c); 1595 } 1596 1597 @Override 1598 public String toString() { 1599 return "CharMatcher.javaUpperCase()"; 1600 } 1601 } 1602 1603 /** Implementation of {@link #javaLowerCase()}. */ 1604 private static final class JavaLowerCase extends CharMatcher { 1605 1606 static final JavaLowerCase INSTANCE = new JavaLowerCase(); 1607 1608 @Override 1609 public boolean matches(char c) { 1610 return Character.isLowerCase(c); 1611 } 1612 1613 @Override 1614 public String toString() { 1615 return "CharMatcher.javaLowerCase()"; 1616 } 1617 } 1618 1619 /** Implementation of {@link #javaIsoControl()}. */ 1620 private static final class JavaIsoControl extends NamedFastMatcher { 1621 1622 static final JavaIsoControl INSTANCE = new JavaIsoControl(); 1623 1624 private JavaIsoControl() { 1625 super("CharMatcher.javaIsoControl()"); 1626 } 1627 1628 @Override 1629 public boolean matches(char c) { 1630 return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f'); 1631 } 1632 } 1633 1634 /** Implementation of {@link #invisible()}. */ 1635 private static final class Invisible extends RangesMatcher { 1636 // Plug the following UnicodeSet pattern into 1637 // https://unicode.org/cldr/utility/list-unicodeset.jsp 1638 // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]] 1639 // with the "Abbreviate" option, and get the ranges from there. 
1640 private static final String RANGE_STARTS = 1641 "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066" 1642 + "\u3000\ud800\ufeff\ufff9"; 1643 private static final String RANGE_ENDS = // inclusive ends 1644 "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f" 1645 + "\u3000\uf8ff\ufeff\ufffb"; 1646 1647 static final Invisible INSTANCE = new Invisible(); 1648 1649 private Invisible() { 1650 super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray()); 1651 } 1652 } 1653 1654 /** Implementation of {@link #singleWidth()}. */ 1655 private static final class SingleWidth extends RangesMatcher { 1656 1657 static final SingleWidth INSTANCE = new SingleWidth(); 1658 1659 private SingleWidth() { 1660 super( 1661 "CharMatcher.singleWidth()", 1662 "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(), 1663 "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray()); 1664 } 1665 } 1666 1667 // Non-static factory implementation classes 1668 1669 /** Implementation of {@link #negate()}. 
*/ 1670 private static class Negated extends CharMatcher { 1671 1672 final CharMatcher original; 1673 1674 Negated(CharMatcher original) { 1675 this.original = checkNotNull(original); 1676 } 1677 1678 @Override 1679 public boolean matches(char c) { 1680 return !original.matches(c); 1681 } 1682 1683 @Override 1684 public boolean matchesAllOf(CharSequence sequence) { 1685 return original.matchesNoneOf(sequence); 1686 } 1687 1688 @Override 1689 public boolean matchesNoneOf(CharSequence sequence) { 1690 return original.matchesAllOf(sequence); 1691 } 1692 1693 @Override 1694 public int countIn(CharSequence sequence) { 1695 return sequence.length() - original.countIn(sequence); 1696 } 1697 1698 @GwtIncompatible // used only from other GwtIncompatible code 1699 @Override 1700 void setBits(BitSet table) { 1701 BitSet tmp = new BitSet(); 1702 original.setBits(tmp); 1703 tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1); 1704 table.or(tmp); 1705 } 1706 1707 @Override 1708 public CharMatcher negate() { 1709 return original; 1710 } 1711 1712 @Override 1713 public String toString() { 1714 return original + ".negate()"; 1715 } 1716 } 1717 1718 /** Implementation of {@link #and(CharMatcher)}. 
*/ 1719 private static final class And extends CharMatcher { 1720 1721 final CharMatcher first; 1722 final CharMatcher second; 1723 1724 And(CharMatcher a, CharMatcher b) { 1725 first = checkNotNull(a); 1726 second = checkNotNull(b); 1727 } 1728 1729 @Override 1730 public boolean matches(char c) { 1731 return first.matches(c) && second.matches(c); 1732 } 1733 1734 @GwtIncompatible // used only from other GwtIncompatible code 1735 @Override 1736 void setBits(BitSet table) { 1737 BitSet tmp1 = new BitSet(); 1738 first.setBits(tmp1); 1739 BitSet tmp2 = new BitSet(); 1740 second.setBits(tmp2); 1741 tmp1.and(tmp2); 1742 table.or(tmp1); 1743 } 1744 1745 @Override 1746 public String toString() { 1747 return "CharMatcher.and(" + first + ", " + second + ")"; 1748 } 1749 } 1750 1751 /** Implementation of {@link #or(CharMatcher)}. */ 1752 private static final class Or extends CharMatcher { 1753 1754 final CharMatcher first; 1755 final CharMatcher second; 1756 1757 Or(CharMatcher a, CharMatcher b) { 1758 first = checkNotNull(a); 1759 second = checkNotNull(b); 1760 } 1761 1762 @GwtIncompatible // used only from other GwtIncompatible code 1763 @Override 1764 void setBits(BitSet table) { 1765 first.setBits(table); 1766 second.setBits(table); 1767 } 1768 1769 @Override 1770 public boolean matches(char c) { 1771 return first.matches(c) || second.matches(c); 1772 } 1773 1774 @Override 1775 public String toString() { 1776 return "CharMatcher.or(" + first + ", " + second + ")"; 1777 } 1778 } 1779 1780 // Static factory implementations 1781 1782 /** Implementation of {@link #is(char)}. 
*/ 1783 private static final class Is extends FastMatcher { 1784 1785 private final char match; 1786 1787 Is(char match) { 1788 this.match = match; 1789 } 1790 1791 @Override 1792 public boolean matches(char c) { 1793 return c == match; 1794 } 1795 1796 @Override 1797 public String replaceFrom(CharSequence sequence, char replacement) { 1798 return sequence.toString().replace(match, replacement); 1799 } 1800 1801 @Override 1802 public CharMatcher and(CharMatcher other) { 1803 return other.matches(match) ? this : none(); 1804 } 1805 1806 @Override 1807 public CharMatcher or(CharMatcher other) { 1808 return other.matches(match) ? other : super.or(other); 1809 } 1810 1811 @Override 1812 public CharMatcher negate() { 1813 return isNot(match); 1814 } 1815 1816 @GwtIncompatible // used only from other GwtIncompatible code 1817 @Override 1818 void setBits(BitSet table) { 1819 table.set(match); 1820 } 1821 1822 @Override 1823 public String toString() { 1824 return "CharMatcher.is('" + showCharacter(match) + "')"; 1825 } 1826 } 1827 1828 /** Implementation of {@link #isNot(char)}. */ 1829 private static final class IsNot extends FastMatcher { 1830 1831 private final char match; 1832 1833 IsNot(char match) { 1834 this.match = match; 1835 } 1836 1837 @Override 1838 public boolean matches(char c) { 1839 return c != match; 1840 } 1841 1842 @Override 1843 public CharMatcher and(CharMatcher other) { 1844 return other.matches(match) ? super.and(other) : other; 1845 } 1846 1847 @Override 1848 public CharMatcher or(CharMatcher other) { 1849 return other.matches(match) ? 
any() : this; 1850 } 1851 1852 @GwtIncompatible // used only from other GwtIncompatible code 1853 @Override 1854 void setBits(BitSet table) { 1855 table.set(0, match); 1856 table.set(match + 1, Character.MAX_VALUE + 1); 1857 } 1858 1859 @Override 1860 public CharMatcher negate() { 1861 return is(match); 1862 } 1863 1864 @Override 1865 public String toString() { 1866 return "CharMatcher.isNot('" + showCharacter(match) + "')"; 1867 } 1868 } 1869 1870 private static CharMatcher.IsEither isEither(char c1, char c2) { 1871 return new CharMatcher.IsEither(c1, c2); 1872 } 1873 1874 /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */ 1875 private static final class IsEither extends FastMatcher { 1876 1877 private final char match1; 1878 private final char match2; 1879 1880 IsEither(char match1, char match2) { 1881 this.match1 = match1; 1882 this.match2 = match2; 1883 } 1884 1885 @Override 1886 public boolean matches(char c) { 1887 return c == match1 || c == match2; 1888 } 1889 1890 @GwtIncompatible // used only from other GwtIncompatible code 1891 @Override 1892 void setBits(BitSet table) { 1893 table.set(match1); 1894 table.set(match2); 1895 } 1896 1897 @Override 1898 public String toString() { 1899 return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")"; 1900 } 1901 } 1902 1903 /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. 
*/
  private static final class AnyOf extends CharMatcher {

    // Sorted copy of the accepted chars, so membership can be tested by binary search.
    private final char[] chars;

    public AnyOf(CharSequence chars) {
      char[] sorted = chars.toString().toCharArray();
      Arrays.sort(sorted);
      this.chars = sorted;
    }

    @Override
    public boolean matches(char c) {
      int found = Arrays.binarySearch(chars, c);
      return found >= 0;
    }

    @Override
    @GwtIncompatible // used only from other GwtIncompatible code
    void setBits(BitSet table) {
      for (char member : chars) {
        table.set(member);
      }
    }

    @Override
    public String toString() {
      StringBuilder text = new StringBuilder("CharMatcher.anyOf(\"");
      // Each accepted char is rendered as a Unicode escape, in sorted order.
      for (char member : chars) {
        text.append(showCharacter(member));
      }
      return text.append("\")").toString();
    }
  }

  /** Implementation of {@link #inRange(char, char)}. */
  private static final class InRange extends FastMatcher {

    private final char startInclusive;
    private final char endInclusive;

    /** Rejects, via {@code checkArgument}, a range whose end lies before its start. */
    InRange(char startInclusive, char endInclusive) {
      checkArgument(endInclusive >= startInclusive);
      this.startInclusive = startInclusive;
      this.endInclusive = endInclusive;
    }

    @Override
    public boolean matches(char c) {
      if (c < startInclusive) {
        return false;
      }
      return c <= endInclusive;
    }

    @GwtIncompatible // used only from other GwtIncompatible code
    @Override
    void setBits(BitSet table) {
      // BitSet.set takes an exclusive upper bound, hence the + 1.
      table.set(startInclusive, endInclusive + 1);
    }

    @Override
    public String toString() {
      return "CharMatcher.inRange('"
          + showCharacter(startInclusive)
          + "', '"
          + showCharacter(endInclusive)
          + "')";
    }
  }

  /** Implementation of {@link #forPredicate(Predicate)}. */
  private static final class ForPredicate extends CharMatcher {

    // The wrapped predicate; never null.
    private final Predicate<? super Character> predicate;

    ForPredicate(Predicate<? super Character> predicate) {
      this.predicate = checkNotNull(predicate);
    }

    @Override
    public boolean matches(char c) {
      // The char is boxed to a Character for the predicate call.
      return predicate.apply(c);
    }

    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
    @Override
    public boolean apply(Character character) {
      Character nonNull = checkNotNull(character);
      return predicate.apply(nonNull);
    }

    @Override
    public String toString() {
      return "CharMatcher.forPredicate(" + predicate + ")";
    }
  }
}