001 /* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 import static com.google.common.base.Preconditions.checkState; 022 023 import com.google.common.annotations.Beta; 024 import com.google.common.annotations.GwtCompatible; 025 import com.google.common.annotations.GwtIncompatible; 026 027 import java.util.Iterator; 028 import java.util.NoSuchElementException; 029 import java.util.regex.Matcher; 030 import java.util.regex.Pattern; 031 032 /** 033 * An object that divides strings (or other instances of {@code CharSequence}) 034 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter") 035 * which can be expressed as a single character, literal string, regular 036 * expression, {@code CharMatcher}, or by using a fixed substring length. This 037 * class provides the complementary functionality to {@link Joiner}. 038 * 039 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code 040 * 041 * Splitter.on(',').split("foo,bar")}</pre> 042 * 043 * This invocation returns an {@code Iterable<String>} containing {@code "foo"} 044 * and {@code "bar"}, in that order. 045 * 046 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code 047 * 048 * Splitter.on(',').split("foo,,bar, quux")}</pre> 049 * 050 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}. 051 * Notice that the splitter does not assume that you want empty strings removed, 052 * or that you wish to trim whitespace. If you want features like these, simply 053 * ask for them: <pre> {@code 054 * 055 * private static final Splitter MY_SPLITTER = Splitter.on(',') 056 * .trimResults() 057 * .omitEmptyStrings();}</pre> 058 * 059 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable 060 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which 061 * the configuration methods are called is never significant; for instance, 062 * trimming is always applied first before checking for an empty result, 063 * regardless of the order in which the {@link #trimResults()} and 064 * {@link #omitEmptyStrings()} methods were invoked. 065 * 066 * <p><b>Warning: splitter instances are always immutable</b>; a configuration 067 * method such as {@code omitEmptyStrings} has no effect on the instance it 068 * is invoked on! You must store and use the new splitter instance returned by 069 * the method. This makes splitters thread-safe, and safe to store as {@code 070 * static final} constants (as illustrated above). <pre> {@code 071 * 072 * // Bad! Do not do this! 073 * Splitter splitter = Splitter.on('/'); 074 * splitter.trimResults(); // does nothing! 075 * return splitter.split("wrong / wrong / wrong");}</pre> 076 * 077 * The separator recognized by the splitter does not have to be a single 078 * literal character as in the examples above. See the methods {@link 079 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples 080 * of other ways to specify separators. 081 * 082 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of 083 * similar JDK methods; for instance, it does not silently discard trailing 084 * separators, as does {@link String#split(String)}, nor does it have a default 085 * behavior of using five particular whitespace characters as separators, like 086 * {@link java.util.StringTokenizer}. 087 * 088 * @author Julien Silland 089 * @author Jesse Wilson 090 * @author Kevin Bourrillion 091 * @since 1 092 */ 093 @GwtCompatible(emulated = true) 094 public final class Splitter { 095 private final CharMatcher trimmer; 096 private final boolean omitEmptyStrings; 097 private final Strategy strategy; 098 private final int limit; 099 100 private Splitter(Strategy strategy) { 101 this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE); 102 } 103 104 private Splitter(Strategy strategy, boolean omitEmptyStrings, 105 CharMatcher trimmer, int limit) { 106 this.strategy = strategy; 107 this.omitEmptyStrings = omitEmptyStrings; 108 this.trimmer = trimmer; 109 this.limit = limit; 110 } 111 112 /** 113 * Returns a splitter that uses the given single-character separator. For 114 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 115 * containing {@code ["foo", "", "bar"]}. 116 * 117 * @param separator the character to recognize as a separator 118 * @return a splitter, with default settings, that recognizes that separator 119 */ 120 public static Splitter on(char separator) { 121 return on(CharMatcher.is(separator)); 122 } 123 124 /** 125 * Returns a splitter that considers any single character matched by the 126 * given {@code CharMatcher} to be a separator. For example, {@code 127 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 128 * iterable containing {@code ["foo", "", "bar", "quux"]}. 129 * 130 * @param separatorMatcher a {@link CharMatcher} that determines whether a 131 * character is a separator 132 * @return a splitter, with default settings, that uses this matcher 133 */ 134 public static Splitter on(final CharMatcher separatorMatcher) { 135 checkNotNull(separatorMatcher); 136 137 return new Splitter(new Strategy() { 138 @Override public SplittingIterator iterator( 139 Splitter splitter, final CharSequence toSplit) { 140 return new SplittingIterator(splitter, toSplit) { 141 @Override int separatorStart(int start) { 142 return separatorMatcher.indexIn(toSplit, start); 143 } 144 145 @Override int separatorEnd(int separatorPosition) { 146 return separatorPosition + 1; 147 } 148 }; 149 } 150 }); 151 } 152 153 /** 154 * Returns a splitter that uses the given fixed string as a separator. For 155 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an 156 * iterable containing {@code ["foo", "bar", "baz,qux"]}. 157 * 158 * @param separator the literal, nonempty string to recognize as a separator 159 * @return a splitter, with default settings, that recognizes that separator 160 */ 161 public static Splitter on(final String separator) { 162 checkArgument(separator.length() != 0, 163 "The separator may not be the empty string."); 164 165 return new Splitter(new Strategy() { 166 @Override public SplittingIterator iterator( 167 Splitter splitter, CharSequence toSplit) { 168 return new SplittingIterator(splitter, toSplit) { 169 @Override public int separatorStart(int start) { 170 int delimeterLength = separator.length(); 171 172 positions: 173 for (int p = start, last = toSplit.length() - delimeterLength; 174 p <= last; p++) { 175 for (int i = 0; i < delimeterLength; i++) { 176 if (toSplit.charAt(i + p) != separator.charAt(i)) { 177 continue positions; 178 } 179 } 180 return p; 181 } 182 return -1; 183 } 184 185 @Override public int separatorEnd(int separatorPosition) { 186 return separatorPosition + separator.length(); 187 } 188 }; 189 } 190 }); 191 } 192 193 /** 194 * Returns a splitter that considers any subsequence matching {@code 195 * pattern} to be a separator. For example, {@code 196 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string 197 * into lines whether it uses DOS-style or UNIX-style line terminators. 198 * 199 * @param separatorPattern the pattern that determines whether a subsequence 200 * is a separator. This pattern may not match the empty string. 201 * @return a splitter, with default settings, that uses this pattern 202 * @throws IllegalArgumentException if {@code separatorPattern} matches the 203 * empty string 204 */ 205 @GwtIncompatible("java.util.regex") 206 public static Splitter on(final Pattern separatorPattern) { 207 checkNotNull(separatorPattern); 208 checkArgument(!separatorPattern.matcher("").matches(), 209 "The pattern may not match the empty string: %s", separatorPattern); 210 211 return new Splitter(new Strategy() { 212 @Override public SplittingIterator iterator( 213 final Splitter splitter, CharSequence toSplit) { 214 final Matcher matcher = separatorPattern.matcher(toSplit); 215 return new SplittingIterator(splitter, toSplit) { 216 @Override public int separatorStart(int start) { 217 return matcher.find(start) ? matcher.start() : -1; 218 } 219 220 @Override public int separatorEnd(int separatorPosition) { 221 return matcher.end(); 222 } 223 }; 224 } 225 }); 226 } 227 228 /** 229 * Returns a splitter that considers any subsequence matching a given 230 * pattern (regular expression) to be a separator. For example, {@code 231 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines 232 * whether it uses DOS-style or UNIX-style line terminators. This is 233 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}. 234 * 235 * @param separatorPattern the pattern that determines whether a subsequence 236 * is a separator. This pattern may not match the empty string. 237 * @return a splitter, with default settings, that uses this pattern 238 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern} 239 * is a malformed expression 240 * @throws IllegalArgumentException if {@code separatorPattern} matches the 241 * empty string 242 */ 243 @GwtIncompatible("java.util.regex") 244 public static Splitter onPattern(String separatorPattern) { 245 return on(Pattern.compile(separatorPattern)); 246 } 247 248 /** 249 * Returns a splitter that divides strings into pieces of the given length. 250 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an 251 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 252 * smaller than {@code length} but will never be empty. 253 * 254 * @param length the desired length of pieces after splitting 255 * @return a splitter, with default settings, that can split into fixed sized 256 * pieces 257 */ 258 public static Splitter fixedLength(final int length) { 259 checkArgument(length > 0, "The length may not be less than 1"); 260 261 return new Splitter(new Strategy() { 262 @Override public SplittingIterator iterator( 263 final Splitter splitter, CharSequence toSplit) { 264 return new SplittingIterator(splitter, toSplit) { 265 @Override public int separatorStart(int start) { 266 int nextChunkStart = start + length; 267 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 268 } 269 270 @Override public int separatorEnd(int separatorPosition) { 271 return separatorPosition; 272 } 273 }; 274 } 275 }); 276 } 277 278 /** 279 * Returns a splitter that behaves equivalently to {@code this} splitter, but 280 * automatically omits empty strings from the results. For example, {@code 281 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 282 * iterable containing only {@code ["a", "b", "c"]}. 283 * 284 * <p>If either {@code trimResults} option is also specified when creating a 285 * splitter, that splitter always trims results first before checking for 286 * emptiness. So, for example, {@code 287 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 288 * an empty iterable. 289 * 290 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 291 * to return an empty iterable, but when using this option, it can (if the 292 * input sequence consists of nothing but separators). 293 * 294 * @return a splitter with the desired configuration 295 */ 296 public Splitter omitEmptyStrings() { 297 return new Splitter(strategy, true, trimmer, limit); 298 } 299 300 /** 301 * Returns a splitter that behaves equivalently to {@code this} splitter but 302 * stops splitting after it reaches the limit. 303 * The limit defines the maximum number of items returned by the iterator. 304 * 305 * <p>For example, 306 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable 307 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the 308 * omitted strings do no count. Hence, 309 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")} 310 * returns an iterable containing {@code ["a", "b", "c,d"}. 311 * When trim is requested, all entries, including the last are trimmed. Hence 312 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")} 313 * results in @{code ["a", "b", "c , d"]}. 314 * 315 * @param limit the maximum number of items returns 316 * @return a splitter with the desired configuration 317 * @since 9 318 */ 319 @Beta 320 public Splitter limit(int limit) { 321 checkArgument(limit > 0, "must be greater then zero: %s", limit); 322 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 323 } 324 325 /** 326 * Returns a splitter that behaves equivalently to {@code this} splitter, but 327 * automatically removes leading and trailing {@linkplain 328 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 329 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 330 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 331 * containing {@code ["a", "b", "c"]}. 332 * 333 * @return a splitter with the desired configuration 334 */ 335 public Splitter trimResults() { 336 return trimResults(CharMatcher.WHITESPACE); 337 } 338 339 /** 340 * Returns a splitter that behaves equivalently to {@code this} splitter, but 341 * removes all leading or trailing characters matching the given {@code 342 * CharMatcher} from each returned substring. For example, {@code 343 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 344 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 345 * 346 * @param trimmer a {@link CharMatcher} that determines whether a character 347 * should be removed from the beginning/end of a subsequence 348 * @return a splitter with the desired configuration 349 */ 350 // TODO(kevinb): throw if a trimmer was already specified! 351 public Splitter trimResults(CharMatcher trimmer) { 352 checkNotNull(trimmer); 353 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 354 } 355 356 /** 357 * Splits {@code sequence} into string components and makes them available 358 * through an {@link Iterator}, which may be lazily evaluated. 359 * 360 * @param sequence the sequence of characters to split 361 * @return an iteration over the segments split from the parameter. 362 */ 363 public Iterable<String> split(final CharSequence sequence) { 364 checkNotNull(sequence); 365 366 return new Iterable<String>() { 367 @Override public Iterator<String> iterator() { 368 return strategy.iterator(Splitter.this, sequence); 369 } 370 }; 371 } 372 373 private interface Strategy { 374 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 375 } 376 377 private abstract static class SplittingIterator 378 extends AbstractIterator<String> { 379 final CharSequence toSplit; 380 final CharMatcher trimmer; 381 final boolean omitEmptyStrings; 382 383 /** 384 * Returns the first index in {@code toSplit} at or after {@code start} 385 * that contains the separator. 386 */ 387 abstract int separatorStart(int start); 388 389 /** 390 * Returns the first index in {@code toSplit} after {@code 391 * separatorPosition} that does not contain a separator. This method is only 392 * invoked after a call to {@code separatorStart}. 393 */ 394 abstract int separatorEnd(int separatorPosition); 395 396 int offset = 0; 397 int limit; 398 399 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 400 this.trimmer = splitter.trimmer; 401 this.omitEmptyStrings = splitter.omitEmptyStrings; 402 this.limit = splitter.limit; 403 this.toSplit = toSplit; 404 } 405 406 @Override protected String computeNext() { 407 while (offset != -1) { 408 int start = offset; 409 int end; 410 411 int separatorPosition = separatorStart(offset); 412 if (separatorPosition == -1) { 413 end = toSplit.length(); 414 offset = -1; 415 } else { 416 end = separatorPosition; 417 offset = separatorEnd(separatorPosition); 418 } 419 420 while (start < end && trimmer.matches(toSplit.charAt(start))) { 421 start++; 422 } 423 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 424 end--; 425 } 426 427 if (omitEmptyStrings && start == end) { 428 continue; 429 } 430 431 if (limit == 1) { 432 // The limit has been reached, return the rest of the string as the 433 // final item. This is tested after empty string removal so that 434 // empty strings do not count towards the limit. 435 end = toSplit.length(); 436 offset = -1; 437 // Since we may have changed the end, we need to trim it again. 438 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 439 end--; 440 } 441 } else { 442 limit--; 443 } 444 445 return toSplit.subSequence(start, end).toString(); 446 } 447 return endOfData(); 448 } 449 } 450 451 /* 452 * Copied from common.collect.AbstractIterator. TODO(kevinb): un-fork if these 453 * packages are ever combined into a single library. 454 */ 455 private abstract static class AbstractIterator<T> implements Iterator<T> { 456 State state = State.NOT_READY; 457 458 enum State { 459 READY, NOT_READY, DONE, FAILED, 460 } 461 462 T next; 463 464 protected abstract T computeNext(); 465 466 protected final T endOfData() { 467 state = State.DONE; 468 return null; 469 } 470 471 @Override 472 public final boolean hasNext() { 473 checkState(state != State.FAILED); 474 switch (state) { 475 case DONE: 476 return false; 477 case READY: 478 return true; 479 default: 480 } 481 return tryToComputeNext(); 482 } 483 484 boolean tryToComputeNext() { 485 state = State.FAILED; // temporary pessimism 486 next = computeNext(); 487 if (state != State.DONE) { 488 state = State.READY; 489 return true; 490 } 491 return false; 492 } 493 494 @Override 495 public final T next() { 496 if (!hasNext()) { 497 throw new NoSuchElementException(); 498 } 499 state = State.NOT_READY; 500 return next; 501 } 502 503 @Override public void remove() { 504 throw new UnsupportedOperationException(); 505 } 506 } 507 }