001 /* 002 * Copyright (C) 2009 Google Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 import static com.google.common.base.Preconditions.checkState; 022 023 import com.google.common.annotations.GwtCompatible; 024 import com.google.common.annotations.GwtIncompatible; 025 026 import java.util.Iterator; 027 import java.util.NoSuchElementException; 028 import java.util.regex.Matcher; 029 import java.util.regex.Pattern; 030 031 /** 032 * An object that divides strings (or other instances of {@code CharSequence}) 033 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter") 034 * which can be expressed as a single character, literal string, regular 035 * expression, {@code CharMatcher}, or by using a fixed substring length. This 036 * class provides the complementary functionality to {@link Joiner}. 037 * 038 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code 039 * 040 * Splitter.on(',').split("foo,bar")}</pre> 041 * 042 * This invocation returns an {@code Iterable<String>} containing {@code "foo"} 043 * and {@code "bar"}, in that order. 044 * 045 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code 046 * 047 * Splitter.on(',').split("foo,,bar, quux")}</pre> 048 * 049 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}. 050 * Notice that the splitter does not assume that you want empty strings removed, 051 * or that you wish to trim whitespace. If you want features like these, simply 052 * ask for them: <pre> {@code 053 * 054 * private static final Splitter MY_SPLITTER = Splitter.on(',') 055 * .trimResults() 056 * .omitEmptyStrings();}</pre> 057 * 058 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable 059 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which 060 * the configuration methods are called is never significant; for instance, 061 * trimming is always applied first before checking for an empty result, 062 * regardless of the order in which the {@link #trimResults()} and 063 * {@link #omitEmptyStrings()} methods were invoked. 064 * 065 * <p><b>Warning: splitter instances are always immutable</b>; a configuration 066 * method such as {@code omitEmptyStrings} has no effect on the instance it 067 * is invoked on! You must store and use the new splitter instance returned by 068 * the method. This makes splitters thread-safe, and safe to store as {@code 069 * static final} constants (as illustrated above). <pre> {@code 070 * 071 * // Bad! Do not do this! 072 * Splitter splitter = Splitter.on('/'); 073 * splitter.trimResults(); // does nothing! 074 * return splitter.split("wrong / wrong / wrong");}</pre> 075 * 076 * The separator recognized by the splitter does not have to be a single 077 * literal character as in the examples above. See the methods {@link 078 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples 079 * of other ways to specify separators. 080 * 081 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of 082 * similar JDK methods; for instance, it does not silently discard trailing 083 * separators, as does {@link String#split(String)}, nor does it have a default 084 * behavior of using five particular whitespace characters as separators, like 085 * {@link java.util.StringTokenizer}. 086 * 087 * @author Julien Silland 088 * @author Jesse Wilson 089 * @author Kevin Bourrillion 090 * @since 1 091 */ 092 @GwtCompatible(emulated = true) 093 public final class Splitter { 094 private final CharMatcher trimmer; 095 private final boolean omitEmptyStrings; 096 private final Strategy strategy; 097 private final int limit; 098 099 private Splitter(Strategy strategy) { 100 this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE); 101 } 102 103 private Splitter(Strategy strategy, boolean omitEmptyStrings, 104 CharMatcher trimmer, int limit) { 105 this.strategy = strategy; 106 this.omitEmptyStrings = omitEmptyStrings; 107 this.trimmer = trimmer; 108 this.limit = limit; 109 } 110 111 /** 112 * Returns a splitter that uses the given single-character separator. For 113 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 114 * containing {@code ["foo", "", "bar"]}. 115 * 116 * @param separator the character to recognize as a separator 117 * @return a splitter, with default settings, that recognizes that separator 118 */ 119 public static Splitter on(char separator) { 120 return on(CharMatcher.is(separator)); 121 } 122 123 /** 124 * Returns a splitter that considers any single character matched by the 125 * given {@code CharMatcher} to be a separator. For example, {@code 126 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 127 * iterable containing {@code ["foo", "", "bar", "quux"]}. 128 * 129 * @param separatorMatcher a {@link CharMatcher} that determines whether a 130 * character is a separator 131 * @return a splitter, with default settings, that uses this matcher 132 */ 133 public static Splitter on(final CharMatcher separatorMatcher) { 134 checkNotNull(separatorMatcher); 135 136 return new Splitter(new Strategy() { 137 @Override public SplittingIterator iterator( 138 Splitter splitter, final CharSequence toSplit) { 139 return new SplittingIterator(splitter, toSplit) { 140 @Override int separatorStart(int start) { 141 return separatorMatcher.indexIn(toSplit, start); 142 } 143 144 @Override int separatorEnd(int separatorPosition) { 145 return separatorPosition + 1; 146 } 147 }; 148 } 149 }); 150 } 151 152 /** 153 * Returns a splitter that uses the given fixed string as a separator. For 154 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an 155 * iterable containing {@code ["foo", "bar", "baz,qux"]}. 156 * 157 * @param separator the literal, nonempty string to recognize as a separator 158 * @return a splitter, with default settings, that recognizes that separator 159 */ 160 public static Splitter on(final String separator) { 161 checkArgument(separator.length() != 0, 162 "The separator may not be the empty string."); 163 164 return new Splitter(new Strategy() { 165 @Override public SplittingIterator iterator( 166 Splitter splitter, CharSequence toSplit) { 167 return new SplittingIterator(splitter, toSplit) { 168 @Override public int separatorStart(int start) { 169 int delimeterLength = separator.length(); 170 171 positions: 172 for (int p = start, last = toSplit.length() - delimeterLength; 173 p <= last; p++) { 174 for (int i = 0; i < delimeterLength; i++) { 175 if (toSplit.charAt(i + p) != separator.charAt(i)) { 176 continue positions; 177 } 178 } 179 return p; 180 } 181 return -1; 182 } 183 184 @Override public int separatorEnd(int separatorPosition) { 185 return separatorPosition + separator.length(); 186 } 187 }; 188 } 189 }); 190 } 191 192 /** 193 * Returns a splitter that considers any subsequence matching {@code 194 * pattern} to be a separator. For example, {@code 195 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string 196 * into lines whether it uses DOS-style or UNIX-style line terminators. 197 * 198 * @param separatorPattern the pattern that determines whether a subsequence 199 * is a separator. This pattern may not match the empty string. 200 * @return a splitter, with default settings, that uses this pattern 201 * @throws IllegalArgumentException if {@code separatorPattern} matches the 202 * empty string 203 */ 204 @GwtIncompatible("java.util.regex") 205 public static Splitter on(final Pattern separatorPattern) { 206 checkNotNull(separatorPattern); 207 checkArgument(!separatorPattern.matcher("").matches(), 208 "The pattern may not match the empty string: %s", separatorPattern); 209 210 return new Splitter(new Strategy() { 211 @Override public SplittingIterator iterator( 212 final Splitter splitter, CharSequence toSplit) { 213 final Matcher matcher = separatorPattern.matcher(toSplit); 214 return new SplittingIterator(splitter, toSplit) { 215 @Override public int separatorStart(int start) { 216 return matcher.find(start) ? matcher.start() : -1; 217 } 218 219 @Override public int separatorEnd(int separatorPosition) { 220 return matcher.end(); 221 } 222 }; 223 } 224 }); 225 } 226 227 /** 228 * Returns a splitter that considers any subsequence matching a given 229 * pattern (regular expression) to be a separator. For example, {@code 230 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines 231 * whether it uses DOS-style or UNIX-style line terminators. This is 232 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}. 233 * 234 * @param separatorPattern the pattern that determines whether a subsequence 235 * is a separator. This pattern may not match the empty string. 236 * @return a splitter, with default settings, that uses this pattern 237 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern} 238 * is a malformed expression 239 * @throws IllegalArgumentException if {@code separatorPattern} matches the 240 * empty string 241 */ 242 @GwtIncompatible("java.util.regex") 243 public static Splitter onPattern(String separatorPattern) { 244 return on(Pattern.compile(separatorPattern)); 245 } 246 247 /** 248 * Returns a splitter that divides strings into pieces of the given length. 249 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an 250 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 251 * smaller than {@code length} but will never be empty. 252 * 253 * @param length the desired length of pieces after splitting 254 * @return a splitter, with default settings, that can split into fixed sized 255 * pieces 256 */ 257 public static Splitter fixedLength(final int length) { 258 checkArgument(length > 0, "The length may not be less than 1"); 259 260 return new Splitter(new Strategy() { 261 @Override public SplittingIterator iterator( 262 final Splitter splitter, CharSequence toSplit) { 263 return new SplittingIterator(splitter, toSplit) { 264 @Override public int separatorStart(int start) { 265 int nextChunkStart = start + length; 266 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 267 } 268 269 @Override public int separatorEnd(int separatorPosition) { 270 return separatorPosition; 271 } 272 }; 273 } 274 }); 275 } 276 277 /** 278 * Returns a splitter that behaves equivalently to {@code this} splitter, but 279 * automatically omits empty strings from the results. For example, {@code 280 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 281 * iterable containing only {@code ["a", "b", "c"]}. 282 * 283 * <p>If either {@code trimResults} option is also specified when creating a 284 * splitter, that splitter always trims results first before checking for 285 * emptiness. So, for example, {@code 286 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 287 * an empty iterable. 288 * 289 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 290 * to return an empty iterable, but when using this option, it can (if the 291 * input sequence consists of nothing but separators). 292 * 293 * @return a splitter with the desired configuration 294 */ 295 public Splitter omitEmptyStrings() { 296 return new Splitter(strategy, true, trimmer, limit); 297 } 298 299 /** 300 * Returns a splitter that behaves equivalently to {@code this} splitter, but 301 * automatically removes leading and trailing {@linkplain 302 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 303 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 304 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 305 * containing {@code ["a", "b", "c"]}. 306 * 307 * @return a splitter with the desired configuration 308 */ 309 public Splitter trimResults() { 310 return trimResults(CharMatcher.WHITESPACE); 311 } 312 313 /** 314 * Returns a splitter that behaves equivalently to {@code this} splitter, but 315 * removes all leading or trailing characters matching the given {@code 316 * CharMatcher} from each returned substring. For example, {@code 317 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 318 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 319 * 320 * @param trimmer a {@link CharMatcher} that determines whether a character 321 * should be removed from the beginning/end of a subsequence 322 * @return a splitter with the desired configuration 323 */ 324 // TODO(kevinb): throw if a trimmer was already specified! 325 public Splitter trimResults(CharMatcher trimmer) { 326 checkNotNull(trimmer); 327 return new Splitter(strategy, omitEmptyStrings, trimmer, limit); 328 } 329 330 /** 331 * Splits {@code sequence} into string components and makes them available 332 * through an {@link Iterator}, which may be lazily evaluated. 333 * 334 * @param sequence the sequence of characters to split 335 * @return an iteration over the segments split from the parameter. 336 */ 337 public Iterable<String> split(final CharSequence sequence) { 338 checkNotNull(sequence); 339 340 return new Iterable<String>() { 341 @Override public Iterator<String> iterator() { 342 return strategy.iterator(Splitter.this, sequence); 343 } 344 }; 345 } 346 347 private interface Strategy { 348 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 349 } 350 351 private abstract static class SplittingIterator 352 extends AbstractIterator<String> { 353 final CharSequence toSplit; 354 final CharMatcher trimmer; 355 final boolean omitEmptyStrings; 356 357 /** 358 * Returns the first index in {@code toSplit} at or after {@code start} 359 * that contains the separator. 360 */ 361 abstract int separatorStart(int start); 362 363 /** 364 * Returns the first index in {@code toSplit} after {@code 365 * separatorPosition} that does not contain a separator. This method is only 366 * invoked after a call to {@code separatorStart}. 367 */ 368 abstract int separatorEnd(int separatorPosition); 369 370 int offset = 0; 371 int limit; 372 373 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 374 this.trimmer = splitter.trimmer; 375 this.omitEmptyStrings = splitter.omitEmptyStrings; 376 this.limit = splitter.limit; 377 this.toSplit = toSplit; 378 } 379 380 @Override protected String computeNext() { 381 while (offset != -1) { 382 int start = offset; 383 int end; 384 385 int separatorPosition = separatorStart(offset); 386 if (separatorPosition == -1) { 387 end = toSplit.length(); 388 offset = -1; 389 } else { 390 end = separatorPosition; 391 offset = separatorEnd(separatorPosition); 392 } 393 394 while (start < end && trimmer.matches(toSplit.charAt(start))) { 395 start++; 396 } 397 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 398 end--; 399 } 400 401 if (omitEmptyStrings && start == end) { 402 continue; 403 } 404 405 if (limit == 1) { 406 // The limit has been reached, return the rest of the string as the 407 // final item. This is tested after empty string removal so that 408 // empty strings do not count towards the limit. 409 end = toSplit.length(); 410 offset = -1; 411 // Since we may have changed the end, we need to trim it again. 412 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 413 end--; 414 } 415 } else { 416 limit--; 417 } 418 419 return toSplit.subSequence(start, end).toString(); 420 } 421 return endOfData(); 422 } 423 } 424 425 /* 426 * Copied from common.collect.AbstractIterator. TODO(kevinb): un-fork if these 427 * packages are ever combined into a single library. 428 */ 429 private abstract static class AbstractIterator<T> implements Iterator<T> { 430 State state = State.NOT_READY; 431 432 enum State { 433 READY, NOT_READY, DONE, FAILED, 434 } 435 436 T next; 437 438 protected abstract T computeNext(); 439 440 protected final T endOfData() { 441 state = State.DONE; 442 return null; 443 } 444 445 public final boolean hasNext() { 446 checkState(state != State.FAILED); 447 switch (state) { 448 case DONE: 449 return false; 450 case READY: 451 return true; 452 default: 453 } 454 return tryToComputeNext(); 455 } 456 457 boolean tryToComputeNext() { 458 state = State.FAILED; // temporary pessimism 459 next = computeNext(); 460 if (state != State.DONE) { 461 state = State.READY; 462 return true; 463 } 464 return false; 465 } 466 467 public final T next() { 468 if (!hasNext()) { 469 throw new NoSuchElementException(); 470 } 471 state = State.NOT_READY; 472 return next; 473 } 474 475 @Override public void remove() { 476 throw new UnsupportedOperationException(); 477 } 478 } 479 }