001 /* 002 * Copyright (C) 2009 Google Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package com.google.common.base; 018 019 import static com.google.common.base.Preconditions.checkArgument; 020 import static com.google.common.base.Preconditions.checkNotNull; 021 import static com.google.common.base.Preconditions.checkState; 022 023 import com.google.common.annotations.GwtCompatible; 024 import com.google.common.annotations.GwtIncompatible; 025 026 import java.util.Iterator; 027 import java.util.NoSuchElementException; 028 import java.util.regex.Matcher; 029 import java.util.regex.Pattern; 030 031 /** 032 * An object that divides strings (or other instances of {@code CharSequence}) 033 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter") 034 * which can be expressed as a single character, literal string, regular 035 * expression, {@code CharMatcher}, or by using a fixed substring length. This 036 * class provides the complementary functionality to {@link Joiner}. 037 * 038 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code 039 * 040 * Splitter.on(',').split("foo,bar")}</pre> 041 * 042 * This invocation returns an {@code Iterable<String>} containing {@code "foo"} 043 * and {@code "bar"}, in that order. 044 * 045 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code 046 * 047 * Splitter.on(',').split("foo,,bar, quux")}</pre> 048 * 049 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}. 050 * Notice that the splitter does not assume that you want empty strings removed, 051 * or that you wish to trim whitespace. If you want features like these, simply 052 * ask for them: <pre> {@code 053 * 054 * private static final Splitter MY_SPLITTER = Splitter.on(',') 055 * .trimResults() 056 * .omitEmptyStrings();}</pre> 057 * 058 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable 059 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which 060 * the configuration methods are called is never significant; for instance, 061 * trimming is always applied first before checking for an empty result, 062 * regardless of the order in which the {@link #trimResults()} and 063 * {@link #omitEmptyStrings()} methods were invoked. 064 * 065 * <p><b>Warning: splitter instances are always immutable</b>; a configuration 066 * method such as {@code omitEmptyStrings} has no effect on the instance it 067 * is invoked on! You must store and use the new splitter instance returned by 068 * the method. This makes splitters thread-safe, and safe to store as {@code 069 * static final} constants (as illustrated above). <pre> {@code 070 * 071 * // Bad! Do not do this! 072 * Splitter splitter = Splitter.on('/'); 073 * splitter.trimResults(); // does nothing! 074 * return splitter.split("wrong / wrong / wrong");}</pre> 075 * 076 * The separator recognized by the splitter does not have to be a single 077 * literal character as in the examples above. See the methods {@link 078 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples 079 * of other ways to specify separators. 080 * 081 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of 082 * similar JDK methods; for instance, it does not silently discard trailing 083 * separators, as does {@link String#split(String)}, nor does it have a default 084 * behavior of using five particular whitespace characters as separators, like 085 * {@link java.util.StringTokenizer}. 086 * 087 * @author Julien Silland 088 * @author Jesse Wilson 089 * @author Kevin Bourrillion 090 * @since 1 091 */ 092 @GwtCompatible 093 public final class Splitter { 094 private final CharMatcher trimmer; 095 private final boolean omitEmptyStrings; 096 private final Strategy strategy; 097 098 private Splitter(Strategy strategy) { 099 this(strategy, false, CharMatcher.NONE); 100 } 101 102 private Splitter(Strategy strategy, boolean omitEmptyStrings, 103 CharMatcher trimmer) { 104 this.strategy = strategy; 105 this.omitEmptyStrings = omitEmptyStrings; 106 this.trimmer = trimmer; 107 } 108 109 /** 110 * Returns a splitter that uses the given single-character separator. For 111 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable 112 * containing {@code ["foo", "", "bar"]}. 113 * 114 * @param separator the character to recognize as a separator 115 * @return a splitter, with default settings, that recognizes that separator 116 */ 117 public static Splitter on(char separator) { 118 return on(CharMatcher.is(separator)); 119 } 120 121 /** 122 * Returns a splitter that considers any single character matched by the 123 * given {@code CharMatcher} to be a separator. For example, {@code 124 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an 125 * iterable containing {@code ["foo", "", "bar", "quux"]}. 126 * 127 * @param separatorMatcher a {@link CharMatcher} that determines whether a 128 * character is a separator 129 * @return a splitter, with default settings, that uses this matcher 130 */ 131 public static Splitter on(final CharMatcher separatorMatcher) { 132 checkNotNull(separatorMatcher); 133 134 return new Splitter(new Strategy() { 135 @Override public SplittingIterator iterator( 136 Splitter splitter, final CharSequence toSplit) { 137 return new SplittingIterator(splitter, toSplit) { 138 @Override int separatorStart(int start) { 139 return separatorMatcher.indexIn(toSplit, start); 140 } 141 142 @Override int separatorEnd(int separatorPosition) { 143 return separatorPosition + 1; 144 } 145 }; 146 } 147 }); 148 } 149 150 /** 151 * Returns a splitter that uses the given fixed string as a separator. For 152 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an 153 * iterable containing {@code ["foo", "bar", "baz,qux"]}. 154 * 155 * @param separator the literal, nonempty string to recognize as a separator 156 * @return a splitter, with default settings, that recognizes that separator 157 */ 158 public static Splitter on(final String separator) { 159 checkArgument(separator.length() != 0, 160 "The separator may not be the empty string."); 161 162 return new Splitter(new Strategy() { 163 @Override public SplittingIterator iterator( 164 Splitter splitter, CharSequence toSplit) { 165 return new SplittingIterator(splitter, toSplit) { 166 @Override public int separatorStart(int start) { 167 int delimeterLength = separator.length(); 168 169 positions: 170 for (int p = start, last = toSplit.length() - delimeterLength; 171 p <= last; p++) { 172 for (int i = 0; i < delimeterLength; i++) { 173 if (toSplit.charAt(i + p) != separator.charAt(i)) { 174 continue positions; 175 } 176 } 177 return p; 178 } 179 return -1; 180 } 181 182 @Override public int separatorEnd(int separatorPosition) { 183 return separatorPosition + separator.length(); 184 } 185 }; 186 } 187 }); 188 } 189 190 /** 191 * Returns a splitter that considers any subsequence matching {@code 192 * pattern} to be a separator. For example, {@code 193 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string 194 * into lines whether it uses DOS-style or UNIX-style line terminators. 195 * 196 * @param separatorPattern the pattern that determines whether a subsequence 197 * is a separator. This pattern may not match the empty string. 198 * @return a splitter, with default settings, that uses this pattern 199 * @throws IllegalArgumentException if {@code separatorPattern} matches the 200 * empty string 201 */ 202 @GwtIncompatible("java.util.regex") 203 public static Splitter on(final Pattern separatorPattern) { 204 checkNotNull(separatorPattern); 205 checkArgument(!separatorPattern.matcher("").matches(), 206 "The pattern may not match the empty string: %s", separatorPattern); 207 208 return new Splitter(new Strategy() { 209 @Override public SplittingIterator iterator( 210 final Splitter splitter, CharSequence toSplit) { 211 final Matcher matcher = separatorPattern.matcher(toSplit); 212 return new SplittingIterator(splitter, toSplit) { 213 @Override public int separatorStart(int start) { 214 return matcher.find(start) ? matcher.start() : -1; 215 } 216 217 @Override public int separatorEnd(int separatorPosition) { 218 return matcher.end(); 219 } 220 }; 221 } 222 }); 223 } 224 225 /** 226 * Returns a splitter that considers any subsequence matching a given 227 * pattern (regular expression) to be a separator. For example, {@code 228 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines 229 * whether it uses DOS-style or UNIX-style line terminators. This is 230 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}. 231 * 232 * @param separatorPattern the pattern that determines whether a subsequence 233 * is a separator. This pattern may not match the empty string. 234 * @return a splitter, with default settings, that uses this pattern 235 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern} 236 * is a malformed expression 237 * @throws IllegalArgumentException if {@code separatorPattern} matches the 238 * empty string 239 */ 240 @GwtIncompatible("java.util.regex") 241 public static Splitter onPattern(String separatorPattern) { 242 return on(Pattern.compile(separatorPattern)); 243 } 244 245 /** 246 * Returns a splitter that divides strings into pieces of the given length. 247 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an 248 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be 249 * smaller than {@code length} but will never be empty. 250 * 251 * @param length the desired length of pieces after splitting 252 * @return a splitter, with default settings, that can split into fixed sized 253 * pieces 254 */ 255 public static Splitter fixedLength(final int length) { 256 checkArgument(length > 0, "The length may not be less than 1"); 257 258 return new Splitter(new Strategy() { 259 @Override public SplittingIterator iterator( 260 final Splitter splitter, CharSequence toSplit) { 261 return new SplittingIterator(splitter, toSplit) { 262 @Override public int separatorStart(int start) { 263 int nextChunkStart = start + length; 264 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1); 265 } 266 267 @Override public int separatorEnd(int separatorPosition) { 268 return separatorPosition; 269 } 270 }; 271 } 272 }); 273 } 274 275 /** 276 * Returns a splitter that behaves equivalently to {@code this} splitter, but 277 * automatically omits empty strings from the results. For example, {@code 278 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an 279 * iterable containing only {@code ["a", "b", "c"]}. 280 * 281 * <p>If either {@code trimResults} option is also specified when creating a 282 * splitter, that splitter always trims results first before checking for 283 * emptiness. So, for example, {@code 284 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns 285 * an empty iterable. 286 * 287 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)} 288 * to return an empty iterable, but when using this option, it can (if the 289 * input sequence consists of nothing but separators). 290 * 291 * @return a splitter with the desired configuration 292 */ 293 public Splitter omitEmptyStrings() { 294 return new Splitter(strategy, true, trimmer); 295 } 296 297 /** 298 * Returns a splitter that behaves equivalently to {@code this} splitter, but 299 * automatically removes leading and trailing {@linkplain 300 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent 301 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code 302 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable 303 * containing {@code ["a", "b", "c"]}. 304 * 305 * @return a splitter with the desired configuration 306 */ 307 public Splitter trimResults() { 308 return trimResults(CharMatcher.WHITESPACE); 309 } 310 311 /** 312 * Returns a splitter that behaves equivalently to {@code this} splitter, but 313 * removes all leading or trailing characters matching the given {@code 314 * CharMatcher} from each returned substring. For example, {@code 315 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")} 316 * returns an iterable containing {@code ["a ", "b_ ", "c"]}. 317 * 318 * @param trimmer a {@link CharMatcher} that determines whether a character 319 * should be removed from the beginning/end of a subsequence 320 * @return a splitter with the desired configuration 321 */ 322 // TODO: throw if a trimmer was already specified! 323 public Splitter trimResults(CharMatcher trimmer) { 324 checkNotNull(trimmer); 325 return new Splitter(strategy, omitEmptyStrings, trimmer); 326 } 327 328 /** 329 * Splits the {@link CharSequence} passed in parameter. 330 * 331 * @param sequence the sequence of characters to split 332 * @return an iteration over the segments split from the parameter. 333 */ 334 public Iterable<String> split(final CharSequence sequence) { 335 checkNotNull(sequence); 336 337 return new Iterable<String>() { 338 @Override public Iterator<String> iterator() { 339 return strategy.iterator(Splitter.this, sequence); 340 } 341 }; 342 } 343 344 private interface Strategy { 345 Iterator<String> iterator(Splitter splitter, CharSequence toSplit); 346 } 347 348 private abstract static class SplittingIterator 349 extends AbstractIterator<String> { 350 final CharSequence toSplit; 351 final CharMatcher trimmer; 352 final boolean omitEmptyStrings; 353 354 /** 355 * Returns the first index in {@code toSplit} at or after {@code start} 356 * that contains the separator. 357 */ 358 abstract int separatorStart(int start); 359 360 /** 361 * Returns the first index in {@code toSplit} after {@code 362 * separatorPosition} that does not contain a separator. This method is only 363 * invoked after a call to {@code separatorStart}. 364 */ 365 abstract int separatorEnd(int separatorPosition); 366 367 int offset = 0; 368 369 protected SplittingIterator(Splitter splitter, CharSequence toSplit) { 370 this.trimmer = splitter.trimmer; 371 this.omitEmptyStrings = splitter.omitEmptyStrings; 372 this.toSplit = toSplit; 373 } 374 375 @Override protected String computeNext() { 376 while (offset != -1) { 377 int start = offset; 378 int end; 379 380 int separatorPosition = separatorStart(offset); 381 if (separatorPosition == -1) { 382 end = toSplit.length(); 383 offset = -1; 384 } else { 385 end = separatorPosition; 386 offset = separatorEnd(separatorPosition); 387 } 388 389 while (start < end && trimmer.matches(toSplit.charAt(start))) { 390 start++; 391 } 392 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) { 393 end--; 394 } 395 396 if (omitEmptyStrings && start == end) { 397 continue; 398 } 399 400 return toSplit.subSequence(start, end).toString(); 401 } 402 return endOfData(); 403 } 404 } 405 406 /* 407 * Copied from common.collect.AbstractIterator. TODO: un-fork once these 408 * packages have been combined into a single library. 409 */ 410 private static abstract class AbstractIterator<T> implements Iterator<T> { 411 State state = State.NOT_READY; 412 413 enum State { 414 READY, NOT_READY, DONE, FAILED, 415 } 416 417 T next; 418 419 protected abstract T computeNext(); 420 421 protected final T endOfData() { 422 state = State.DONE; 423 return null; 424 } 425 426 public final boolean hasNext() { 427 checkState(state != State.FAILED); 428 switch (state) { 429 case DONE: 430 return false; 431 case READY: 432 return true; 433 default: 434 } 435 return tryToComputeNext(); 436 } 437 438 boolean tryToComputeNext() { 439 state = State.FAILED; // temporary pessimism 440 next = computeNext(); 441 if (state != State.DONE) { 442 state = State.READY; 443 return true; 444 } 445 return false; 446 } 447 448 public final T next() { 449 if (!hasNext()) { 450 throw new NoSuchElementException(); 451 } 452 state = State.NOT_READY; 453 return next; 454 } 455 456 @Override public void remove() { 457 throw new UnsupportedOperationException(); 458 } 459 } 460 }