/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;

import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.CheckReturnValue;

/**
 * An object that divides strings (or other instances of {@code CharSequence})
 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
 * which can be expressed as a single character, literal string, regular
 * expression, {@code CharMatcher}, or by using a fixed substring length. This
 * class provides the complementary functionality to {@link Joiner}.
 *
 * <p>Here is the most basic example of {@code Splitter} usage: <pre>   {@code
 *
 *   Splitter.on(',').split("foo,bar")}</pre>
 *
 * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
 * and {@code "bar"}, in that order.
 *
 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre>   {@code
 *
 *   Splitter.on(',').split("foo,,bar, quux")}</pre>
 *
 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
 * Notice that the splitter does not assume that you want empty strings removed,
 * or that you wish to trim whitespace. If you want features like these, simply
 * ask for them: <pre> {@code
 *
 *   private static final Splitter MY_SPLITTER = Splitter.on(',')
 *       .trimResults()
 *       .omitEmptyStrings();}</pre>
 *
 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
 * the configuration methods are called is never significant; for instance,
 * trimming is always applied first before checking for an empty result,
 * regardless of the order in which the {@link #trimResults()} and
 * {@link #omitEmptyStrings()} methods were invoked.
 *
 * <p><b>Warning: splitter instances are always immutable</b>; a configuration
 * method such as {@code omitEmptyStrings} has no effect on the instance it
 * is invoked on! You must store and use the new splitter instance returned by
 * the method. This makes splitters thread-safe, and safe to store as {@code
 * static final} constants (as illustrated above). <pre>   {@code
 *
 *   // Bad! Do not do this!
 *   Splitter splitter = Splitter.on('/');
 *   splitter.trimResults(); // does nothing!
 *   return splitter.split("wrong / wrong / wrong");}</pre>
 *
 * The separator recognized by the splitter does not have to be a single
 * literal character as in the examples above. See the methods {@link
 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
 * of other ways to specify separators.
 *
 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
 * similar JDK methods; for instance, it does not silently discard trailing
 * separators, as does {@link String#split(String)}, nor does it have a default
 * behavior of using five particular whitespace characters as separators, like
 * {@link java.util.StringTokenizer}.
 *
 * <p>See the Guava User Guide article on <a href=
 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
 * {@code Splitter}</a>.
 *
 * @author Julien Silland
 * @author Jesse Wilson
 * @author Kevin Bourrillion
 * @author Louis Wasserman
 * @since 1.0
 */
@GwtCompatible(emulated = true)
public final class Splitter {
  /** Matches the characters stripped from both ends of every piece. */
  private final CharMatcher trimmer;
  /** Whether pieces that are empty after trimming are skipped. */
  private final boolean omitEmptyStrings;
  /** Creates the iterator that locates separators in a given input. */
  private final Strategy strategy;
  /** Maximum number of pieces an iterator produced by this splitter returns. */
  private final int limit;

  /**
   * Creates a splitter with default settings: empty strings are kept, nothing
   * is trimmed ({@code CharMatcher.NONE}) and the limit is
   * {@code Integer.MAX_VALUE}.
   */
  private Splitter(Strategy strategy) {
    this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
  }

  /** Creates a splitter with every setting given explicitly. */
  private Splitter(Strategy strategy, boolean omitEmptyStrings,
      CharMatcher trimmer, int limit) {
    this.strategy = strategy;
    this.omitEmptyStrings = omitEmptyStrings;
    this.trimmer = trimmer;
    this.limit = limit;
  }

  /**
   * Returns a splitter that uses the given single-character separator. For
   * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
   * containing {@code ["foo", "", "bar"]}.
   *
   * @param separator the character to recognize as a separator
   * @return a splitter, with default settings, that recognizes that separator
   */
  public static Splitter on(char separator) {
    return on(CharMatcher.is(separator));
  }

  /**
   * Returns a splitter that considers any single character matched by the
   * given {@code CharMatcher} to be a separator. For example, {@code
   * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
   * iterable containing {@code ["foo", "", "bar", "quux"]}.
   *
   * @param separatorMatcher a {@link CharMatcher} that determines whether a
   *     character is a separator
   * @return a splitter, with default settings, that uses this matcher
   */
  public static Splitter on(final CharMatcher separatorMatcher) {
    checkNotNull(separatorMatcher);

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          Splitter splitter, final CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override int separatorStart(int start) {
            return separatorMatcher.indexIn(toSplit, start);
          }

          @Override int separatorEnd(int separatorPosition) {
            // The separator is always exactly one character wide.
            return separatorPosition + 1;
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that uses the given fixed string as a separator. For
   * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
   * iterable containing {@code ["foo", "bar", "baz,qux"]}.
   *
   * @param separator the literal, nonempty string to recognize as a separator
   * @return a splitter, with default settings, that recognizes that separator
   * @throws IllegalArgumentException if {@code separator} is the empty string
   */
  public static Splitter on(final String separator) {
    checkArgument(separator.length() != 0,
        "The separator may not be the empty string.");

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          Splitter splitter, CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override int separatorStart(int start) {
            int separatorLength = separator.length();

            // Try every position at which the whole separator still fits.
            positions:
            for (int p = start, last = toSplit.length() - separatorLength;
                p <= last; p++) {
              for (int i = 0; i < separatorLength; i++) {
                if (toSplit.charAt(i + p) != separator.charAt(i)) {
                  continue positions;
                }
              }
              return p;
            }
            return -1;
          }

          @Override int separatorEnd(int separatorPosition) {
            return separatorPosition + separator.length();
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that considers any subsequence matching {@code
   * pattern} to be a separator. For example, {@code
   * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
   * into lines whether it uses DOS-style or UNIX-style line terminators.
   *
   * @param separatorPattern the pattern that determines whether a subsequence
   *     is a separator. This pattern may not match the empty string.
   * @return a splitter, with default settings, that uses this pattern
   * @throws IllegalArgumentException if {@code separatorPattern} matches the
   *     empty string
   */
  @GwtIncompatible("java.util.regex")
  public static Splitter on(final Pattern separatorPattern) {
    checkNotNull(separatorPattern);
    checkArgument(!separatorPattern.matcher("").matches(),
        "The pattern may not match the empty string: %s", separatorPattern);

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          final Splitter splitter, CharSequence toSplit) {
        // One matcher per iterator; separatorEnd reads the state left behind
        // by the preceding separatorStart call.
        final Matcher matcher = separatorPattern.matcher(toSplit);
        return new SplittingIterator(splitter, toSplit) {
          @Override int separatorStart(int start) {
            return matcher.find(start) ? matcher.start() : -1;
          }

          @Override int separatorEnd(int separatorPosition) {
            return matcher.end();
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that considers any subsequence matching a given
   * pattern (regular expression) to be a separator. For example, {@code
   * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
   * whether it uses DOS-style or UNIX-style line terminators. This is
   * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
   *
   * @param separatorPattern the pattern that determines whether a subsequence
   *     is a separator. This pattern may not match the empty string.
   * @return a splitter, with default settings, that uses this pattern
   * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
   *     is a malformed expression
   * @throws IllegalArgumentException if {@code separatorPattern} matches the
   *     empty string
   */
  @GwtIncompatible("java.util.regex")
  public static Splitter onPattern(String separatorPattern) {
    return on(Pattern.compile(separatorPattern));
  }

  /**
   * Returns a splitter that divides strings into pieces of the given length.
   * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
   * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
   * smaller than {@code length} but will never be empty.
   *
   * @param length the desired length of pieces after splitting
   * @return a splitter, with default settings, that can split into fixed sized
   *     pieces
   * @throws IllegalArgumentException if {@code length} is zero or negative
   */
  public static Splitter fixedLength(final int length) {
    checkArgument(length > 0, "The length may not be less than 1");

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          final Splitter splitter, CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override int separatorStart(int start) {
            int nextChunkStart = start + length;
            return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
          }

          @Override int separatorEnd(int separatorPosition) {
            // The "separator" is zero-width: the next chunk starts right here.
            return separatorPosition;
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * automatically omits empty strings from the results. For example, {@code
   * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
   * iterable containing only {@code ["a", "b", "c"]}.
   *
   * <p>If either {@code trimResults} option is also specified when creating a
   * splitter, that splitter always trims results first before checking for
   * emptiness. So, for example, {@code
   * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
   * an empty iterable.
   *
   * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
   * to return an empty iterable, but when using this option, it can (if the
   * input sequence consists of nothing but separators).
   *
   * @return a splitter with the desired configuration
   */
  @CheckReturnValue
  public Splitter omitEmptyStrings() {
    return new Splitter(strategy, true, trimmer, limit);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter but
   * stops splitting after it reaches the limit.
   * The limit defines the maximum number of items returned by the iterator.
   *
   * <p>For example,
   * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
   * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
   * omitted strings do not count.  Hence,
   * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
   * returns an iterable containing {@code ["a", "b", "c,d"]}.
   * When trim is requested, all entries, including the last are trimmed.  Hence
   * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
   * results in {@code ["a", "b", "c , d"]}.
   *
   * @param limit the maximum number of items returned
   * @return a splitter with the desired configuration
   * @throws IllegalArgumentException if {@code limit} is zero or negative
   * @since 9.0
   */
  @CheckReturnValue
  public Splitter limit(int limit) {
    checkArgument(limit > 0, "must be greater than zero: %s", limit);
    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * automatically removes leading and trailing {@linkplain
   * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
   * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
   * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
   * containing {@code ["a", "b", "c"]}.
   *
   * @return a splitter with the desired configuration
   */
  @CheckReturnValue
  public Splitter trimResults() {
    return trimResults(CharMatcher.WHITESPACE);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * removes all leading or trailing characters matching the given {@code
   * CharMatcher} from each returned substring. For example, {@code
   * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
   * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
   *
   * @param trimmer a {@link CharMatcher} that determines whether a character
   *     should be removed from the beginning/end of a subsequence
   * @return a splitter with the desired configuration
   */
  // TODO(kevinb): throw if a trimmer was already specified!
  @CheckReturnValue
  public Splitter trimResults(CharMatcher trimmer) {
    checkNotNull(trimmer);
    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
  }

  /**
   * Splits {@code sequence} into string components and makes them available
   * through an {@link Iterator}, which may be lazily evaluated.
   *
   * @param sequence the sequence of characters to split
   * @return an iteration over the segments split from the parameter.
   */
  public Iterable<String> split(final CharSequence sequence) {
    checkNotNull(sequence);

    return new Iterable<String>() {
      @Override public Iterator<String> iterator() {
        // A fresh iterator per call, so the iterable can be traversed
        // repeatedly.
        return spliterator(sequence);
      }
      @Override public String toString() {
        return Joiner.on(", ")
            .appendTo(new StringBuilder().append('['), this)
            .append(']')
            .toString();
      }
    };
  }

  /**
   * Returns a new iterator over the pieces of {@code sequence}, created by
   * this splitter's strategy and configured with this splitter's settings.
   */
  private Iterator<String> spliterator(CharSequence sequence) {
    return strategy.iterator(this, sequence);
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified separator.
   *
   * @since 10.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(String separator) {
    return withKeyValueSeparator(on(separator));
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified key-value
   * splitter.
   *
   * @since 10.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
    return new MapSplitter(this, keyValueSplitter);
  }

  /**
   * An object that splits strings into maps as {@code Splitter} splits
   * iterables and lists. Like {@code Splitter}, it is thread-safe and
   * immutable.
   *
   * @since 10.0
   */
  @Beta
  public static final class MapSplitter {
    private static final String INVALID_ENTRY_MESSAGE =
        "Chunk [%s] is not a valid entry";
    /** Splits the input into entries. */
    private final Splitter outerSplitter;
    /** Splits a single entry into its key and its value. */
    private final Splitter entrySplitter;

    private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
      this.outerSplitter = outerSplitter; // only "this" is passed
      this.entrySplitter = checkNotNull(entrySplitter);
    }

    /**
     * Splits {@code sequence} into substrings, splits each substring into
     * an entry, and returns an unmodifiable map with each of the entries. For
     * example, <code>
     * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
     * .split("a=>b ; c=>b")
     * </code> will return a mapping from {@code "a"} to {@code "b"} and
     * {@code "c"} to {@code "b"}.
     *
     * <p>The returned map preserves the order of the entries from
     * {@code sequence}.
     *
     * @throws IllegalArgumentException if the specified sequence does not split
     *     into valid map entries, or if there are duplicate keys
     */
    public Map<String, String> split(CharSequence sequence) {
      Map<String, String> map = new LinkedHashMap<String, String>();
      for (String entry : outerSplitter.split(sequence)) {
        Iterator<String> entryFields = entrySplitter.spliterator(entry);

        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
        String key = entryFields.next();
        checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);

        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
        String value = entryFields.next();
        map.put(key, value);

        // An entry must consist of exactly two fields: key and value.
        checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
      }
      return Collections.unmodifiableMap(map);
    }
  }

  /**
   * Creates the iterator that yields the pieces of {@code toSplit}, using the
   * settings of the given splitter.
   */
  private interface Strategy {
    Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
  }

  /**
   * Skeleton iterator over the pieces of one input. Subclasses only locate
   * separators; trimming, omission of empty pieces and the limit are handled
   * here.
   */
  private abstract static class SplittingIterator
      extends AbstractIterator<String> {
    final CharSequence toSplit;
    final CharMatcher trimmer;
    final boolean omitEmptyStrings;

    /**
     * Returns the first index in {@code toSplit} at or after {@code start}
     * that contains the separator, or {@code -1} if there is none.
     */
    abstract int separatorStart(int start);

    /**
     * Returns the first index in {@code toSplit} after {@code
     * separatorPosition} that does not contain a separator. This method is only
     * invoked after a call to {@code separatorStart}.
     */
    abstract int separatorEnd(int separatorPosition);

    /** Where to look for the next separator; {@code -1} once exhausted. */
    int offset = 0;
    /** Number of pieces that may still be returned. */
    int limit;

    protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
      this.trimmer = splitter.trimmer;
      this.omitEmptyStrings = splitter.omitEmptyStrings;
      this.limit = splitter.limit;
      this.toSplit = toSplit;
    }

    @Override protected String computeNext() {
      /*
       * The returned string will be from the end of the last match to the
       * beginning of the next one. nextStart is the start position of the
       * returned substring, while offset is the place to start looking for a
       * separator.
       */
      int nextStart = offset;
      while (offset != -1) {
        int start = nextStart;
        int end;

        int separatorPosition = separatorStart(offset);
        if (separatorPosition == -1) {
          // No further separator: the rest of the input is the last piece.
          end = toSplit.length();
          offset = -1;
        } else {
          end = separatorPosition;
          offset = separatorEnd(separatorPosition);
        }
        if (offset == nextStart) {
          /*
           * This occurs when some pattern has an empty match, even if it
           * doesn't match the empty string -- for example, if it requires
           * lookahead or the like. The offset must be increased to look for
           * separators beyond this point, without changing the start position
           * of the next returned substring -- so nextStart stays the same.
           */
          offset++;
          /*
           * Only give up once offset is past the end of the input. An offset
           * equal to the length must still go through another pass so that
           * the remaining text is returned; stopping here would drop the
           * final piece, e.g. when splitting "f" on the pattern \b.
           */
          if (offset > toSplit.length()) {
            offset = -1;
          }
          continue;
        }

        while (start < end && trimmer.matches(toSplit.charAt(start))) {
          start++;
        }
        while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
          end--;
        }

        if (omitEmptyStrings && start == end) {
          // Don't include the (unused) separator in next split string.
          nextStart = offset;
          continue;
        }

        if (limit == 1) {
          // The limit has been reached, return the rest of the string as the
          // final item.  This is tested after empty string removal so that
          // empty strings do not count towards the limit.
          end = toSplit.length();
          offset = -1;
          // Since we may have changed the end, we need to trim it again.
          while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
            end--;
          }
        } else {
          limit--;
        }

        return toSplit.subSequence(start, end).toString();
      }
      return endOfData();
    }
  }
}