/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;

import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.CheckReturnValue;

/**
 * Extracts non-overlapping substrings from an input string, typically by
 * recognizing appearances of a <i>separator</i> sequence. This separator can be
 * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
 * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
 * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
 * all, a splitter can extract adjacent substrings of a given {@linkplain
 * #fixedLength fixed length}.
 *
 * <p>For example, this expression: <pre>   {@code
 *
 *   Splitter.on(',').split("foo,bar,qux")}</pre>
 *
 * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
 * {@code "qux"}, in that order.
 *
 * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
 * following expression: <pre>   {@code
 *
 *   Splitter.on(',').split(" foo,,, bar ,")}</pre>
 *
 * ... yields the substrings {@code [" foo", "", "", " bar ", ""]}. If this
 * is not the desired behavior, use configuration methods to obtain a <i>new</i>
 * splitter instance with modified behavior: <pre>   {@code
 *
 *   private static final Splitter MY_SPLITTER = Splitter.on(',')
 *       .trimResults()
 *       .omitEmptyStrings();}</pre>
 *
 * Now {@code MY_SPLITTER.split("foo,,, bar ,")} returns just {@code ["foo",
 * "bar"]}. Note that the order in which these configuration methods are called
 * is never significant.
 *
 * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
 * method has no effect on the receiving instance; you must store and use the
 * new splitter instance it returns instead. <pre>   {@code
 *
 *   // Do NOT do this
 *   Splitter splitter = Splitter.on('/');
 *   splitter.trimResults(); // does nothing!
 *   return splitter.split("wrong / wrong / wrong");}</pre>
 *
 * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
 * input string containing {@code n} occurrences of the separator naturally
 * yields an iterable of size {@code n + 1}. So if the separator does not occur
 * anywhere in the input, a single substring is returned containing the entire
 * input. Consequently, all splitters split the empty string to {@code [""]}
 * (note: even fixed-length splitters).
 *
 * <p>Splitter instances are thread-safe and immutable, and are therefore safe
 * to store as {@code static final} constants.
 *
 * <p>The {@link Joiner} class provides the inverse operation to splitting, but
 * note that a round-trip between the two should be assumed to be lossy.
 *
 * <p>See the Guava User Guide article on <a href=
 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
 * {@code Splitter}</a>.
 *
 * @author Julien Silland
 * @author Jesse Wilson
 * @author Kevin Bourrillion
 * @author Louis Wasserman
 * @since 1.0
 */
@GwtCompatible(emulated = true)
public final class Splitter {
  private final CharMatcher trimmer;
  private final boolean omitEmptyStrings;
  private final Strategy strategy;
  private final int limit;

  /** Creates a splitter with default settings: no trimming, no omission, no limit. */
  private Splitter(Strategy strategy) {
    this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
  }

  private Splitter(Strategy strategy, boolean omitEmptyStrings,
      CharMatcher trimmer, int limit) {
    this.strategy = strategy;
    this.omitEmptyStrings = omitEmptyStrings;
    this.trimmer = trimmer;
    this.limit = limit;
  }

  /**
   * Returns a splitter that uses the given single-character separator. For
   * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
   * containing {@code ["foo", "", "bar"]}.
   *
   * @param separator the character to recognize as a separator
   * @return a splitter, with default settings, that recognizes that separator
   */
  public static Splitter on(char separator) {
    return on(CharMatcher.is(separator));
  }

  /**
   * Returns a splitter that considers any single character matched by the
   * given {@code CharMatcher} to be a separator. For example, {@code
   * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
   * iterable containing {@code ["foo", "", "bar", "quux"]}.
   *
   * @param separatorMatcher a {@link CharMatcher} that determines whether a
   *     character is a separator
   * @return a splitter, with default settings, that uses this matcher
   */
  public static Splitter on(final CharMatcher separatorMatcher) {
    checkNotNull(separatorMatcher);

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          Splitter splitter, final CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override int separatorStart(int start) {
            return separatorMatcher.indexIn(toSplit, start);
          }

          @Override int separatorEnd(int separatorPosition) {
            // A matcher-based separator is always exactly one character wide.
            return separatorPosition + 1;
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that uses the given fixed string as a separator. For
   * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
   * iterable containing {@code ["foo", "bar,baz"]}.
   *
   * @param separator the literal, nonempty string to recognize as a separator
   * @return a splitter, with default settings, that recognizes that separator
   * @throws IllegalArgumentException if {@code separator} is the empty string
   */
  public static Splitter on(final String separator) {
    checkArgument(separator.length() != 0,
        "The separator may not be the empty string.");

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          Splitter splitter, CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override public int separatorStart(int start) {
            int separatorLength = separator.length();

            // Naive scan: try every candidate position that still leaves room
            // for the whole separator.
            positions:
            for (int p = start, last = toSplit.length() - separatorLength;
                p <= last; p++) {
              for (int i = 0; i < separatorLength; i++) {
                if (toSplit.charAt(i + p) != separator.charAt(i)) {
                  continue positions;
                }
              }
              return p;
            }
            return -1;
          }

          @Override public int separatorEnd(int separatorPosition) {
            return separatorPosition + separator.length();
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that considers any subsequence matching {@code
   * pattern} to be a separator. For example, {@code
   * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
   * into lines whether it uses DOS-style or UNIX-style line terminators.
   *
   * @param separatorPattern the pattern that determines whether a subsequence
   *     is a separator. This pattern may not match the empty string.
   * @return a splitter, with default settings, that uses this pattern
   * @throws IllegalArgumentException if {@code separatorPattern} matches the
   *     empty string
   */
  @GwtIncompatible("java.util.regex")
  public static Splitter on(final Pattern separatorPattern) {
    checkNotNull(separatorPattern);
    checkArgument(!separatorPattern.matcher("").matches(),
        "The pattern may not match the empty string: %s", separatorPattern);

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          final Splitter splitter, CharSequence toSplit) {
        // One matcher per iterator: the matcher is stateful and is shared
        // between separatorStart and separatorEnd below.
        final Matcher matcher = separatorPattern.matcher(toSplit);
        return new SplittingIterator(splitter, toSplit) {
          @Override public int separatorStart(int start) {
            return matcher.find(start) ? matcher.start() : -1;
          }

          @Override public int separatorEnd(int separatorPosition) {
            // Relies on the match state left behind by the preceding
            // separatorStart call; separatorPosition itself is not needed.
            return matcher.end();
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that considers any subsequence matching a given
   * pattern (regular expression) to be a separator. For example, {@code
   * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
   * whether it uses DOS-style or UNIX-style line terminators. This is
   * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
   *
   * @param separatorPattern the pattern that determines whether a subsequence
   *     is a separator. This pattern may not match the empty string.
   * @return a splitter, with default settings, that uses this pattern
   * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
   *     is a malformed expression
   * @throws IllegalArgumentException if {@code separatorPattern} matches the
   *     empty string
   */
  @GwtIncompatible("java.util.regex")
  public static Splitter onPattern(String separatorPattern) {
    return on(Pattern.compile(separatorPattern));
  }

  /**
   * Returns a splitter that divides strings into pieces of the given length.
   * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
   * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
   * smaller than {@code length} but will never be empty.
   *
   * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
   * split("")} does not yield an empty iterable, but an iterable containing
   * {@code ""}. This is the only case in which {@code
   * Iterables.size(split(input))} does not equal {@code
   * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
   * use {@code omitEmptyStrings}.
   *
   * @param length the desired length of pieces after splitting, a positive
   *     integer
   * @return a splitter, with default settings, that can split into fixed sized
   *     pieces
   * @throws IllegalArgumentException if {@code length} is zero or negative
   */
  public static Splitter fixedLength(final int length) {
    checkArgument(length > 0, "The length may not be less than 1");

    return new Splitter(new Strategy() {
      @Override public SplittingIterator iterator(
          final Splitter splitter, CharSequence toSplit) {
        return new SplittingIterator(splitter, toSplit) {
          @Override public int separatorStart(int start) {
            int nextChunkStart = start + length;
            return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
          }

          @Override public int separatorEnd(int separatorPosition) {
            // The "separator" between fixed-length chunks is zero-width.
            return separatorPosition;
          }
        };
      }
    });
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * automatically omits empty strings from the results. For example, {@code
   * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
   * iterable containing only {@code ["a", "b", "c"]}.
   *
   * <p>If either {@code trimResults} option is also specified when creating a
   * splitter, that splitter always trims results first before checking for
   * emptiness. So, for example, {@code
   * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
   * an empty iterable.
   *
   * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
   * to return an empty iterable, but when using this option, it can (if the
   * input sequence consists of nothing but separators).
   *
   * @return a splitter with the desired configuration
   */
  @CheckReturnValue
  public Splitter omitEmptyStrings() {
    return new Splitter(strategy, true, trimmer, limit);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter but
   * stops splitting after it reaches the limit.
   * The limit defines the maximum number of items returned by the iterator.
   *
   * <p>For example,
   * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
   * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the
   * omitted strings do not count. Hence,
   * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
   * returns an iterable containing {@code ["a", "b", "c,d"]}.
   * When trim is requested, all entries, including the last, are trimmed. Hence
   * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
   * results in {@code ["a", "b", "c , d"]}.
   *
   * @param limit the maximum number of items returned
   * @return a splitter with the desired configuration
   * @throws IllegalArgumentException if {@code limit} is zero or negative
   * @since 9.0
   */
  @CheckReturnValue
  public Splitter limit(int limit) {
    checkArgument(limit > 0, "must be greater than zero: %s", limit);
    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * automatically removes leading and trailing {@linkplain
   * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
   * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
   * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
   * containing {@code ["a", "b", "c"]}.
   *
   * @return a splitter with the desired configuration
   */
  @CheckReturnValue
  public Splitter trimResults() {
    return trimResults(CharMatcher.WHITESPACE);
  }

  /**
   * Returns a splitter that behaves equivalently to {@code this} splitter, but
   * removes all leading or trailing characters matching the given {@code
   * CharMatcher} from each returned substring. For example, {@code
   * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
   * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
   *
   * @param trimmer a {@link CharMatcher} that determines whether a character
   *     should be removed from the beginning/end of a subsequence
   * @return a splitter with the desired configuration
   */
  // TODO(kevinb): throw if a trimmer was already specified!
  @CheckReturnValue
  public Splitter trimResults(CharMatcher trimmer) {
    checkNotNull(trimmer);
    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
  }

  /**
   * Splits {@code sequence} into string components and makes them available
   * through an {@link Iterator}, which may be lazily evaluated. Every call to
   * {@code iterator()} on the returned iterable starts a fresh pass over
   * {@code sequence}.
   *
   * @param sequence the sequence of characters to split
   * @return an iteration over the segments split from the parameter.
   */
  public Iterable<String> split(final CharSequence sequence) {
    checkNotNull(sequence);

    return new Iterable<String>() {
      @Override public Iterator<String> iterator() {
        return spliterator(sequence);
      }
      @Override public String toString() {
        // Renders as "[a, b, c]".
        return Joiner.on(", ")
            .appendTo(new StringBuilder().append('['), this)
            .append(']')
            .toString();
      }
    };
  }

  /** Builds a new iterator over the pieces of {@code sequence}. */
  private Iterator<String> spliterator(CharSequence sequence) {
    return strategy.iterator(this, sequence);
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified separator.
   *
   * @since 10.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(String separator) {
    return withKeyValueSeparator(on(separator));
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified separator.
   *
   * @since 14.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(char separator) {
    return withKeyValueSeparator(on(separator));
  }

  /**
   * Returns a {@code MapSplitter} which splits entries based on this splitter,
   * and splits entries into keys and values using the specified key-value
   * splitter.
   *
   * @since 10.0
   */
  @CheckReturnValue
  @Beta
  public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
    return new MapSplitter(this, keyValueSplitter);
  }

  /**
   * An object that splits strings into maps as {@code Splitter} splits
   * iterables and lists. Like {@code Splitter}, it is thread-safe and
   * immutable.
   *
   * @since 10.0
   */
  @Beta
  public static final class MapSplitter {
    private static final String INVALID_ENTRY_MESSAGE =
        "Chunk [%s] is not a valid entry";
    private final Splitter outerSplitter;
    private final Splitter entrySplitter;

    private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
      this.outerSplitter = outerSplitter; // only "this" is passed
      this.entrySplitter = checkNotNull(entrySplitter);
    }

    /**
     * Splits {@code sequence} into substrings, splits each substring into
     * an entry, and returns an unmodifiable map with each of the entries. For
     * example, <code>
     * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
     * .split("a=>b ; c=>b")
     * </code> will return a mapping from {@code "a"} to {@code "b"} and
     * {@code "c"} to {@code "b"}.
     *
     * <p>The returned map preserves the order of the entries from
     * {@code sequence}.
     *
     * @throws IllegalArgumentException if the specified sequence does not split
     *     into valid map entries, or if there are duplicate keys
     */
    public Map<String, String> split(CharSequence sequence) {
      Map<String, String> map = new LinkedHashMap<String, String>();
      for (String entry : outerSplitter.split(sequence)) {
        Iterator<String> entryFields = entrySplitter.spliterator(entry);

        // Each entry must split into exactly two fields: key, then value.
        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
        String key = entryFields.next();
        checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);

        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
        String value = entryFields.next();
        map.put(key, value);

        checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
      }
      return Collections.unmodifiableMap(map);
    }
  }

  /** Creates the iterator that walks one input sequence for a given splitter. */
  private interface Strategy {
    Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
  }

  /**
   * Iterator skeleton shared by all strategies: subclasses only locate
   * separators, while this class applies trimming, empty-string omission and
   * the item limit copied from the owning {@code Splitter}.
   */
  private abstract static class SplittingIterator
      extends AbstractIterator<String> {
    final CharSequence toSplit;
    final CharMatcher trimmer;
    final boolean omitEmptyStrings;

    /**
     * Returns the first index in {@code toSplit} at or after {@code start}
     * that contains the separator, or {@code -1} if there is none.
     */
    abstract int separatorStart(int start);

    /**
     * Returns the first index in {@code toSplit} after {@code
     * separatorPosition} that does not contain a separator. This method is only
     * invoked after a call to {@code separatorStart}.
     */
    abstract int separatorEnd(int separatorPosition);

    /** Where to resume searching for a separator; {@code -1} once exhausted. */
    int offset = 0;

    /** Number of items that may still be returned. */
    int limit;

    protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
      this.trimmer = splitter.trimmer;
      this.omitEmptyStrings = splitter.omitEmptyStrings;
      this.limit = splitter.limit;
      this.toSplit = toSplit;
    }

    @Override protected String computeNext() {
      /*
       * The returned string will be from the end of the last match to the
       * beginning of the next one. nextStart is the start position of the
       * returned substring, while offset is the place to start looking for a
       * separator.
       */
      int nextStart = offset;
      while (offset != -1) {
        int start = nextStart;
        int end;

        int separatorPosition = separatorStart(offset);
        if (separatorPosition == -1) {
          end = toSplit.length();
          offset = -1;
        } else {
          end = separatorPosition;
          offset = separatorEnd(separatorPosition);
        }
        if (offset == nextStart) {
          /*
           * This occurs when some pattern has an empty match, even if it
           * doesn't match the empty string -- for example, if it requires
           * lookahead or the like. The offset must be increased to look for
           * separators beyond this point, without changing the start position
           * of the next returned substring -- so nextStart stays the same.
           *
           * Only give up once offset has moved PAST the end of the input.
           * offset == length is still a legal search position, and the pass
           * that starts there is the one that emits the pending substring
           * [nextStart, length). Stopping as soon as offset reached the end
           * silently dropped that tail: splitting "ab" on "(?=b)" produced
           * just ["a"].
           */
          offset++;
          if (offset > toSplit.length()) {
            offset = -1;
          }
          continue;
        }

        while (start < end && trimmer.matches(toSplit.charAt(start))) {
          start++;
        }
        while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
          end--;
        }

        if (omitEmptyStrings && start == end) {
          // Don't include the (unused) separator in next split string.
          nextStart = offset;
          continue;
        }

        if (limit == 1) {
          // The limit has been reached, return the rest of the string as the
          // final item. This is tested after empty string removal so that
          // empty strings do not count towards the limit.
          end = toSplit.length();
          offset = -1;
          // Since we may have changed the end, we need to trim it again.
          while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
            end--;
          }
        } else {
          limit--;
        }

        return toSplit.subSequence(start, end).toString();
      }
      return endOfData();
    }
  }
}