001 /*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021 import static com.google.common.base.Preconditions.checkState;
022
023 import com.google.common.annotations.Beta;
024 import com.google.common.annotations.GwtCompatible;
025 import com.google.common.annotations.GwtIncompatible;
026
027 import java.util.Iterator;
028 import java.util.NoSuchElementException;
029 import java.util.regex.Matcher;
030 import java.util.regex.Pattern;
031
032 /**
033 * An object that divides strings (or other instances of {@code CharSequence})
034 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
035 * which can be expressed as a single character, literal string, regular
036 * expression, {@code CharMatcher}, or by using a fixed substring length. This
037 * class provides the complementary functionality to {@link Joiner}.
038 *
039 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code
040 *
041 * Splitter.on(',').split("foo,bar")}</pre>
042 *
043 * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
044 * and {@code "bar"}, in that order.
045 *
046 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code
047 *
048 * Splitter.on(',').split("foo,,bar, quux")}</pre>
049 *
050 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
051 * Notice that the splitter does not assume that you want empty strings removed,
052 * or that you wish to trim whitespace. If you want features like these, simply
053 * ask for them: <pre> {@code
054 *
055 * private static final Splitter MY_SPLITTER = Splitter.on(',')
056 * .trimResults()
057 * .omitEmptyStrings();}</pre>
058 *
059 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
060 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
061 * the configuration methods are called is never significant; for instance,
062 * trimming is always applied first before checking for an empty result,
063 * regardless of the order in which the {@link #trimResults()} and
064 * {@link #omitEmptyStrings()} methods were invoked.
065 *
066 * <p><b>Warning: splitter instances are always immutable</b>; a configuration
067 * method such as {@code omitEmptyStrings} has no effect on the instance it
068 * is invoked on! You must store and use the new splitter instance returned by
069 * the method. This makes splitters thread-safe, and safe to store as {@code
070 * static final} constants (as illustrated above). <pre> {@code
071 *
072 * // Bad! Do not do this!
073 * Splitter splitter = Splitter.on('/');
074 * splitter.trimResults(); // does nothing!
075 * return splitter.split("wrong / wrong / wrong");}</pre>
076 *
077 * The separator recognized by the splitter does not have to be a single
078 * literal character as in the examples above. See the methods {@link
079 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
080 * of other ways to specify separators.
081 *
082 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
083 * similar JDK methods; for instance, it does not silently discard trailing
084 * separators, as does {@link String#split(String)}, nor does it have a default
085 * behavior of using five particular whitespace characters as separators, like
086 * {@link java.util.StringTokenizer}.
087 *
088 * @author Julien Silland
089 * @author Jesse Wilson
090 * @author Kevin Bourrillion
091 * @since 1
092 */
093 @GwtCompatible(emulated = true)
094 public final class Splitter {
/** Locates separators and produces the iterator that does the splitting. */
private final Strategy strategy;
/** Whether pieces that are empty (after trimming) are skipped. */
private final boolean omitEmptyStrings;
/** Matches the characters stripped from both ends of every piece. */
private final CharMatcher trimmer;
/** Maximum number of pieces a single iteration may return. */
private final int limit;

/**
 * Creates a fully configured splitter. Every configuration method funnels
 * through this constructor, so an instance never changes after creation.
 */
private Splitter(Strategy strategy, boolean omitEmptyStrings,
    CharMatcher trimmer, int limit) {
  this.limit = limit;
  this.trimmer = trimmer;
  this.omitEmptyStrings = omitEmptyStrings;
  this.strategy = strategy;
}

/**
 * Creates a splitter with default settings for the given strategy: empty
 * strings are kept, {@code CharMatcher.NONE} is the trimmer, and the limit
 * is {@code Integer.MAX_VALUE}.
 */
private Splitter(Strategy strategy) {
  this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
}
111
112 /**
113 * Returns a splitter that uses the given single-character separator. For
114 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
115 * containing {@code ["foo", "", "bar"]}.
116 *
117 * @param separator the character to recognize as a separator
118 * @return a splitter, with default settings, that recognizes that separator
119 */
120 public static Splitter on(char separator) {
121 return on(CharMatcher.is(separator));
122 }
123
124 /**
125 * Returns a splitter that considers any single character matched by the
126 * given {@code CharMatcher} to be a separator. For example, {@code
127 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
128 * iterable containing {@code ["foo", "", "bar", "quux"]}.
129 *
130 * @param separatorMatcher a {@link CharMatcher} that determines whether a
131 * character is a separator
132 * @return a splitter, with default settings, that uses this matcher
133 */
134 public static Splitter on(final CharMatcher separatorMatcher) {
135 checkNotNull(separatorMatcher);
136
137 return new Splitter(new Strategy() {
138 @Override public SplittingIterator iterator(
139 Splitter splitter, final CharSequence toSplit) {
140 return new SplittingIterator(splitter, toSplit) {
141 @Override int separatorStart(int start) {
142 return separatorMatcher.indexIn(toSplit, start);
143 }
144
145 @Override int separatorEnd(int separatorPosition) {
146 return separatorPosition + 1;
147 }
148 };
149 }
150 });
151 }
152
153 /**
154 * Returns a splitter that uses the given fixed string as a separator. For
155 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
156 * iterable containing {@code ["foo", "bar", "baz,qux"]}.
157 *
158 * @param separator the literal, nonempty string to recognize as a separator
159 * @return a splitter, with default settings, that recognizes that separator
160 */
161 public static Splitter on(final String separator) {
162 checkArgument(separator.length() != 0,
163 "The separator may not be the empty string.");
164
165 return new Splitter(new Strategy() {
166 @Override public SplittingIterator iterator(
167 Splitter splitter, CharSequence toSplit) {
168 return new SplittingIterator(splitter, toSplit) {
169 @Override public int separatorStart(int start) {
170 int delimeterLength = separator.length();
171
172 positions:
173 for (int p = start, last = toSplit.length() - delimeterLength;
174 p <= last; p++) {
175 for (int i = 0; i < delimeterLength; i++) {
176 if (toSplit.charAt(i + p) != separator.charAt(i)) {
177 continue positions;
178 }
179 }
180 return p;
181 }
182 return -1;
183 }
184
185 @Override public int separatorEnd(int separatorPosition) {
186 return separatorPosition + separator.length();
187 }
188 };
189 }
190 });
191 }
192
193 /**
194 * Returns a splitter that considers any subsequence matching {@code
195 * pattern} to be a separator. For example, {@code
196 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
197 * into lines whether it uses DOS-style or UNIX-style line terminators.
198 *
199 * @param separatorPattern the pattern that determines whether a subsequence
200 * is a separator. This pattern may not match the empty string.
201 * @return a splitter, with default settings, that uses this pattern
202 * @throws IllegalArgumentException if {@code separatorPattern} matches the
203 * empty string
204 */
205 @GwtIncompatible("java.util.regex")
206 public static Splitter on(final Pattern separatorPattern) {
207 checkNotNull(separatorPattern);
208 checkArgument(!separatorPattern.matcher("").matches(),
209 "The pattern may not match the empty string: %s", separatorPattern);
210
211 return new Splitter(new Strategy() {
212 @Override public SplittingIterator iterator(
213 final Splitter splitter, CharSequence toSplit) {
214 final Matcher matcher = separatorPattern.matcher(toSplit);
215 return new SplittingIterator(splitter, toSplit) {
216 @Override public int separatorStart(int start) {
217 return matcher.find(start) ? matcher.start() : -1;
218 }
219
220 @Override public int separatorEnd(int separatorPosition) {
221 return matcher.end();
222 }
223 };
224 }
225 });
226 }
227
228 /**
229 * Returns a splitter that considers any subsequence matching a given
230 * pattern (regular expression) to be a separator. For example, {@code
231 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
232 * whether it uses DOS-style or UNIX-style line terminators. This is
233 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
234 *
235 * @param separatorPattern the pattern that determines whether a subsequence
236 * is a separator. This pattern may not match the empty string.
237 * @return a splitter, with default settings, that uses this pattern
238 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
239 * is a malformed expression
240 * @throws IllegalArgumentException if {@code separatorPattern} matches the
241 * empty string
242 */
243 @GwtIncompatible("java.util.regex")
244 public static Splitter onPattern(String separatorPattern) {
245 return on(Pattern.compile(separatorPattern));
246 }
247
248 /**
249 * Returns a splitter that divides strings into pieces of the given length.
250 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
251 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
252 * smaller than {@code length} but will never be empty.
253 *
254 * @param length the desired length of pieces after splitting
255 * @return a splitter, with default settings, that can split into fixed sized
256 * pieces
257 */
258 public static Splitter fixedLength(final int length) {
259 checkArgument(length > 0, "The length may not be less than 1");
260
261 return new Splitter(new Strategy() {
262 @Override public SplittingIterator iterator(
263 final Splitter splitter, CharSequence toSplit) {
264 return new SplittingIterator(splitter, toSplit) {
265 @Override public int separatorStart(int start) {
266 int nextChunkStart = start + length;
267 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
268 }
269
270 @Override public int separatorEnd(int separatorPosition) {
271 return separatorPosition;
272 }
273 };
274 }
275 });
276 }
277
278 /**
279 * Returns a splitter that behaves equivalently to {@code this} splitter, but
280 * automatically omits empty strings from the results. For example, {@code
281 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
282 * iterable containing only {@code ["a", "b", "c"]}.
283 *
284 * <p>If either {@code trimResults} option is also specified when creating a
285 * splitter, that splitter always trims results first before checking for
286 * emptiness. So, for example, {@code
287 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
288 * an empty iterable.
289 *
290 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
291 * to return an empty iterable, but when using this option, it can (if the
292 * input sequence consists of nothing but separators).
293 *
294 * @return a splitter with the desired configuration
295 */
296 public Splitter omitEmptyStrings() {
297 return new Splitter(strategy, true, trimmer, limit);
298 }
299
300 /**
301 * Returns a splitter that behaves equivalently to {@code this} splitter but
302 * stops splitting after it reaches the limit.
303 * The limit defines the maximum number of items returned by the iterator.
304 *
305 * <p>For example,
306 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
307 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the
308 * omitted strings do no count. Hence,
309 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
310 * returns an iterable containing {@code ["a", "b", "c,d"}.
311 * When trim is requested, all entries, including the last are trimmed. Hence
312 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
313 * results in @{code ["a", "b", "c , d"]}.
314 *
315 * @param limit the maximum number of items returns
316 * @return a splitter with the desired configuration
317 * @since 9
318 */
319 @Beta
320 public Splitter limit(int limit) {
321 checkArgument(limit > 0, "must be greater then zero: %s", limit);
322 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
323 }
324
325 /**
326 * Returns a splitter that behaves equivalently to {@code this} splitter, but
327 * automatically removes leading and trailing {@linkplain
328 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
329 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
330 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
331 * containing {@code ["a", "b", "c"]}.
332 *
333 * @return a splitter with the desired configuration
334 */
335 public Splitter trimResults() {
336 return trimResults(CharMatcher.WHITESPACE);
337 }
338
339 /**
340 * Returns a splitter that behaves equivalently to {@code this} splitter, but
341 * removes all leading or trailing characters matching the given {@code
342 * CharMatcher} from each returned substring. For example, {@code
343 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
344 * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
345 *
346 * @param trimmer a {@link CharMatcher} that determines whether a character
347 * should be removed from the beginning/end of a subsequence
348 * @return a splitter with the desired configuration
349 */
350 // TODO(kevinb): throw if a trimmer was already specified!
351 public Splitter trimResults(CharMatcher trimmer) {
352 checkNotNull(trimmer);
353 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
354 }
355
356 /**
357 * Splits {@code sequence} into string components and makes them available
358 * through an {@link Iterator}, which may be lazily evaluated.
359 *
360 * @param sequence the sequence of characters to split
361 * @return an iteration over the segments split from the parameter.
362 */
363 public Iterable<String> split(final CharSequence sequence) {
364 checkNotNull(sequence);
365
366 return new Iterable<String>() {
367 @Override public Iterator<String> iterator() {
368 return strategy.iterator(Splitter.this, sequence);
369 }
370 };
371 }
372
373 private interface Strategy {
374 Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
375 }
376
377 private abstract static class SplittingIterator
378 extends AbstractIterator<String> {
379 final CharSequence toSplit;
380 final CharMatcher trimmer;
381 final boolean omitEmptyStrings;
382
383 /**
384 * Returns the first index in {@code toSplit} at or after {@code start}
385 * that contains the separator.
386 */
387 abstract int separatorStart(int start);
388
389 /**
390 * Returns the first index in {@code toSplit} after {@code
391 * separatorPosition} that does not contain a separator. This method is only
392 * invoked after a call to {@code separatorStart}.
393 */
394 abstract int separatorEnd(int separatorPosition);
395
396 int offset = 0;
397 int limit;
398
399 protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
400 this.trimmer = splitter.trimmer;
401 this.omitEmptyStrings = splitter.omitEmptyStrings;
402 this.limit = splitter.limit;
403 this.toSplit = toSplit;
404 }
405
406 @Override protected String computeNext() {
407 while (offset != -1) {
408 int start = offset;
409 int end;
410
411 int separatorPosition = separatorStart(offset);
412 if (separatorPosition == -1) {
413 end = toSplit.length();
414 offset = -1;
415 } else {
416 end = separatorPosition;
417 offset = separatorEnd(separatorPosition);
418 }
419
420 while (start < end && trimmer.matches(toSplit.charAt(start))) {
421 start++;
422 }
423 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
424 end--;
425 }
426
427 if (omitEmptyStrings && start == end) {
428 continue;
429 }
430
431 if (limit == 1) {
432 // The limit has been reached, return the rest of the string as the
433 // final item. This is tested after empty string removal so that
434 // empty strings do not count towards the limit.
435 end = toSplit.length();
436 offset = -1;
437 // Since we may have changed the end, we need to trim it again.
438 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
439 end--;
440 }
441 } else {
442 limit--;
443 }
444
445 return toSplit.subSequence(start, end).toString();
446 }
447 return endOfData();
448 }
449 }
450
451 /*
452 * Copied from common.collect.AbstractIterator. TODO(kevinb): un-fork if these
453 * packages are ever combined into a single library.
454 */
455 private abstract static class AbstractIterator<T> implements Iterator<T> {
456 State state = State.NOT_READY;
457
458 enum State {
459 READY, NOT_READY, DONE, FAILED,
460 }
461
462 T next;
463
464 protected abstract T computeNext();
465
466 protected final T endOfData() {
467 state = State.DONE;
468 return null;
469 }
470
471 @Override
472 public final boolean hasNext() {
473 checkState(state != State.FAILED);
474 switch (state) {
475 case DONE:
476 return false;
477 case READY:
478 return true;
479 default:
480 }
481 return tryToComputeNext();
482 }
483
484 boolean tryToComputeNext() {
485 state = State.FAILED; // temporary pessimism
486 next = computeNext();
487 if (state != State.DONE) {
488 state = State.READY;
489 return true;
490 }
491 return false;
492 }
493
494 @Override
495 public final T next() {
496 if (!hasNext()) {
497 throw new NoSuchElementException();
498 }
499 state = State.NOT_READY;
500 return next;
501 }
502
503 @Override public void remove() {
504 throw new UnsupportedOperationException();
505 }
506 }
507 }