001 /*
002 * Copyright (C) 2009 Google Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.base;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021 import static com.google.common.base.Preconditions.checkState;
022
023 import com.google.common.annotations.GwtCompatible;
024 import com.google.common.annotations.GwtIncompatible;
025
026 import java.util.Iterator;
027 import java.util.NoSuchElementException;
028 import java.util.regex.Matcher;
029 import java.util.regex.Pattern;
030
031 /**
032 * An object that divides strings (or other instances of {@code CharSequence})
033 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
034 * which can be expressed as a single character, literal string, regular
035 * expression, {@code CharMatcher}, or by using a fixed substring length. This
036 * class provides the complementary functionality to {@link Joiner}.
037 *
038 * <p>Here is the most basic example of {@code Splitter} usage: <pre> {@code
039 *
040 * Splitter.on(',').split("foo,bar")}</pre>
041 *
042 * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
043 * and {@code "bar"}, in that order.
044 *
045 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre> {@code
046 *
047 * Splitter.on(',').split("foo,,bar, quux")}</pre>
048 *
049 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
050 * Notice that the splitter does not assume that you want empty strings removed,
051 * or that you wish to trim whitespace. If you want features like these, simply
052 * ask for them: <pre> {@code
053 *
054 * private static final Splitter MY_SPLITTER = Splitter.on(',')
055 * .trimResults()
056 * .omitEmptyStrings();}</pre>
057 *
058 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
059 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
060 * the configuration methods are called is never significant; for instance,
061 * trimming is always applied first before checking for an empty result,
062 * regardless of the order in which the {@link #trimResults()} and
063 * {@link #omitEmptyStrings()} methods were invoked.
064 *
065 * <p><b>Warning: splitter instances are always immutable</b>; a configuration
066 * method such as {@code omitEmptyStrings} has no effect on the instance it
067 * is invoked on! You must store and use the new splitter instance returned by
068 * the method. This makes splitters thread-safe, and safe to store as {@code
069 * static final} constants (as illustrated above). <pre> {@code
070 *
071 * // Bad! Do not do this!
072 * Splitter splitter = Splitter.on('/');
073 * splitter.trimResults(); // does nothing!
074 * return splitter.split("wrong / wrong / wrong");}</pre>
075 *
076 * The separator recognized by the splitter does not have to be a single
077 * literal character as in the examples above. See the methods {@link
078 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
079 * of other ways to specify separators.
080 *
081 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
082 * similar JDK methods; for instance, it does not silently discard trailing
083 * separators, as does {@link String#split(String)}, nor does it have a default
084 * behavior of using five particular whitespace characters as separators, like
085 * {@link java.util.StringTokenizer}.
086 *
087 * @author Julien Silland
088 * @author Jesse Wilson
089 * @author Kevin Bourrillion
090 * @since 1
091 */
092 @GwtCompatible
093 public final class Splitter {
094 private final CharMatcher trimmer;
095 private final boolean omitEmptyStrings;
096 private final Strategy strategy;
097
098 private Splitter(Strategy strategy) {
099 this(strategy, false, CharMatcher.NONE);
100 }
101
102 private Splitter(Strategy strategy, boolean omitEmptyStrings,
103 CharMatcher trimmer) {
104 this.strategy = strategy;
105 this.omitEmptyStrings = omitEmptyStrings;
106 this.trimmer = trimmer;
107 }
108
109 /**
110 * Returns a splitter that uses the given single-character separator. For
111 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
112 * containing {@code ["foo", "", "bar"]}.
113 *
114 * @param separator the character to recognize as a separator
115 * @return a splitter, with default settings, that recognizes that separator
116 */
117 public static Splitter on(char separator) {
118 return on(CharMatcher.is(separator));
119 }
120
121 /**
122 * Returns a splitter that considers any single character matched by the
123 * given {@code CharMatcher} to be a separator. For example, {@code
124 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
125 * iterable containing {@code ["foo", "", "bar", "quux"]}.
126 *
127 * @param separatorMatcher a {@link CharMatcher} that determines whether a
128 * character is a separator
129 * @return a splitter, with default settings, that uses this matcher
130 */
131 public static Splitter on(final CharMatcher separatorMatcher) {
132 checkNotNull(separatorMatcher);
133
134 return new Splitter(new Strategy() {
135 @Override public SplittingIterator iterator(
136 Splitter splitter, final CharSequence toSplit) {
137 return new SplittingIterator(splitter, toSplit) {
138 @Override int separatorStart(int start) {
139 return separatorMatcher.indexIn(toSplit, start);
140 }
141
142 @Override int separatorEnd(int separatorPosition) {
143 return separatorPosition + 1;
144 }
145 };
146 }
147 });
148 }
149
150 /**
151 * Returns a splitter that uses the given fixed string as a separator. For
152 * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
153 * iterable containing {@code ["foo", "bar", "baz,qux"]}.
154 *
155 * @param separator the literal, nonempty string to recognize as a separator
156 * @return a splitter, with default settings, that recognizes that separator
157 */
158 public static Splitter on(final String separator) {
159 checkArgument(separator.length() != 0,
160 "The separator may not be the empty string.");
161
162 return new Splitter(new Strategy() {
163 @Override public SplittingIterator iterator(
164 Splitter splitter, CharSequence toSplit) {
165 return new SplittingIterator(splitter, toSplit) {
166 @Override public int separatorStart(int start) {
167 int delimeterLength = separator.length();
168
169 positions:
170 for (int p = start, last = toSplit.length() - delimeterLength;
171 p <= last; p++) {
172 for (int i = 0; i < delimeterLength; i++) {
173 if (toSplit.charAt(i + p) != separator.charAt(i)) {
174 continue positions;
175 }
176 }
177 return p;
178 }
179 return -1;
180 }
181
182 @Override public int separatorEnd(int separatorPosition) {
183 return separatorPosition + separator.length();
184 }
185 };
186 }
187 });
188 }
189
190 /**
191 * Returns a splitter that considers any subsequence matching {@code
192 * pattern} to be a separator. For example, {@code
193 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
194 * into lines whether it uses DOS-style or UNIX-style line terminators.
195 *
196 * @param separatorPattern the pattern that determines whether a subsequence
197 * is a separator. This pattern may not match the empty string.
198 * @return a splitter, with default settings, that uses this pattern
199 * @throws IllegalArgumentException if {@code separatorPattern} matches the
200 * empty string
201 */
202 @GwtIncompatible("java.util.regex")
203 public static Splitter on(final Pattern separatorPattern) {
204 checkNotNull(separatorPattern);
205 checkArgument(!separatorPattern.matcher("").matches(),
206 "The pattern may not match the empty string: %s", separatorPattern);
207
208 return new Splitter(new Strategy() {
209 @Override public SplittingIterator iterator(
210 final Splitter splitter, CharSequence toSplit) {
211 final Matcher matcher = separatorPattern.matcher(toSplit);
212 return new SplittingIterator(splitter, toSplit) {
213 @Override public int separatorStart(int start) {
214 return matcher.find(start) ? matcher.start() : -1;
215 }
216
217 @Override public int separatorEnd(int separatorPosition) {
218 return matcher.end();
219 }
220 };
221 }
222 });
223 }
224
225 /**
226 * Returns a splitter that considers any subsequence matching a given
227 * pattern (regular expression) to be a separator. For example, {@code
228 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
229 * whether it uses DOS-style or UNIX-style line terminators. This is
230 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
231 *
232 * @param separatorPattern the pattern that determines whether a subsequence
233 * is a separator. This pattern may not match the empty string.
234 * @return a splitter, with default settings, that uses this pattern
235 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
236 * is a malformed expression
237 * @throws IllegalArgumentException if {@code separatorPattern} matches the
238 * empty string
239 */
240 @GwtIncompatible("java.util.regex")
241 public static Splitter onPattern(String separatorPattern) {
242 return on(Pattern.compile(separatorPattern));
243 }
244
245 /**
246 * Returns a splitter that divides strings into pieces of the given length.
247 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
248 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
249 * smaller than {@code length} but will never be empty.
250 *
251 * @param length the desired length of pieces after splitting
252 * @return a splitter, with default settings, that can split into fixed sized
253 * pieces
254 */
255 public static Splitter fixedLength(final int length) {
256 checkArgument(length > 0, "The length may not be less than 1");
257
258 return new Splitter(new Strategy() {
259 @Override public SplittingIterator iterator(
260 final Splitter splitter, CharSequence toSplit) {
261 return new SplittingIterator(splitter, toSplit) {
262 @Override public int separatorStart(int start) {
263 int nextChunkStart = start + length;
264 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
265 }
266
267 @Override public int separatorEnd(int separatorPosition) {
268 return separatorPosition;
269 }
270 };
271 }
272 });
273 }
274
275 /**
276 * Returns a splitter that behaves equivalently to {@code this} splitter, but
277 * automatically omits empty strings from the results. For example, {@code
278 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
279 * iterable containing only {@code ["a", "b", "c"]}.
280 *
281 * <p>If either {@code trimResults} option is also specified when creating a
282 * splitter, that splitter always trims results first before checking for
283 * emptiness. So, for example, {@code
284 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
285 * an empty iterable.
286 *
287 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
288 * to return an empty iterable, but when using this option, it can (if the
289 * input sequence consists of nothing but separators).
290 *
291 * @return a splitter with the desired configuration
292 */
293 public Splitter omitEmptyStrings() {
294 return new Splitter(strategy, true, trimmer);
295 }
296
297 /**
298 * Returns a splitter that behaves equivalently to {@code this} splitter, but
299 * automatically removes leading and trailing {@linkplain
300 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
301 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
302 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
303 * containing {@code ["a", "b", "c"]}.
304 *
305 * @return a splitter with the desired configuration
306 */
307 public Splitter trimResults() {
308 return trimResults(CharMatcher.WHITESPACE);
309 }
310
311 /**
312 * Returns a splitter that behaves equivalently to {@code this} splitter, but
313 * removes all leading or trailing characters matching the given {@code
314 * CharMatcher} from each returned substring. For example, {@code
315 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
316 * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
317 *
318 * @param trimmer a {@link CharMatcher} that determines whether a character
319 * should be removed from the beginning/end of a subsequence
320 * @return a splitter with the desired configuration
321 */
322 // TODO: throw if a trimmer was already specified!
323 public Splitter trimResults(CharMatcher trimmer) {
324 checkNotNull(trimmer);
325 return new Splitter(strategy, omitEmptyStrings, trimmer);
326 }
327
328 /**
329 * Splits the {@link CharSequence} passed in parameter.
330 *
331 * @param sequence the sequence of characters to split
332 * @return an iteration over the segments split from the parameter.
333 */
334 public Iterable<String> split(final CharSequence sequence) {
335 checkNotNull(sequence);
336
337 return new Iterable<String>() {
338 @Override public Iterator<String> iterator() {
339 return strategy.iterator(Splitter.this, sequence);
340 }
341 };
342 }
343
344 private interface Strategy {
345 Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
346 }
347
348 private abstract static class SplittingIterator
349 extends AbstractIterator<String> {
350 final CharSequence toSplit;
351 final CharMatcher trimmer;
352 final boolean omitEmptyStrings;
353
354 /**
355 * Returns the first index in {@code toSplit} at or after {@code start}
356 * that contains the separator.
357 */
358 abstract int separatorStart(int start);
359
360 /**
361 * Returns the first index in {@code toSplit} after {@code
362 * separatorPosition} that does not contain a separator. This method is only
363 * invoked after a call to {@code separatorStart}.
364 */
365 abstract int separatorEnd(int separatorPosition);
366
367 int offset = 0;
368
369 protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
370 this.trimmer = splitter.trimmer;
371 this.omitEmptyStrings = splitter.omitEmptyStrings;
372 this.toSplit = toSplit;
373 }
374
375 @Override protected String computeNext() {
376 while (offset != -1) {
377 int start = offset;
378 int end;
379
380 int separatorPosition = separatorStart(offset);
381 if (separatorPosition == -1) {
382 end = toSplit.length();
383 offset = -1;
384 } else {
385 end = separatorPosition;
386 offset = separatorEnd(separatorPosition);
387 }
388
389 while (start < end && trimmer.matches(toSplit.charAt(start))) {
390 start++;
391 }
392 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
393 end--;
394 }
395
396 if (omitEmptyStrings && start == end) {
397 continue;
398 }
399
400 return toSplit.subSequence(start, end).toString();
401 }
402 return endOfData();
403 }
404 }
405
406 /*
407 * Copied from common.collect.AbstractIterator. TODO: un-fork once these
408 * packages have been combined into a single library.
409 */
410 private static abstract class AbstractIterator<T> implements Iterator<T> {
411 State state = State.NOT_READY;
412
413 enum State {
414 READY, NOT_READY, DONE, FAILED,
415 }
416
417 T next;
418
419 protected abstract T computeNext();
420
421 protected final T endOfData() {
422 state = State.DONE;
423 return null;
424 }
425
426 public final boolean hasNext() {
427 checkState(state != State.FAILED);
428 switch (state) {
429 case DONE:
430 return false;
431 case READY:
432 return true;
433 default:
434 }
435 return tryToComputeNext();
436 }
437
438 boolean tryToComputeNext() {
439 state = State.FAILED; // temporary pessimism
440 next = computeNext();
441 if (state != State.DONE) {
442 state = State.READY;
443 return true;
444 }
445 return false;
446 }
447
448 public final T next() {
449 if (!hasNext()) {
450 throw new NoSuchElementException();
451 }
452 state = State.NOT_READY;
453 return next;
454 }
455
456 @Override public void remove() {
457 throw new UnsupportedOperationException();
458 }
459 }
460 }