001    /*
002     * Copyright (C) 2009 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.base;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    import static com.google.common.base.Preconditions.checkState;
022    
023    import com.google.common.annotations.Beta;
024    import com.google.common.annotations.GwtCompatible;
025    import com.google.common.annotations.GwtIncompatible;
026    
027    import java.util.Collections;
028    import java.util.Iterator;
029    import java.util.LinkedHashMap;
030    import java.util.Map;
031    import java.util.NoSuchElementException;
032    import java.util.regex.Matcher;
033    import java.util.regex.Pattern;
034    
035    import javax.annotation.CheckReturnValue;
036    
037    /**
038     * An object that divides strings (or other instances of {@code CharSequence})
039     * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
040     * which can be expressed as a single character, literal string, regular
041     * expression, {@code CharMatcher}, or by using a fixed substring length. This
042     * class provides the complementary functionality to {@link Joiner}.
043     *
044     * <p>Here is the most basic example of {@code Splitter} usage: <pre>   {@code
045     *
046     *   Splitter.on(',').split("foo,bar")}</pre>
047     *
048     * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
049     * and {@code "bar"}, in that order.
050     *
051     * <p>By default {@code Splitter}'s behavior is very simplistic: <pre>   {@code
052     *
053     *   Splitter.on(',').split("foo,,bar, quux")}</pre>
054     *
055     * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
056     * Notice that the splitter does not assume that you want empty strings removed,
057     * or that you wish to trim whitespace. If you want features like these, simply
058     * ask for them: <pre> {@code
059     *
060     *   private static final Splitter MY_SPLITTER = Splitter.on(',')
061     *       .trimResults()
062     *       .omitEmptyStrings();}</pre>
063     *
064     * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
065     * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
066     * the configuration methods are called is never significant; for instance,
067     * trimming is always applied first before checking for an empty result,
068     * regardless of the order in which the {@link #trimResults()} and
069     * {@link #omitEmptyStrings()} methods were invoked.
070     *
071     * <p><b>Warning: splitter instances are always immutable</b>; a configuration
072     * method such as {@code omitEmptyStrings} has no effect on the instance it
073     * is invoked on! You must store and use the new splitter instance returned by
074     * the method. This makes splitters thread-safe, and safe to store as {@code
075     * static final} constants (as illustrated above). <pre>   {@code
076     *
077     *   // Bad! Do not do this!
078     *   Splitter splitter = Splitter.on('/');
079     *   splitter.trimResults(); // does nothing!
080     *   return splitter.split("wrong / wrong / wrong");}</pre>
081     *
082     * The separator recognized by the splitter does not have to be a single
083     * literal character as in the examples above. See the methods {@link
084     * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
085     * of other ways to specify separators.
086     *
087     * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
088     * similar JDK methods; for instance, it does not silently discard trailing
089     * separators, as does {@link String#split(String)}, nor does it have a default
090     * behavior of using five particular whitespace characters as separators, like
091     * {@link java.util.StringTokenizer}.
092     *
093     * @author Julien Silland
094     * @author Jesse Wilson
095     * @author Kevin Bourrillion
096     * @author Louis Wasserman
097     * @since 1.0
098     */
099    @GwtCompatible(emulated = true)
100    public final class Splitter {
101      private final CharMatcher trimmer;
102      private final boolean omitEmptyStrings;
103      private final Strategy strategy;
104      private final int limit;
105    
106      private Splitter(Strategy strategy) {
107        this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
108      }
109    
110      private Splitter(Strategy strategy, boolean omitEmptyStrings,
111          CharMatcher trimmer, int limit) {
112        this.strategy = strategy;
113        this.omitEmptyStrings = omitEmptyStrings;
114        this.trimmer = trimmer;
115        this.limit = limit;
116      }
117    
118      /**
119       * Returns a splitter that uses the given single-character separator. For
120       * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
121       * containing {@code ["foo", "", "bar"]}.
122       *
123       * @param separator the character to recognize as a separator
124       * @return a splitter, with default settings, that recognizes that separator
125       */
126      public static Splitter on(char separator) {
127        return on(CharMatcher.is(separator));
128      }
129    
130      /**
131       * Returns a splitter that considers any single character matched by the
132       * given {@code CharMatcher} to be a separator. For example, {@code
133       * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
134       * iterable containing {@code ["foo", "", "bar", "quux"]}.
135       *
136       * @param separatorMatcher a {@link CharMatcher} that determines whether a
137       *     character is a separator
138       * @return a splitter, with default settings, that uses this matcher
139       */
140      public static Splitter on(final CharMatcher separatorMatcher) {
141        checkNotNull(separatorMatcher);
142    
143        return new Splitter(new Strategy() {
144          @Override public SplittingIterator iterator(
145              Splitter splitter, final CharSequence toSplit) {
146            return new SplittingIterator(splitter, toSplit) {
147              @Override int separatorStart(int start) {
148                return separatorMatcher.indexIn(toSplit, start);
149              }
150    
151              @Override int separatorEnd(int separatorPosition) {
152                return separatorPosition + 1;
153              }
154            };
155          }
156        });
157      }
158    
159      /**
160       * Returns a splitter that uses the given fixed string as a separator. For
161       * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
162       * iterable containing {@code ["foo", "bar", "baz,qux"]}.
163       *
164       * @param separator the literal, nonempty string to recognize as a separator
165       * @return a splitter, with default settings, that recognizes that separator
166       */
167      public static Splitter on(final String separator) {
168        checkArgument(separator.length() != 0,
169            "The separator may not be the empty string.");
170    
171        return new Splitter(new Strategy() {
172          @Override public SplittingIterator iterator(
173              Splitter splitter, CharSequence toSplit) {
174            return new SplittingIterator(splitter, toSplit) {
175              @Override public int separatorStart(int start) {
176                int delimeterLength = separator.length();
177    
178                positions:
179                for (int p = start, last = toSplit.length() - delimeterLength;
180                    p <= last; p++) {
181                  for (int i = 0; i < delimeterLength; i++) {
182                    if (toSplit.charAt(i + p) != separator.charAt(i)) {
183                      continue positions;
184                    }
185                  }
186                  return p;
187                }
188                return -1;
189              }
190    
191              @Override public int separatorEnd(int separatorPosition) {
192                return separatorPosition + separator.length();
193              }
194            };
195          }
196        });
197      }
198    
199      /**
200       * Returns a splitter that considers any subsequence matching {@code
201       * pattern} to be a separator. For example, {@code
202       * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
203       * into lines whether it uses DOS-style or UNIX-style line terminators.
204       *
205       * @param separatorPattern the pattern that determines whether a subsequence
206       *     is a separator. This pattern may not match the empty string.
207       * @return a splitter, with default settings, that uses this pattern
208       * @throws IllegalArgumentException if {@code separatorPattern} matches the
209       *     empty string
210       */
211      @GwtIncompatible("java.util.regex")
212      public static Splitter on(final Pattern separatorPattern) {
213        checkNotNull(separatorPattern);
214        checkArgument(!separatorPattern.matcher("").matches(),
215            "The pattern may not match the empty string: %s", separatorPattern);
216    
217        return new Splitter(new Strategy() {
218          @Override public SplittingIterator iterator(
219              final Splitter splitter, CharSequence toSplit) {
220            final Matcher matcher = separatorPattern.matcher(toSplit);
221            return new SplittingIterator(splitter, toSplit) {
222              @Override public int separatorStart(int start) {
223                return matcher.find(start) ? matcher.start() : -1;
224              }
225    
226              @Override public int separatorEnd(int separatorPosition) {
227                return matcher.end();
228              }
229            };
230          }
231        });
232      }
233    
234      /**
235       * Returns a splitter that considers any subsequence matching a given
236       * pattern (regular expression) to be a separator. For example, {@code
237       * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
238       * whether it uses DOS-style or UNIX-style line terminators. This is
239       * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
240       *
241       * @param separatorPattern the pattern that determines whether a subsequence
242       *     is a separator. This pattern may not match the empty string.
243       * @return a splitter, with default settings, that uses this pattern
244       * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
245       *     is a malformed expression
246       * @throws IllegalArgumentException if {@code separatorPattern} matches the
247       *     empty string
248       */
249      @GwtIncompatible("java.util.regex")
250      public static Splitter onPattern(String separatorPattern) {
251        return on(Pattern.compile(separatorPattern));
252      }
253    
254      /**
255       * Returns a splitter that divides strings into pieces of the given length.
256       * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
257       * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
258       * smaller than {@code length} but will never be empty.
259       *
260       * @param length the desired length of pieces after splitting
261       * @return a splitter, with default settings, that can split into fixed sized
262       *     pieces
263       */
264      public static Splitter fixedLength(final int length) {
265        checkArgument(length > 0, "The length may not be less than 1");
266    
267        return new Splitter(new Strategy() {
268          @Override public SplittingIterator iterator(
269              final Splitter splitter, CharSequence toSplit) {
270            return new SplittingIterator(splitter, toSplit) {
271              @Override public int separatorStart(int start) {
272                int nextChunkStart = start + length;
273                return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
274              }
275    
276              @Override public int separatorEnd(int separatorPosition) {
277                return separatorPosition;
278              }
279            };
280          }
281        });
282      }
283    
284      /**
285       * Returns a splitter that behaves equivalently to {@code this} splitter, but
286       * automatically omits empty strings from the results. For example, {@code
287       * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
288       * iterable containing only {@code ["a", "b", "c"]}.
289       *
290       * <p>If either {@code trimResults} option is also specified when creating a
291       * splitter, that splitter always trims results first before checking for
292       * emptiness. So, for example, {@code
293       * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
294       * an empty iterable.
295       *
296       * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
297       * to return an empty iterable, but when using this option, it can (if the
298       * input sequence consists of nothing but separators).
299       *
300       * @return a splitter with the desired configuration
301       */
302      @CheckReturnValue
303      public Splitter omitEmptyStrings() {
304        return new Splitter(strategy, true, trimmer, limit);
305      }
306    
307      /**
308       * Returns a splitter that behaves equivalently to {@code this} splitter but
309       * stops splitting after it reaches the limit.
310       * The limit defines the maximum number of items returned by the iterator.
311       *
312       * <p>For example,
313       * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
314       * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
315       * omitted strings do no count.  Hence,
316       * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
317       * returns an iterable containing {@code ["a", "b", "c,d"}.
318       * When trim is requested, all entries, including the last are trimmed.  Hence
319       * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
320       * results in @{code ["a", "b", "c , d"]}.
321       *
322       * @param limit the maximum number of items returns
323       * @return a splitter with the desired configuration
324       * @since 9.0
325       */
326      @CheckReturnValue
327      public Splitter limit(int limit) {
328        checkArgument(limit > 0, "must be greater then zero: %s", limit);
329        return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
330      }
331    
332      /**
333       * Returns a splitter that behaves equivalently to {@code this} splitter, but
334       * automatically removes leading and trailing {@linkplain
335       * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
336       * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
337       * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
338       * containing {@code ["a", "b", "c"]}.
339       *
340       * @return a splitter with the desired configuration
341       */
342      @CheckReturnValue
343      public Splitter trimResults() {
344        return trimResults(CharMatcher.WHITESPACE);
345      }
346    
347      /**
348       * Returns a splitter that behaves equivalently to {@code this} splitter, but
349       * removes all leading or trailing characters matching the given {@code
350       * CharMatcher} from each returned substring. For example, {@code
351       * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
352       * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
353       *
354       * @param trimmer a {@link CharMatcher} that determines whether a character
355       *     should be removed from the beginning/end of a subsequence
356       * @return a splitter with the desired configuration
357       */
358      // TODO(kevinb): throw if a trimmer was already specified!
359      @CheckReturnValue
360      public Splitter trimResults(CharMatcher trimmer) {
361        checkNotNull(trimmer);
362        return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
363      }
364    
365      /**
366       * Splits {@code sequence} into string components and makes them available
367       * through an {@link Iterator}, which may be lazily evaluated.
368       *
369       * @param sequence the sequence of characters to split
370       * @return an iteration over the segments split from the parameter.
371       */
372      public Iterable<String> split(final CharSequence sequence) {
373        checkNotNull(sequence);
374    
375        return new Iterable<String>() {
376          @Override public Iterator<String> iterator() {
377            return spliterator(sequence);
378          }
379        };
380      }
381    
382      private Iterator<String> spliterator(CharSequence sequence) {
383        return strategy.iterator(this, sequence);
384      }
385    
386      /**
387       * Returns a {@code MapSplitter} which splits entries based on this splitter,
388       * and splits entries into keys and values using the specified separator.
389       *
390       * @since 10.0
391       */
392      @CheckReturnValue
393      @Beta
394      public MapSplitter withKeyValueSeparator(String separator) {
395        return withKeyValueSeparator(on(separator));
396      }
397    
398      /**
399       * Returns a {@code MapSplitter} which splits entries based on this splitter,
400       * and splits entries into keys and values using the specified key-value
401       * splitter.
402       *
403       * @since 10.0
404       */
405      @CheckReturnValue
406      @Beta
407      public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
408        return new MapSplitter(this, keyValueSplitter);
409      }
410    
411      /**
412       * An object that splits strings into maps as {@code Splitter} splits
413       * iterables and lists. Like {@code Splitter}, it is thread-safe and
414       * immutable.
415       *
416       * @since 10.0
417       */
418      @Beta
419      public static final class MapSplitter {
420        private static final String INVALID_ENTRY_MESSAGE =
421            "Chunk [%s] is not a valid entry";
422        private final Splitter outerSplitter;
423        private final Splitter entrySplitter;
424    
425        private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
426          this.outerSplitter = outerSplitter; // only "this" is passed
427          this.entrySplitter = checkNotNull(entrySplitter);
428        }
429    
430        /**
431         * Splits {@code sequence} into substrings, splits each substring into
432         * an entry, and returns an unmodifiable map with each of the entries. For
433         * example, <code>
434         * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
435         * .split("a=>b ; c=>b")
436         * </code> will return a mapping from {@code "a"} to {@code "b"} and
437         * {@code "c"} to {@code b}.
438         *
439         * <p>The returned map preserves the order of the entries from
440         * {@code sequence}.
441         *
442         * @throws IllegalArgumentException if the specified sequence does not split
443         *         into valid map entries, or if there are duplicate keys
444         */
445        public Map<String, String> split(CharSequence sequence) {
446          Map<String, String> map = new LinkedHashMap<String, String>();
447          for (String entry : outerSplitter.split(sequence)) {
448            Iterator<String> entryFields = entrySplitter.spliterator(entry);
449    
450            checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
451            String key = entryFields.next();
452            checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
453    
454            checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
455            String value = entryFields.next();
456            map.put(key, value);
457    
458            checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
459          }
460          return Collections.unmodifiableMap(map);
461        }
462      }
463    
464      private interface Strategy {
465        Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
466      }
467    
468      private abstract static class SplittingIterator
469          extends AbstractIterator<String> {
470        final CharSequence toSplit;
471        final CharMatcher trimmer;
472        final boolean omitEmptyStrings;
473    
474        /**
475         * Returns the first index in {@code toSplit} at or after {@code start}
476         * that contains the separator.
477         */
478        abstract int separatorStart(int start);
479    
480        /**
481         * Returns the first index in {@code toSplit} after {@code
482         * separatorPosition} that does not contain a separator. This method is only
483         * invoked after a call to {@code separatorStart}.
484         */
485        abstract int separatorEnd(int separatorPosition);
486    
487        int offset = 0;
488        int limit;
489    
490        protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
491          this.trimmer = splitter.trimmer;
492          this.omitEmptyStrings = splitter.omitEmptyStrings;
493          this.limit = splitter.limit;
494          this.toSplit = toSplit;
495        }
496    
497        @Override protected String computeNext() {
498          while (offset != -1) {
499            int start = offset;
500            int end;
501    
502            int separatorPosition = separatorStart(offset);
503            if (separatorPosition == -1) {
504              end = toSplit.length();
505              offset = -1;
506            } else {
507              end = separatorPosition;
508              offset = separatorEnd(separatorPosition);
509            }
510    
511            while (start < end && trimmer.matches(toSplit.charAt(start))) {
512              start++;
513            }
514            while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
515              end--;
516            }
517    
518            if (omitEmptyStrings && start == end) {
519              continue;
520            }
521    
522            if (limit == 1) {
523              // The limit has been reached, return the rest of the string as the
524              // final item.  This is tested after empty string removal so that
525              // empty strings do not count towards the limit.
526              end = toSplit.length();
527              offset = -1;
528              // Since we may have changed the end, we need to trim it again.
529              while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
530                end--;
531              }
532            } else {
533              limit--;
534            }
535    
536            return toSplit.subSequence(start, end).toString();
537          }
538          return endOfData();
539        }
540      }
541    
542      /*
543       * Copied from common.collect.AbstractIterator. TODO(kevinb): un-fork if these
544       * packages are ever combined into a single library.
545       */
546      private abstract static class AbstractIterator<T> implements Iterator<T> {
547        State state = State.NOT_READY;
548    
549        enum State {
550          READY, NOT_READY, DONE, FAILED,
551        }
552    
553        T next;
554    
555        protected abstract T computeNext();
556    
557        protected final T endOfData() {
558          state = State.DONE;
559          return null;
560        }
561    
562        @Override
563        public final boolean hasNext() {
564          checkState(state != State.FAILED);
565          switch (state) {
566            case DONE:
567              return false;
568            case READY:
569              return true;
570            default:
571          }
572          return tryToComputeNext();
573        }
574    
575        boolean tryToComputeNext() {
576          state = State.FAILED; // temporary pessimism
577          next = computeNext();
578          if (state != State.DONE) {
579            state = State.READY;
580            return true;
581          }
582          return false;
583        }
584    
585        @Override
586        public final T next() {
587          if (!hasNext()) {
588            throw new NoSuchElementException();
589          }
590          state = State.NOT_READY;
591          return next;
592        }
593    
594        @Override public void remove() {
595          throw new UnsupportedOperationException();
596        }
597      }
598    }