001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.escape;
016
017import com.google.common.annotations.GwtCompatible;
018import com.google.common.base.Function;
019
020/**
021 * An object that converts literal text into a format safe for inclusion in a particular context
022 * (such as an XML document). Typically (but not always), the inverse process of "unescaping" the
023 * text is performed automatically by the relevant parser.
024 *
025 * <p>For example, an XML escaper would convert the literal string {@code "Foo<Bar>"} into {@code
026 * "Foo&lt;Bar&gt;"} to prevent {@code "<Bar>"} from being confused with an XML tag. When the
027 * resulting XML document is parsed, the parser API will return this text as the original literal
028 * string {@code "Foo<Bar>"}.
029 *
030 * <p>An {@code Escaper} instance is required to be stateless, and safe when used concurrently by
031 * multiple threads.
032 *
033 * <p>Because, in general, escaping operates on the code points of a string and not on its
034 * individual {@code char} values, it is not safe to assume that {@code escape(s)} is equivalent to
035 * {@code escape(s.substring(0, n)) + escape(s.substring(n))} for arbitrary {@code n}. This is
036 * because of the possibility of splitting a surrogate pair. The only case in which it is safe to
037 * escape strings and concatenate the results is if you can rule out this possibility, either by
038 * splitting an existing long string into short strings adaptively around
039 * {@linkplain Character#isHighSurrogate surrogate} {@linkplain Character#isLowSurrogate pairs}, or
040 * by starting with short strings already known to be free of unpaired surrogates.
041 *
042 * <p>The two primary implementations of this interface are {@link CharEscaper} and
043 * {@link UnicodeEscaper}. They are heavily optimized for performance and greatly simplify the task
044 * of implementing new escapers. It is strongly recommended that when implementing a new escaper you
045 * extend one of these classes. If you find that you are unable to achieve the desired behavior
046 * using either of these classes, please contact the Java libraries team for advice.
047 *
048 * <p>Popular escapers are defined as constants in classes like
049 * {@link com.google.common.html.HtmlEscapers} and {@link com.google.common.xml.XmlEscapers}. To
050 * create your own escapers, use {@link CharEscaperBuilder}, or extend {@code CharEscaper} or
051 * {@code UnicodeEscaper}.
052 *
053 * @author David Beaumont
054 * @since 15.0
055 */
056@GwtCompatible
057public abstract class Escaper {
058  // TODO(user): evaluate custom implementations, considering package private constructor.
059  /** Constructor for use by subclasses. */
060  protected Escaper() {}
061
062  /**
063   * Returns the escaped form of a given literal string.
064   *
065   * <p>Note that this method may treat input characters differently depending on the specific
066   * escaper implementation.
067   *
068   * <ul>
069   * <li>{@link UnicodeEscaper} handles <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>
070   * correctly, including surrogate character pairs. If the input is badly formed the escaper should
071   * throw {@link IllegalArgumentException}.
072   * <li>{@link CharEscaper} handles Java characters independently and does not verify the input for
073   * well formed characters. A {@code CharEscaper} should not be used in situations where input is
074   * not guaranteed to be restricted to the Basic Multilingual Plane (BMP).
075   * </ul>
076   *
077   * @param string the literal string to be escaped
078   * @return the escaped form of {@code string}
079   * @throws NullPointerException if {@code string} is null
080   * @throws IllegalArgumentException if {@code string} contains badly formed UTF-16 or cannot be
081   *     escaped for any other reason
082   */
083  public abstract String escape(String string);
084
085  private final Function<String, String> asFunction =
086      new Function<String, String>() {
087        @Override
088        public String apply(String from) {
089          return escape(from);
090        }
091      };
092
093  /**
094   * Returns a {@link Function} that invokes {@link #escape(String)} on this escaper.
095   */
096  public final Function<String, String> asFunction() {
097    return asFunction;
098  }
099}