/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.common.xml;

import com.google.common.annotations.GwtCompatible;
import com.google.common.escape.Escaper;
import com.google.common.escape.Escapers;

021/**
022 * {@code Escaper} instances suitable for strings to be included in XML attribute values and
023 * elements' text contents. When possible, avoid manual escaping by using templating systems and
024 * high-level APIs that provide autoescaping. For example, consider <a
025 * href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>.
026 *
027 * <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters
028 * outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape
029 * non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal
030 * level of escaping to ensure that the output can be safely included in a Unicode XML document.
031 *
032 * <p>For details on the behavior of the escapers in this class, see sections <a
033 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a
034 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
035 *
036 * @author Alex Matevossian
037 * @author David Beaumont
038 * @since 15.0
039 */
040@GwtCompatible
041@ElementTypesAreNonnullByDefault
042public class XmlEscapers {
043  private XmlEscapers() {}
044
045  private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
046  private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
047
048  // For each xxxEscaper() method, please add links to external reference pages
049  // that are considered authoritative for the behavior of that escaper.
050
051  /**
052   * Returns an {@link Escaper} instance that escapes special characters in a string so it can
053   * safely be included in an XML document as element content. See section <a
054   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
055   *
056   * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this
057   * escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in
058   * element content or {@link #xmlAttributeEscaper} in attribute values.
059   *
060   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
061   * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
062   * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
063   * the XML specification.
064   *
065   * <p>This escaper does not escape non-ASCII characters to their numeric character references
066   * (NCR). Any non-ASCII characters appearing in the input will be preserved in the output.
067   * Specifically "\r" (carriage return) is preserved in the output, which may result in it being
068   * silently converted to "\n" when the XML is parsed.
069   *
070   * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
071   * validation on its input.
072   */
073  public static Escaper xmlContentEscaper() {
074    return XML_CONTENT_ESCAPER;
075  }
076
077  /**
078   * Returns an {@link Escaper} instance that escapes special characters in a string so it can
079   * safely be included in XML document as an attribute value. See section <a
080   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML
081   * specification.
082   *
083   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
084   * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
085   * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
086   * the XML specification.
087   *
088   * <p>This escaper does not escape non-ASCII characters to their numeric character references
089   * (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code
090   * '\r'} are escaped to a corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
091   * respectively. Any other non-ASCII characters appearing in the input will be preserved in the
092   * output.
093   *
094   * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
095   * validation on its input.
096   */
097  public static Escaper xmlAttributeEscaper() {
098    return XML_ATTRIBUTE_ESCAPER;
099  }
100
101  private static final Escaper XML_ESCAPER;
102  private static final Escaper XML_CONTENT_ESCAPER;
103  private static final Escaper XML_ATTRIBUTE_ESCAPER;
104
105  static {
106    Escapers.Builder builder = Escapers.builder();
107    // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
108    // (Unicode code points above \uFFFF are represented via surrogate pairs
109    // which means they are treated as pairs of safe characters).
110    builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
111    // Unsafe characters are replaced with the Unicode replacement character.
112    builder.setUnsafeReplacement("\uFFFD");
113
114    /*
115     * Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode
116     * replacement character.
117     *
118     * Implementation note: An alternative to the following would be to make a map that simply
119     * replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe
120     * character to 0x20. However this would slow down the escaping of simple strings that contain
121     * \t, \n, or \r.
122     */
123    for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
124      if (c != '\t' && c != '\n' && c != '\r') {
125        builder.addEscape(c, "\uFFFD");
126      }
127    }
128
129    // Build the content escaper first and then add quote escaping for the
130    // general escaper.
131    builder.addEscape('&', "&amp;");
132    builder.addEscape('<', "&lt;");
133    builder.addEscape('>', "&gt;");
134    XML_CONTENT_ESCAPER = builder.build();
135    builder.addEscape('\'', "&apos;");
136    builder.addEscape('"', "&quot;");
137    XML_ESCAPER = builder.build();
138    builder.addEscape('\t', "&#x9;");
139    builder.addEscape('\n', "&#xA;");
140    builder.addEscape('\r', "&#xD;");
141    XML_ATTRIBUTE_ESCAPER = builder.build();
142  }
143}