001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.xml;
016
017import com.google.common.annotations.Beta;
018import com.google.common.annotations.GwtCompatible;
019import com.google.common.escape.Escaper;
020import com.google.common.escape.Escapers;
021
022/**
023 * {@code Escaper} instances suitable for strings to be included in XML attribute values and
024 * elements' text contents. When possible, avoid manual escaping by using templating systems and
025 * high-level APIs that provide autoescaping. For example, consider <a
026 * href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>.
027 *
028 * <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters
029 * outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape
030 * non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal
031 * level of escaping to ensure that the output can be safely included in a Unicode XML document.
032 *
033 * <p>For details on the behavior of the escapers in this class, see sections <a
034 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a
035 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
036 *
037 * @author Alex Matevossian
038 * @author David Beaumont
039 * @since 15.0
040 */
041@Beta
042@GwtCompatible
043@ElementTypesAreNonnullByDefault
044public class XmlEscapers {
045  private XmlEscapers() {}
046
047  private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
048  private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
049
050  // For each xxxEscaper() method, please add links to external reference pages
051  // that are considered authoritative for the behavior of that escaper.
052
053  /**
054   * Returns an {@link Escaper} instance that escapes special characters in a string so it can
055   * safely be included in an XML document as element content. See section <a
056   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
057   *
058   * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this
059   * escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in
060   * element content or {@link #xmlAttributeEscaper} in attribute values.
061   *
062   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
063   * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
064   * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
065   * the XML specification.
066   *
067   * <p>This escaper does not escape non-ASCII characters to their numeric character references
068   * (NCR). Any non-ASCII characters appearing in the input will be preserved in the output.
069   * Specifically "\r" (carriage return) is preserved in the output, which may result in it being
070   * silently converted to "\n" when the XML is parsed.
071   *
072   * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
073   * validation on its input.
074   */
075  public static Escaper xmlContentEscaper() {
076    return XML_CONTENT_ESCAPER;
077  }
078
079  /**
080   * Returns an {@link Escaper} instance that escapes special characters in a string so it can
081   * safely be included in XML document as an attribute value. See section <a
082   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML
083   * specification.
084   *
085   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
086   * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
087   * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
088   * the XML specification.
089   *
090   * <p>This escaper does not escape non-ASCII characters to their numeric character references
091   * (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code
092   * '\r'} are escaped to a corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
093   * respectively. Any other non-ASCII characters appearing in the input will be preserved in the
094   * output.
095   *
096   * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
097   * validation on its input.
098   */
099  public static Escaper xmlAttributeEscaper() {
100    return XML_ATTRIBUTE_ESCAPER;
101  }
102
103  private static final Escaper XML_ESCAPER;
104  private static final Escaper XML_CONTENT_ESCAPER;
105  private static final Escaper XML_ATTRIBUTE_ESCAPER;
106
107  static {
108    Escapers.Builder builder = Escapers.builder();
109    // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
110    // (Unicode code points above \uFFFF are represented via surrogate pairs
111    // which means they are treated as pairs of safe characters).
112    builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
113    // Unsafe characters are replaced with the Unicode replacement character.
114    builder.setUnsafeReplacement("\uFFFD");
115
116    /*
117     * Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode
118     * replacement character.
119     *
120     * Implementation note: An alternative to the following would be to make a map that simply
121     * replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe
122     * character to 0x20. However this would slow down the escaping of simple strings that contain
123     * \t, \n, or \r.
124     */
125    for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
126      if (c != '\t' && c != '\n' && c != '\r') {
127        builder.addEscape(c, "\uFFFD");
128      }
129    }
130
131    // Build the content escaper first and then add quote escaping for the
132    // general escaper.
133    builder.addEscape('&', "&amp;");
134    builder.addEscape('<', "&lt;");
135    builder.addEscape('>', "&gt;");
136    XML_CONTENT_ESCAPER = builder.build();
137    builder.addEscape('\'', "&apos;");
138    builder.addEscape('"', "&quot;");
139    XML_ESCAPER = builder.build();
140    builder.addEscape('\t', "&#x9;");
141    builder.addEscape('\n', "&#xA;");
142    builder.addEscape('\r', "&#xD;");
143    XML_ATTRIBUTE_ESCAPER = builder.build();
144  }
145}