001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.xml;
016
017import com.google.common.annotations.GwtCompatible;
018import com.google.common.escape.Escaper;
019import com.google.common.escape.Escapers;
020
021/**
022 * {@code Escaper} instances suitable for strings to be included in XML attribute values and
023 * elements' text contents. When possible, avoid manual escaping by using templating systems and
024 * high-level APIs that provide autoescaping. For example, consider <a
025 * href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>.
026 *
027 * <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters
028 * outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape
029 * non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal
030 * level of escaping to ensure that the output can be safely included in a Unicode XML document.
031 *
032 * <p>For details on the behavior of the escapers in this class, see sections <a
033 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a
034 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
035 *
036 * @author Alex Matevossian
037 * @author David Beaumont
038 * @since 15.0
039 */
040@GwtCompatible
041public class XmlEscapers {
042  private XmlEscapers() {}
043
044  private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
045  private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
046
047  // For each xxxEscaper() method, please add links to external reference pages
048  // that are considered authoritative for the behavior of that escaper.
049
050  /**
051   * Returns an {@link Escaper} instance that escapes special characters in a string so it can
052   * safely be included in an XML document as element content. See section <a
053   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
054   *
055   * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this
056   * escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in
057   * element content or {@link #xmlAttributeEscaper} in attribute values.
058   *
059   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
060   * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
061   * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
062   * the XML specification.
063   *
064   * <p>This escaper does not escape non-ASCII characters to their numeric character references
065   * (NCR). Any non-ASCII characters appearing in the input will be preserved in the output.
066   * Specifically "\r" (carriage return) is preserved in the output, which may result in it being
067   * silently converted to "\n" when the XML is parsed.
068   *
069   * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
070   * validation on its input.
071   */
072  public static Escaper xmlContentEscaper() {
073    return XML_CONTENT_ESCAPER;
074  }
075
076  /**
077   * Returns an {@link Escaper} instance that escapes special characters in a string so it can
078   * safely be included in XML document as an attribute value. See section <a
079   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML
080   * specification.
081   *
082   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
083   * character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
084   * detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
085   * the XML specification.
086   *
087   * <p>This escaper does not escape non-ASCII characters to their numeric character references
088   * (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code
089   * '\r'} are escaped to a corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
090   * respectively. Any other non-ASCII characters appearing in the input will be preserved in the
091   * output.
092   *
093   * <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
094   * validation on its input.
095   */
096  public static Escaper xmlAttributeEscaper() {
097    return XML_ATTRIBUTE_ESCAPER;
098  }
099
100  private static final Escaper XML_ESCAPER;
101  private static final Escaper XML_CONTENT_ESCAPER;
102  private static final Escaper XML_ATTRIBUTE_ESCAPER;
103
104  static {
105    Escapers.Builder builder = Escapers.builder();
106    // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
107    // (Unicode code points above \uFFFF are represented via surrogate pairs
108    // which means they are treated as pairs of safe characters).
109    builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
110    // Unsafe characters are replaced with the Unicode replacement character.
111    builder.setUnsafeReplacement("\uFFFD");
112
113    /*
114     * Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode
115     * replacement character.
116     *
117     * Implementation note: An alternative to the following would be to make a map that simply
118     * replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe
119     * character to 0x20. However this would slow down the escaping of simple strings that contain
120     * \t, \n, or \r.
121     */
122    for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
123      if (c != '\t' && c != '\n' && c != '\r') {
124        builder.addEscape(c, "\uFFFD");
125      }
126    }
127
128    // Build the content escaper first and then add quote escaping for the
129    // general escaper.
130    builder.addEscape('&', "&amp;");
131    builder.addEscape('<', "&lt;");
132    builder.addEscape('>', "&gt;");
133    XML_CONTENT_ESCAPER = builder.build();
134    builder.addEscape('\'', "&apos;");
135    builder.addEscape('"', "&quot;");
136    XML_ESCAPER = builder.build();
137    builder.addEscape('\t', "&#x9;");
138    builder.addEscape('\n', "&#xA;");
139    builder.addEscape('\r', "&#xD;");
140    XML_ATTRIBUTE_ESCAPER = builder.build();
141  }
142}