001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.xml;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.GwtCompatible;
021import com.google.common.escape.Escaper;
022import com.google.common.escape.Escapers;
023
024/**
025 * {@code Escaper} instances suitable for strings to be included in XML
026 * attribute values and elements' text contents. When possible, avoid manual
027 * escaping by using templating systems and high-level APIs that provide
028 * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or
029 * <a href="http://www.jdom.org/">JDOM</a>.
030 *
031 * <p><b>Note:</b> Currently the escapers provided by this class do not escape
032 * any characters outside the ASCII character range. Unlike HTML escaping the
033 * XML escapers will not escape non-ASCII characters to their numeric entity
034 * replacements. These XML escapers provide the minimal level of escaping to
035 * ensure that the output can be safely included in a Unicode XML document.
036 *
037 *
038 * <p>For details on the behavior of the escapers in this class, see sections
039 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and
040 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
041 * XML specification.
042 *
043 * @author Alex Matevossian
044 * @author David Beaumont
045 * @since 15.0
046 */
047@Beta
048@GwtCompatible
049public class XmlEscapers {
050  private XmlEscapers() {}
051
052  private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
053  private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
054
055  // For each xxxEscaper() method, please add links to external reference pages
056  // that are considered authoritative for the behavior of that escaper.
057
058  /**
059   * Returns an {@link Escaper} instance that escapes special characters in a
060   * string so it can safely be included in an XML document as element content.
061   * See section
062   * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
063   * XML specification.
064   *
065   * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not
066   * safe</b> to use this escaper to escape attribute values. Use
067   * {@link #xmlContentEscaper} if the output can appear in element content or
068   * {@link #xmlAttributeEscaper} in attribute values.
069   *
070   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
071   * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
072   * are not permitted in XML. For more detail see section <a
073   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
074   * XML specification.
075   *
076   * <p>This escaper does not escape non-ASCII characters to their numeric
077   * character references (NCR). Any non-ASCII characters appearing in the input
078   * will be preserved in the output. Specifically "\r" (carriage return) is
079   * preserved in the output, which may result in it being silently converted to
080   * "\n" when the XML is parsed.
081   *
082   * <p>This escaper does not treat surrogate pairs specially and does not
083   * perform Unicode validation on its input.
084   */
085  public static Escaper xmlContentEscaper() {
086    return XML_CONTENT_ESCAPER;
087  }
088
089  /**
090   * Returns an {@link Escaper} instance that escapes special characters in a
091   * string so it can safely be included in XML document as an attribute value.
092   * See section
093   * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a>
094   * of the XML specification.
095   *
096   * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
097   * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
098   * are not permitted in XML. For more detail see section <a
099   * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
100   * XML specification.
101   *
102   * <p>This escaper does not escape non-ASCII characters to their numeric
103   * character references (NCR). However, horizontal tab {@code '\t'}, line feed
104   * {@code '\n'} and carriage return {@code '\r'} are escaped to a
105   * corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
106   * respectively. Any other non-ASCII characters appearing in the input will
107   * be preserved in the output.
108   *
109   * <p>This escaper does not treat surrogate pairs specially and does not
110   * perform Unicode validation on its input.
111   */
112  public static Escaper xmlAttributeEscaper() {
113    return XML_ATTRIBUTE_ESCAPER;
114  }
115
116  private static final Escaper XML_ESCAPER;
117  private static final Escaper XML_CONTENT_ESCAPER;
118  private static final Escaper XML_ATTRIBUTE_ESCAPER;
119  static {
120    Escapers.Builder builder = Escapers.builder();
121    // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
122    // (Unicode code points above \uFFFF are represented via surrogate pairs
123    // which means they are treated as pairs of safe characters).
124    builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
125    // Unsafe characters are replaced with the Unicode replacement character.
126    builder.setUnsafeReplacement("\uFFFD");
127
128    /*
129     * Except for \n, \t, and \r, all ASCII control characters are replaced with
130     * the Unicode replacement character.
131     *
132     * Implementation note: An alternative to the following would be to make a
133     * map that simply replaces the allowed ASCII whitespace characters with
134     * themselves and to set the minimum safe character to 0x20. However this
135     * would slow down the escaping of simple strings that contain \t, \n, or
136     * \r.
137     */
138    for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
139      if (c != '\t' && c != '\n' && c != '\r') {
140        builder.addEscape(c, "\uFFFD");
141      }
142    }
143
144    // Build the content escaper first and then add quote escaping for the
145    // general escaper.
146    builder.addEscape('&', "&amp;");
147    builder.addEscape('<', "&lt;");
148    builder.addEscape('>', "&gt;");
149    XML_CONTENT_ESCAPER = builder.build();
150    builder.addEscape('\'', "&apos;");
151    builder.addEscape('"', "&quot;");
152    XML_ESCAPER = builder.build();
153    builder.addEscape('\t', "&#x9;");
154    builder.addEscape('\n', "&#xA;");
155    builder.addEscape('\r', "&#xD;");
156    XML_ATTRIBUTE_ESCAPER = builder.build();
157  }
158}