001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.xml;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.GwtCompatible;
021import com.google.common.escape.Escaper;
022import com.google.common.escape.Escapers;
023
024/**
025 * {@code Escaper} instances suitable for strings to be included in XML
026 * attribute values and elements' text contents. When possible, avoid manual
027 * escaping by using templating systems and high-level APIs that provide
028 * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or
029 * <a href="http://www.jdom.org/">JDOM</a>.
030 *
031 * <p><b>Note</b>: Currently the escapers provided by this class do not escape
032 * any characters outside the ASCII character range. Unlike HTML escaping the
033 * XML escapers will not escape non-ASCII characters to their numeric entity
034 * replacements. These XML escapers provide the minimal level of escaping to
035 * ensure that the output can be safely included in a Unicode XML document.
036 *
037 *
038 * <p>For details on the behavior of the escapers in this class, see sections
039 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and
040 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
041 * XML specification.
042 *
043 * @author Alex Matevossian
044 * @author David Beaumont
045 * @since 15.0
046 */
047@Beta
048@GwtCompatible
049public class XmlEscapers {
050  private XmlEscapers() {}
051
052  private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
053  private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
054
055  // For each xxxEscaper() method, please add links to external reference pages
056  // that are considered authoritative for the behavior of that escaper.
057
058  // TODO(user): When this escaper strips \uFFFE & \uFFFF, add this doc.
059  // <p>This escaper also silently removes non-whitespace control characters and
060  // the character values {@code 0xFFFE} and {@code 0xFFFF} which are not
061  // permitted in XML. For more detail see section
062  // <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
063  // the XML specification.
064
065  /**
066   * Returns an {@link Escaper} instance that escapes special characters in a
067   * string so it can safely be included in an XML document as element content.
068   * See section
069   * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
070   * XML specification.
071   *
072   * <p><b>Note</b>: Double and single quotes are not escaped, so it is <b>not
073   * safe</b> to use this escaper to escape attribute values. Use
074   * {@link #xmlContentEscaper} if the output can appear in element content or
075   * {@link #xmlAttributeEscaper} in attribute values.
076   *
077   * <p>This escaper does not escape non-ASCII characters to their numeric
078   * character references (NCR). Any non-ASCII characters appearing in the input
079   * will be preserved in the output. Specifically "\r" (carriage return) is
080   * preserved in the output, which may result in it being silently converted to
081   * "\n" when the XML is parsed.
082   *
083   * <p>This escaper does not treat surrogate pairs specially and does not
084   * perform Unicode validation on its input.
085   */
086  public static Escaper xmlContentEscaper() {
087    return XML_CONTENT_ESCAPER;
088  }
089
090  /**
091   * Returns an {@link Escaper} instance that escapes special characters in a
092   * string so it can safely be included in XML document as an attribute value.
093   * See section
094   * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a>
095   * of the XML specification.
096   *
097   * <p>This escaper does not escape non-ASCII characters to their numeric
098   * character references (NCR). However, horizontal tab {@code '\t'}, line feed
099   * {@code '\n'} and carriage return {@code '\r'} are escaped to a
100   * corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
101   * respectively. Any other non-ASCII characters appearing in the input will
102   * be preserved in the output.
103   *
104   * <p>This escaper does not treat surrogate pairs specially and does not
105   * perform Unicode validation on its input.
106   */
107  public static Escaper xmlAttributeEscaper() {
108    return XML_ATTRIBUTE_ESCAPER;
109  }
110
111  private static final Escaper XML_ESCAPER;
112  private static final Escaper XML_CONTENT_ESCAPER;
113  private static final Escaper XML_ATTRIBUTE_ESCAPER;
114  static {
115    Escapers.Builder builder = Escapers.builder();
116    // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
117    // (Unicode code points above \uFFFF are represented via surrogate pairs
118    // which means they are treated as pairs of safe characters).
119    // TODO(user): When refactoring done change the \uFFFF below to \uFFFD
120    builder.setSafeRange(Character.MIN_VALUE, '\uFFFF');
121    // Unsafe characters are removed.
122    builder.setUnsafeReplacement("");
123
124    // Except for '\n', '\t' and '\r' we remove all ASCII control characters.
125    // An alternative to this would be to make a map that simply replaces the
126    // allowed ASCII whitespace characters with themselves and set the minimum
127    // safe character to 0x20. However this would slow down the escaping of
128    // simple strings that contain '\t','\n' or '\r'.
129    for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
130      if (c != '\t' && c != '\n' && c != '\r') {
131        builder.addEscape(c, "");
132      }
133    }
134
135    // Build the content escaper first and then add quote escaping for the
136    // general escaper.
137    builder.addEscape('&', "&amp;");
138    builder.addEscape('<', "&lt;");
139    builder.addEscape('>', "&gt;");
140    XML_CONTENT_ESCAPER = builder.build();
141    builder.addEscape('\'', "&apos;");
142    builder.addEscape('"', "&quot;");
143    XML_ESCAPER = builder.build();
144    builder.addEscape('\t', "&#x9;");
145    builder.addEscape('\n', "&#xA;");
146    builder.addEscape('\r', "&#xD;");
147    XML_ATTRIBUTE_ESCAPER = builder.build();
148  }
149}