001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.xml; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.GwtCompatible; 021import com.google.common.escape.Escaper; 022import com.google.common.escape.Escapers; 023 024/** 025 * {@code Escaper} instances suitable for strings to be included in XML 026 * attribute values and elements' text contents. When possible, avoid manual 027 * escaping by using templating systems and high-level APIs that provide 028 * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or 029 * <a href="http://www.jdom.org/">JDOM</a>. 030 * 031 * <p><b>Note:</b> Currently the escapers provided by this class do not escape 032 * any characters outside the ASCII character range. Unlike HTML escaping the 033 * XML escapers will not escape non-ASCII characters to their numeric entity 034 * replacements. These XML escapers provide the minimal level of escaping to 035 * ensure that the output can be safely included in a Unicode XML document. 036 * 037 * 038 * <p>For details on the behavior of the escapers in this class, see sections 039 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and 040 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the 041 * XML specification. 042 * 043 * @author Alex Matevossian 044 * @author David Beaumont 045 * @since 15.0 046 */ 047@Beta 048@GwtCompatible 049public class XmlEscapers { 050 private XmlEscapers() {} 051 052 private static final char MIN_ASCII_CONTROL_CHAR = 0x00; 053 private static final char MAX_ASCII_CONTROL_CHAR = 0x1F; 054 055 // For each xxxEscaper() method, please add links to external reference pages 056 // that are considered authoritative for the behavior of that escaper. 057 058 /** 059 * Returns an {@link Escaper} instance that escapes special characters in a 060 * string so it can safely be included in an XML document as element content. 061 * See section 062 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the 063 * XML specification. 064 * 065 * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not 066 * safe</b> to use this escaper to escape attribute values. Use 067 * {@link #xmlContentEscaper} if the output can appear in element content or 068 * {@link #xmlAttributeEscaper} in attribute values. 069 * 070 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control 071 * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which 072 * are not permitted in XML. For more detail see section <a 073 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the 074 * XML specification. 075 * 076 * <p>This escaper does not escape non-ASCII characters to their numeric 077 * character references (NCR). Any non-ASCII characters appearing in the input 078 * will be preserved in the output. Specifically "\r" (carriage return) is 079 * preserved in the output, which may result in it being silently converted to 080 * "\n" when the XML is parsed. 081 * 082 * <p>This escaper does not treat surrogate pairs specially and does not 083 * perform Unicode validation on its input. 084 */ 085 public static Escaper xmlContentEscaper() { 086 return XML_CONTENT_ESCAPER; 087 } 088 089 /** 090 * Returns an {@link Escaper} instance that escapes special characters in a 091 * string so it can safely be included in XML document as an attribute value. 092 * See section 093 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> 094 * of the XML specification. 095 * 096 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control 097 * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which 098 * are not permitted in XML. For more detail see section <a 099 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the 100 * XML specification. 101 * 102 * <p>This escaper does not escape non-ASCII characters to their numeric 103 * character references (NCR). However, horizontal tab {@code '\t'}, line feed 104 * {@code '\n'} and carriage return {@code '\r'} are escaped to a 105 * corresponding NCR {@code "	"}, {@code "
"}, and {@code "
"} 106 * respectively. Any other non-ASCII characters appearing in the input will 107 * be preserved in the output. 108 * 109 * <p>This escaper does not treat surrogate pairs specially and does not 110 * perform Unicode validation on its input. 111 */ 112 public static Escaper xmlAttributeEscaper() { 113 return XML_ATTRIBUTE_ESCAPER; 114 } 115 116 private static final Escaper XML_ESCAPER; 117 private static final Escaper XML_CONTENT_ESCAPER; 118 private static final Escaper XML_ATTRIBUTE_ESCAPER; 119 static { 120 Escapers.Builder builder = Escapers.builder(); 121 // The char values \uFFFE and \uFFFF are explicitly not allowed in XML 122 // (Unicode code points above \uFFFF are represented via surrogate pairs 123 // which means they are treated as pairs of safe characters). 124 builder.setSafeRange(Character.MIN_VALUE, '\uFFFD'); 125 // Unsafe characters are replaced with the Unicode replacement character. 126 builder.setUnsafeReplacement("\uFFFD"); 127 128 /* 129 * Except for \n, \t, and \r, all ASCII control characters are replaced with 130 * the Unicode replacement character. 131 * 132 * Implementation note: An alternative to the following would be to make a 133 * map that simply replaces the allowed ASCII whitespace characters with 134 * themselves and to set the minimum safe character to 0x20. However this 135 * would slow down the escaping of simple strings that contain \t, \n, or 136 * \r. 137 */ 138 for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) { 139 if (c != '\t' && c != '\n' && c != '\r') { 140 builder.addEscape(c, "\uFFFD"); 141 } 142 } 143 144 // Build the content escaper first and then add quote escaping for the 145 // general escaper. 146 builder.addEscape('&', "&"); 147 builder.addEscape('<', "<"); 148 builder.addEscape('>', ">"); 149 XML_CONTENT_ESCAPER = builder.build(); 150 builder.addEscape('\'', "'"); 151 builder.addEscape('"', """); 152 XML_ESCAPER = builder.build(); 153 builder.addEscape('\t', "	"); 154 builder.addEscape('\n', "
"); 155 builder.addEscape('\r', "
"); 156 XML_ATTRIBUTE_ESCAPER = builder.build(); 157 } 158}