001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.xml; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.GwtCompatible; 021import com.google.common.escape.Escaper; 022import com.google.common.escape.Escapers; 023 024/** 025 * {@code Escaper} instances suitable for strings to be included in XML 026 * attribute values and elements' text contents. When possible, avoid manual 027 * escaping by using templating systems and high-level APIs that provide 028 * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or 029 * <a href="http://www.jdom.org/">JDOM</a>. 030 * 031 * <p><b>Note</b>: Currently the escapers provided by this class do not escape 032 * any characters outside the ASCII character range. Unlike HTML escaping the 033 * XML escapers will not escape non-ASCII characters to their numeric entity 034 * replacements. These XML escapers provide the minimal level of escaping to 035 * ensure that the output can be safely included in a Unicode XML document. 036 * 037 * 038 * <p>For details on the behavior of the escapers in this class, see sections 039 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and 040 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the 041 * XML specification. 042 * 043 * @author Alex Matevossian 044 * @author David Beaumont 045 * @since 15.0 046 */ 047@Beta 048@GwtCompatible 049public class XmlEscapers { 050 private XmlEscapers() {} 051 052 private static final char MIN_ASCII_CONTROL_CHAR = 0x00; 053 private static final char MAX_ASCII_CONTROL_CHAR = 0x1F; 054 055 // For each xxxEscaper() method, please add links to external reference pages 056 // that are considered authoritative for the behavior of that escaper. 057 058 // TODO(user): When this escaper strips \uFFFE & \uFFFF, add this doc. 059 // <p>This escaper also silently removes non-whitespace control characters and 060 // the character values {@code 0xFFFE} and {@code 0xFFFF} which are not 061 // permitted in XML. For more detail see section 062 // <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of 063 // the XML specification. 064 065 /** 066 * Returns an {@link Escaper} instance that escapes special characters in a 067 * string so it can safely be included in an XML document as element content. 068 * See section 069 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the 070 * XML specification. 071 * 072 * <p><b>Note</b>: Double and single quotes are not escaped, so it is <b>not 073 * safe</b> to use this escaper to escape attribute values. Use 074 * {@link #xmlContentEscaper} if the output can appear in element content or 075 * {@link #xmlAttributeEscaper} in attribute values. 076 * 077 * <p>This escaper does not escape non-ASCII characters to their numeric 078 * character references (NCR). Any non-ASCII characters appearing in the input 079 * will be preserved in the output. Specifically "\r" (carriage return) is 080 * preserved in the output, which may result in it being silently converted to 081 * "\n" when the XML is parsed. 082 * 083 * <p>This escaper does not treat surrogate pairs specially and does not 084 * perform Unicode validation on its input. 085 */ 086 public static Escaper xmlContentEscaper() { 087 return XML_CONTENT_ESCAPER; 088 } 089 090 /** 091 * Returns an {@link Escaper} instance that escapes special characters in a 092 * string so it can safely be included in XML document as an attribute value. 093 * See section 094 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> 095 * of the XML specification. 096 * 097 * <p>This escaper does not escape non-ASCII characters to their numeric 098 * character references (NCR). However, horizontal tab {@code '\t'}, line feed 099 * {@code '\n'} and carriage return {@code '\r'} are escaped to a 100 * corresponding NCR {@code "	"}, {@code "
"}, and {@code "
"} 101 * respectively. Any other non-ASCII characters appearing in the input will 102 * be preserved in the output. 103 * 104 * <p>This escaper does not treat surrogate pairs specially and does not 105 * perform Unicode validation on its input. 106 */ 107 public static Escaper xmlAttributeEscaper() { 108 return XML_ATTRIBUTE_ESCAPER; 109 } 110 111 private static final Escaper XML_ESCAPER; 112 private static final Escaper XML_CONTENT_ESCAPER; 113 private static final Escaper XML_ATTRIBUTE_ESCAPER; 114 static { 115 Escapers.Builder builder = Escapers.builder(); 116 // The char values \uFFFE and \uFFFF are explicitly not allowed in XML 117 // (Unicode code points above \uFFFF are represented via surrogate pairs 118 // which means they are treated as pairs of safe characters). 119 // TODO(user): When refactoring done change the \uFFFF below to \uFFFD 120 builder.setSafeRange(Character.MIN_VALUE, '\uFFFF'); 121 // Unsafe characters are removed. 122 builder.setUnsafeReplacement(""); 123 124 // Except for '\n', '\t' and '\r' we remove all ASCII control characters. 125 // An alternative to this would be to make a map that simply replaces the 126 // allowed ASCII whitespace characters with themselves and set the minimum 127 // safe character to 0x20. However this would slow down the escaping of 128 // simple strings that contain '\t','\n' or '\r'. 129 for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) { 130 if (c != '\t' && c != '\n' && c != '\r') { 131 builder.addEscape(c, ""); 132 } 133 } 134 135 // Build the content escaper first and then add quote escaping for the 136 // general escaper. 137 builder.addEscape('&', "&"); 138 builder.addEscape('<', "<"); 139 builder.addEscape('>', ">"); 140 XML_CONTENT_ESCAPER = builder.build(); 141 builder.addEscape('\'', "'"); 142 builder.addEscape('"', """); 143 XML_ESCAPER = builder.build(); 144 builder.addEscape('\t', "	"); 145 builder.addEscape('\n', "
"); 146 builder.addEscape('\r', "
"); 147 XML_ATTRIBUTE_ESCAPER = builder.build(); 148 } 149}