1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package html provides functions for escaping and unescaping HTML text.
14 type writer
interface {
15 WriteString(string) (int, error
)
18 // These replacements permit compatibility with old numeric entities that
19 // assumed Windows-1252 encoding.
20 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
21 var replacementTable
= [...]rune
{
22 '\u20AC', // First entry is what 0x80 should be replaced with.
53 '\u0178', // Last entry is 0x9F.
54 // 0x00->'\uFFFD' is handled programmatically.
55 // 0x0D->'\u000D' is a no-op.
58 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
59 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
60 // Precondition: b[src] == '&' && dst <= src.
61 // attribute should be true if parsing an attribute value.
62 func unescapeEntity(b
[]byte, dst
, src
int, attribute
bool) (dst1
, src1
int) {
63 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
65 // i starts at 1 because we already know that s[0] == '&'.
70 return dst
+ 1, src
+ 1
74 if len(s
) <= 3 { // We need to have at least "&#.".
76 return dst
+ 1, src
+ 1
81 if c
== 'x' || c
== 'X' {
91 if '0' <= c
&& c
<= '9' {
92 x
= 16*x
+ rune(c
) - '0'
94 } else if 'a' <= c
&& c
<= 'f' {
95 x
= 16*x
+ rune(c
) - 'a' + 10
97 } else if 'A' <= c
&& c
<= 'F' {
98 x
= 16*x
+ rune(c
) - 'A' + 10
101 } else if '0' <= c
&& c
<= '9' {
102 x
= 10*x
+ rune(c
) - '0'
111 if i
<= 3 { // No characters matched.
113 return dst
+ 1, src
+ 1
116 if 0x80 <= x
&& x
<= 0x9F {
117 // Replace characters from Windows-1252 with UTF-8 equivalents.
118 x
= replacementTable
[x
-0x80]
119 } else if x
== 0 ||
(0xD800 <= x
&& x
<= 0xDFFF) || x
> 0x10FFFF {
120 // Replace invalid characters with the replacement character.
124 return dst
+ utf8
.EncodeRune(b
[dst
:], x
), src
+ i
127 // Consume the maximum number of characters possible, with the
128 // consumed characters matching one of the named references.
133 // Lower-cased characters are more common in entities, so we check for them first.
134 if 'a' <= c
&& c
<= 'z' ||
'A' <= c
&& c
<= 'Z' ||
'0' <= c
&& c
<= '9' {
143 entityName
:= string(s
[1:i
])
144 if entityName
== "" {
146 } else if attribute
&& entityName
[len(entityName
)-1] != ';' && len(s
) > i
&& s
[i
] == '=' {
148 } else if x
:= entity
[entityName
]; x
!= 0 {
149 return dst
+ utf8
.EncodeRune(b
[dst
:], x
), src
+ i
150 } else if x
:= entity2
[entityName
]; x
[0] != 0 {
151 dst1
:= dst
+ utf8
.EncodeRune(b
[dst
:], x
[0])
152 return dst1
+ utf8
.EncodeRune(b
[dst1
:], x
[1]), src
+ i
153 } else if !attribute
{
154 maxLen
:= len(entityName
) - 1
155 if maxLen
> longestEntityWithoutSemicolon
{
156 maxLen
= longestEntityWithoutSemicolon
158 for j
:= maxLen
; j
> 1; j
-- {
159 if x
:= entity
[entityName
[:j
]]; x
!= 0 {
160 return dst
+ utf8
.EncodeRune(b
[dst
:], x
), src
+ j
+ 1
165 dst1
, src1
= dst
+i
, src
+i
166 copy(b
[dst
:dst1
], b
[src
:src1
])
170 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
171 func unescape(b
[]byte) []byte {
172 for i
, c
:= range b
{
174 dst
, src
:= unescapeEntity(b
, i
, i
, false)
178 dst
, src
= unescapeEntity(b
, dst
, src
, false)
181 dst
, src
= dst
+1, src
+1
// escapedChars holds the characters that escape searches for (via
// strings.IndexAny) and writes out in escaped form.
const escapedChars = `&'<>"`
192 func escape(w writer
, s
string) error
{
193 i
:= strings
.IndexAny(s
, escapedChars
)
195 if _
, err
:= w
.WriteString(s
[:i
]); err
!= nil {
203 // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
210 // "&#34;" is shorter than "&quot;".
213 panic("unrecognized escape character")
216 if _
, err
:= w
.WriteString(esc
); err
!= nil {
219 i
= strings
.IndexAny(s
, escapedChars
)
221 _
, err
:= w
.WriteString(s
)
225 // EscapeString escapes special characters like "<" to become "&lt;". It
226 // escapes only five such characters: <, >, &, ' and ".
227 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
228 // always true.
229 func EscapeString(s
string) string {
230 if strings
.IndexAny(s
, escapedChars
) == -1 {
238 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
239 // larger range of entities than EscapeString escapes. For example, "&aacute;"
240 // unescapes to "á", as do "&#225;" and "&#xE1;".
241 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
242 // always true.
243 func UnescapeString(s
string) string {
244 for _
, c
:= range s
{
246 return string(unescape([]byte(s
)))