1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package html provides functions for escaping and unescaping HTML text.
// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// The table is indexed by (code point - 0x80) and covers the 32 code points
// 0x80 through 0x9F. Entries that have no Windows-1252 meaning map to
// themselves.
var replacementTable = [...]rune{
	'\u20AC', // First entry is what 0x80 should be replaced with.
	'\u0081',
	'\u201A',
	'\u0192',
	'\u201E',
	'\u2026',
	'\u2020',
	'\u2021',
	'\u02C6',
	'\u2030',
	'\u0160',
	'\u2039',
	'\u0152',
	'\u008D',
	'\u017D',
	'\u008F',
	'\u0090',
	'\u2018',
	'\u2019',
	'\u201C',
	'\u201D',
	'\u2022',
	'\u2013',
	'\u2014',
	'\u02DC',
	'\u2122',
	'\u0161',
	'\u203A',
	'\u0153',
	'\u009D',
	'\u017E',
	'\u0178', // Last entry is 0x9F.
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
53 // unescapeEntity reads an entity like "<" from b[src:] and writes the
54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55 // Precondition: b[src] == '&' && dst <= src.
56 func unescapeEntity(b
[]byte, dst
, src
int) (dst1
, src1
int) {
57 const attribute
= false
59 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
61 // i starts at 1 because we already know that s[0] == '&'.
66 return dst
+ 1, src
+ 1
70 if len(s
) <= 3 { // We need to have at least "&#.".
72 return dst
+ 1, src
+ 1
77 if c
== 'x' || c
== 'X' {
87 if '0' <= c
&& c
<= '9' {
88 x
= 16*x
+ rune(c
) - '0'
90 } else if 'a' <= c
&& c
<= 'f' {
91 x
= 16*x
+ rune(c
) - 'a' + 10
93 } else if 'A' <= c
&& c
<= 'F' {
94 x
= 16*x
+ rune(c
) - 'A' + 10
97 } else if '0' <= c
&& c
<= '9' {
98 x
= 10*x
+ rune(c
) - '0'
107 if i
<= 3 { // No characters matched.
109 return dst
+ 1, src
+ 1
112 if 0x80 <= x
&& x
<= 0x9F {
113 // Replace characters from Windows-1252 with UTF-8 equivalents.
114 x
= replacementTable
[x
-0x80]
115 } else if x
== 0 ||
(0xD800 <= x
&& x
<= 0xDFFF) || x
> 0x10FFFF {
116 // Replace invalid characters with the replacement character.
120 return dst
+ utf8
.EncodeRune(b
[dst
:], x
), src
+ i
123 // Consume the maximum number of characters possible, with the
124 // consumed characters matching one of the named references.
129 // Lower-cased characters are more common in entities, so we check for them first.
130 if 'a' <= c
&& c
<= 'z' ||
'A' <= c
&& c
<= 'Z' ||
'0' <= c
&& c
<= '9' {
140 if len(entityName
) == 0 {
142 } else if attribute
&& entityName
[len(entityName
)-1] != ';' && len(s
) > i
&& s
[i
] == '=' {
144 } else if x
:= entity
[string(entityName
)]; x
!= 0 {
145 return dst
+ utf8
.EncodeRune(b
[dst
:], x
), src
+ i
146 } else if x
:= entity2
[string(entityName
)]; x
[0] != 0 {
147 dst1
:= dst
+ utf8
.EncodeRune(b
[dst
:], x
[0])
148 return dst1
+ utf8
.EncodeRune(b
[dst1
:], x
[1]), src
+ i
149 } else if !attribute
{
150 maxLen
:= len(entityName
) - 1
151 if maxLen
> longestEntityWithoutSemicolon
{
152 maxLen
= longestEntityWithoutSemicolon
154 for j
:= maxLen
; j
> 1; j
-- {
155 if x
:= entity
[string(entityName
[:j
])]; x
!= 0 {
156 return dst
+ utf8
.EncodeRune(b
[dst
:], x
), src
+ j
+ 1
161 dst1
, src1
= dst
+i
, src
+i
162 copy(b
[dst
:dst1
], b
[src
:src1
])
// htmlEscaper replaces the five characters &, ', <, > and " with their
// HTML entity equivalents. Numeric forms are used for the two quote
// characters because they are shorter than the named forms.
var htmlEscaper = strings.NewReplacer(
	`&`, "&amp;",
	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
	`<`, "&lt;",
	`>`, "&gt;",
	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
)
174 // EscapeString escapes special characters like "<" to become "<". It
175 // escapes only five such characters: <, >, &, ' and ".
176 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
178 func EscapeString(s
string) string {
179 return htmlEscaper
.Replace(s
)
182 // UnescapeString unescapes entities like "<" to become "<". It unescapes a
183 // larger range of entities than EscapeString escapes. For example, "á"
184 // unescapes to "รก", as does "á" and "á".
185 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
187 func UnescapeString(s
string) string {
188 i
:= strings
.IndexByte(s
, '&')
195 dst
, src
:= unescapeEntity(b
, i
, i
)
196 for len(s
[src
:]) > 0 {
200 i
= strings
.IndexByte(s
[src
:], '&')
203 dst
+= copy(b
[dst
:], s
[src
:])
208 copy(b
[dst
:], s
[src
:src
+i
])
210 dst
, src
= unescapeEntity(b
, dst
+i
, src
+i
)
212 return string(b
[:dst
])