(**
 * Copyright (c) 2017, Facebook, Inc.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the "hack" directory of this source tree.
 *)
(**
 * HTML5 special entity decoding
 *
 * HHVM decodes certain HTML entities present in input strings before
 * generating bytecode. In order to generate bytecode identical to HHVM's,
 * this module performs the same HTML entity decoding as HHVM.
 * Mimics: zend-html.cpp
 * The list of entities tested was taken from
 * https://dev.w3.org/html5/html-author/charref on 09/27/2017.
 *)
(** Matches one candidate entity reference: an ampersand, one or more
    characters other than [;], and the terminating [;]. *)
let entity_regex = Str.regexp "&[^;]+;"
(** A run of consecutive code points together with the entity names that
    belong to them. *)
type html_entity_map = {
  start_char : int;
  (* Code point named by the first element of [table]; element [i] of
     [table] names code point [start_char + i]. *)
  table : string list;
  (* Entity names in ascending code-point order; an empty string marks a
     code point that has no entity name. *)
}
(** [utf32_to_utf8 k] encodes the code point [k] as a UTF-8 byte string.

    The length of the result is chosen from the magnitude of [k], using the
    one- to six-byte forms spelled out in the C reference snippets below.
    [k] is not checked for being a valid Unicode scalar value.

    @raise Invalid_argument (from [char_of_int]) when a computed byte falls
    outside 0..255, e.g. for a negative [k]. *)
let utf32_to_utf8 k =
  (* Store byte value [n] at index [pos] and return the buffer, so that
     successive writes can be chained with the pipe operator. *)
  let set pos n b =
    Bytes.set b pos (char_of_int n);
    b
  in
  let encoded =
    if k < 0x80 then
      (* buf[0] = k; *)
      Bytes.create 1
      |> set 0 k
    else if k < 0x800 then
      (* buf[0] = 0xc0 | (k >> 6);
         buf[1] = 0x80 | (k & 0x3f); *)
      Bytes.create 2
      |> set 0 (0xc0 lor (k lsr 6))
      |> set 1 (0x80 lor (k land 0x3f))
    else if k < 0x10000 then
      (* buf[0] = 0xe0 | (k >> 12);
         buf[1] = 0x80 | ((k >> 6) & 0x3f);
         buf[2] = 0x80 | (k & 0x3f); *)
      Bytes.create 3
      |> set 0 (0xe0 lor (k lsr 12))
      |> set 1 (0x80 lor ((k lsr 6) land 0x3f))
      |> set 2 (0x80 lor (k land 0x3f))
    else if k < 0x200000 then
      (* buf[0] = 0xf0 | (k >> 18);
         buf[1] = 0x80 | ((k >> 12) & 0x3f);
         buf[2] = 0x80 | ((k >> 6) & 0x3f);
         buf[3] = 0x80 | (k & 0x3f); *)
      Bytes.create 4
      |> set 0 (0xf0 lor (k lsr 18))
      |> set 1 (0x80 lor ((k lsr 12) land 0x3f))
      |> set 2 (0x80 lor ((k lsr 6) land 0x3f))
      |> set 3 (0x80 lor (k land 0x3f))
    else if k < 0x4000000 then
      (* buf[0] = 0xf8 | (k >> 24);
         buf[1] = 0x80 | ((k >> 18) & 0x3f);
         buf[2] = 0x80 | ((k >> 12) & 0x3f);
         buf[3] = 0x80 | ((k >> 6) & 0x3f);
         buf[4] = 0x80 | (k & 0x3f); *)
      Bytes.create 5
      |> set 0 (0xf8 lor (k lsr 24))
      |> set 1 (0x80 lor ((k lsr 18) land 0x3f))
      |> set 2 (0x80 lor ((k lsr 12) land 0x3f))
      |> set 3 (0x80 lor ((k lsr 6) land 0x3f))
      |> set 4 (0x80 lor (k land 0x3f))
    else
      (* buf[0] = 0xfc | (k >> 30);
         buf[1] = 0x80 | ((k >> 24) & 0x3f);
         buf[2] = 0x80 | ((k >> 18) & 0x3f);
         buf[3] = 0x80 | ((k >> 12) & 0x3f);
         buf[4] = 0x80 | ((k >> 6) & 0x3f);
         buf[5] = 0x80 | (k & 0x3f); *)
      Bytes.create 6
      |> set 0 (0xfc lor (k lsr 30))
      |> set 1 (0x80 lor ((k lsr 24) land 0x3f))
      |> set 2 (0x80 lor ((k lsr 18) land 0x3f))
      |> set 3 (0x80 lor ((k lsr 12) land 0x3f))
      |> set 4 (0x80 lor ((k lsr 6) land 0x3f))
      |> set 5 (0x80 lor (k land 0x3f))
  in
  Bytes.to_string encoded
(** Entity names for the 96 consecutive code points 0xa0 through 0xff, one
    name per code point in ascending order (eight per row below). *)
let ent_iso_8859_1 = [
  (* 0xa0 *)
  "nbsp"; "iexcl"; "cent"; "pound"; "curren"; "yen"; "brvbar"; "sect";
  (* 0xa8 *)
  "uml"; "copy"; "ordf"; "laquo"; "not"; "shy"; "reg"; "macr";
  (* 0xb0 *)
  "deg"; "plusmn"; "sup2"; "sup3"; "acute"; "micro"; "para"; "middot";
  (* 0xb8 *)
  "cedil"; "sup1"; "ordm"; "raquo"; "frac14"; "frac12"; "frac34"; "iquest";
  (* 0xc0 *)
  "Agrave"; "Aacute"; "Acirc"; "Atilde"; "Auml"; "Aring"; "AElig"; "Ccedil";
  (* 0xc8 *)
  "Egrave"; "Eacute"; "Ecirc"; "Euml"; "Igrave"; "Iacute"; "Icirc"; "Iuml";
  (* 0xd0 *)
  "ETH"; "Ntilde"; "Ograve"; "Oacute"; "Ocirc"; "Otilde"; "Ouml"; "times";
  (* 0xd8 *)
  "Oslash"; "Ugrave"; "Uacute"; "Ucirc"; "Uuml"; "Yacute"; "THORN"; "szlig";
  (* 0xe0 *)
  "agrave"; "aacute"; "acirc"; "atilde"; "auml"; "aring"; "aelig"; "ccedil";
  (* 0xe8 *)
  "egrave"; "eacute"; "ecirc"; "euml"; "igrave"; "iacute"; "icirc"; "iuml";
  (* 0xf0 *)
  "eth"; "ntilde"; "ograve"; "oacute"; "ocirc"; "otilde"; "ouml"; "divide";
  (* 0xf8 *)
  "oslash"; "ugrave"; "uacute"; "ucirc"; "uuml"; "yacute"; "thorn"; "yuml";
]
116 let ent_uni_338_402 = [
118 "OElig"; "oelig"; ""; ""; ""; "";
119 ""; ""; ""; ""; ""; ""; ""; "";
121 "Scaron"; "scaron"; ""; ""; ""; ""; ""; "";
122 ""; ""; ""; ""; ""; ""; ""; "";
123 ""; ""; ""; ""; ""; ""; ""; "";
125 "Yuml"; ""; ""; ""; ""; ""; ""; "";
126 ""; ""; ""; ""; ""; ""; ""; "";
127 ""; ""; ""; ""; ""; ""; ""; "";
131 let ent_uni_spacing = [
135 ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
136 ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
140 let ent_uni_greek = [
142 "Alpha"; "Beta"; "Gamma"; "Delta"; "Epsilon"; "Zeta"; "Eta"; "Theta";
143 "Iota"; "Kappa"; "Lambda"; "Mu"; "Nu"; "Xi"; "Omicron"; "Pi"; "Rho";
144 ""; "Sigma"; "Tau"; "Upsilon"; "Phi"; "Chi"; "Psi"; "Omega";
145 (* 938 - 944 are not mapped *)
146 ""; ""; ""; ""; ""; ""; "";
147 "alpha"; "beta"; "gamma"; "delta"; "epsilon"; "zeta"; "eta"; "theta";
148 "iota"; "kappa"; "lambda"; "mu"; "nu"; "xi"; "omicron"; "pi"; "rho";
149 "sigmaf"; "sigma"; "tau"; "upsilon"; "phi"; "chi"; "psi"; "omega";
150 (* 970 - 976 are not mapped *)
151 ""; ""; ""; ""; ""; ""; "";
156 let ent_uni_punct = [
158 "ensp"; "emsp"; ""; ""; ""; ""; "";
159 "thinsp"; ""; ""; "zwnj"; "zwj"; "lrm"; "rlm";
160 ""; ""; ""; "ndash"; "mdash"; ""; ""; "";
162 "lsquo"; "rsquo"; "sbquo"; ""; "ldquo"; "rdquo"; "bdquo"; "";
163 "dagger"; "Dagger"; "bull"; ""; ""; ""; "hellip";
164 ""; ""; ""; ""; ""; ""; ""; ""; ""; "permil"; "";
166 "prime"; "Prime"; ""; ""; ""; ""; ""; "lsaquo"; "rsaquo"; "";
167 ""; ""; "oline"; ""; ""; ""; ""; "";
173 let ent_uni_8465_8501 = [
175 "image"; ""; ""; ""; ""; ""; "";
177 "weierp"; ""; ""; "";
179 "real"; ""; ""; ""; ""; "";
181 "trade"; ""; ""; ""; ""; ""; ""; ""; "";
182 ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
186 let ent_uni_8592_9002 = [
188 "larr"; "uarr"; "rarr"; "darr"; "harr"; ""; ""; "";
189 ""; ""; ""; ""; ""; ""; ""; "";
191 ""; ""; ""; ""; ""; ""; ""; "";
192 ""; ""; ""; ""; ""; ""; ""; "";
194 ""; ""; ""; ""; ""; "crarr"; ""; "";
195 ""; ""; ""; ""; ""; ""; ""; "";
197 ""; ""; ""; ""; ""; ""; ""; "";
198 ""; ""; ""; ""; ""; ""; ""; "";
200 "lArr"; "uArr"; "rArr"; "dArr"; "hArr"; "vArr"; ""; "";
201 ""; ""; "lAarr"; "rAarr"; ""; "rarrw"; ""; "";
203 ""; ""; ""; ""; ""; ""; ""; "";
204 ""; ""; ""; ""; ""; ""; ""; "";
205 ""; ""; ""; ""; ""; ""; ""; "";
206 ""; ""; ""; ""; ""; ""; ""; "";
208 "forall"; "comp"; "part"; "exist"; "nexist"; "empty"; ""; "nabla";
209 "isin"; "notin"; "epsis"; "ni"; "notni"; "bepsi"; ""; "prod";
211 "coprod"; "sum"; "minus"; "mnplus"; "plusdo"; ""; "setmn"; "lowast";
212 "compfn"; ""; "radic"; ""; ""; "prop"; "infin"; "ang90";
214 "ang"; "angmsd"; "angsph"; "mid"; "nmid"; "par"; "npar"; "and";
215 "or"; "cap"; "cup"; "int"; ""; ""; "conint"; "";
217 ""; ""; ""; ""; "there4"; "becaus"; ""; "";
218 ""; ""; ""; ""; "sim"; "bsim"; ""; "";
220 "wreath"; "nsim"; ""; "sime"; "nsime"; "cong"; ""; "ncong";
221 "asymp"; "nap"; "ape"; ""; "bcong"; "asymp"; "bump"; "bumpe";
223 ""; ""; ""; ""; ""; ""; ""; "";
224 ""; ""; ""; ""; ""; ""; ""; "";
226 "ne"; "equiv"; ""; ""; "le"; "ge"; "lE"; "gE";
227 "lnE"; "gnE"; "Lt"; "Gt"; "twixt"; ""; "nlt"; "ngt";
229 "nles"; "nges"; "lsim"; "gsim"; ""; ""; "lg"; "gl";
230 ""; ""; "pr"; "sc"; "cupre"; "sscue"; "prsim"; "scsim";
232 "npr"; "nsc"; "sub"; "sup"; "nsub"; "nsup"; "sube"; "supe";
233 ""; ""; ""; ""; ""; ""; ""; "";
235 ""; ""; ""; ""; ""; "oplus"; ""; "otimes";
236 ""; ""; ""; ""; ""; ""; ""; "";
238 ""; ""; ""; ""; ""; "perp"; ""; "";
239 ""; ""; ""; ""; ""; ""; ""; "";
241 ""; ""; ""; ""; ""; ""; ""; "";
242 ""; ""; ""; ""; ""; ""; ""; "";
244 ""; ""; ""; ""; ""; "sdot"; ""; "";
245 ""; ""; ""; ""; ""; ""; ""; "";
247 ""; ""; ""; ""; ""; ""; ""; "";
248 ""; ""; ""; ""; ""; ""; ""; "";
250 ""; ""; ""; ""; ""; ""; ""; "";
251 ""; ""; ""; ""; ""; ""; ""; "";
253 ""; ""; ""; ""; ""; ""; ""; "";
254 ""; ""; ""; ""; ""; ""; ""; "";
256 ""; ""; ""; ""; ""; ""; ""; "";
257 "lceil"; "rceil"; "lfloor"; "rfloor"; ""; ""; ""; "";
259 ""; ""; ""; ""; ""; ""; ""; "";
260 ""; ""; ""; ""; ""; ""; ""; "";
262 ""; ""; ""; ""; ""; ""; ""; "";
(** Entity names for code points 9824 through 9830, in ascending order.
    An empty string marks a code point that has no entity name. *)
let ent_uni_9824_9830 = [
  "spades"; ""; ""; "clubs"; ""; "hearts"; "diams";
]
(** Every entity table paired with the code point of its first element.
    Element [i] of a [table] names code point [start_char + i]. *)
let utf_entity_maps = [
  { start_char = 0xa0; table = ent_iso_8859_1 };
  { start_char = 338;  table = ent_uni_338_402 };
  { start_char = 710;  table = ent_uni_spacing };
  { start_char = 913;  table = ent_uni_greek };
  { start_char = 8194; table = ent_uni_punct };
  { start_char = 8364; table = ent_uni_euro };
  { start_char = 8465; table = ent_uni_8465_8501 };
  { start_char = 8592; table = ent_uni_8592_9002 };
  { start_char = 9674; table = ent_uni_9674 };
  { start_char = 9824; table = ent_uni_9824_9830 };
]
285 let decode_table = Caml.Hashtbl.create
0 in
286 List.iter
utf_entity_maps ~f
: begin fun { start_char
; table
} ->
287 List.iteri table ~f
: begin fun i entity
->
288 if String.length entity
<> 0 then
289 Caml.Hashtbl.add
decode_table entity
(utf32_to_utf8 (start_char
+ i
))
298 "cloud", utf32_to_utf8 0x2601;
299 "umbrella", utf32_to_utf8 0x2602;
300 "snowman", utf32_to_utf8 0x2603;
301 "snowflake", utf32_to_utf8 0x2745;
302 "comet", utf32_to_utf8 0x2604;
303 "thunderstorm", utf32_to_utf8 0x2608
305 List.iter
predefined ~f
:(fun (k
, v
) -> Caml.Hashtbl.add
decode_table k v
);
308 let decode_entity s
=
309 (* check if entity has shape &#...
310 - if yes - this is value of utf32 codepoint
311 that should be converted to utf8 *)
312 if String.get s
1 = '#'
314 assert (String.length s
>= 3);
316 "0" ^
String.sub s
2 (String.length s
- 3)
320 (* malformed entity - return empty string *)
324 assert (String.length s
>= 2);
325 (* strip & and ; from the value and lookup result in decode table *)
326 String.sub s
1 (String.length s
- 2)
327 |> Caml.Hashtbl.find_opt
decode_table
328 |> Option.value ~default
:s
(** [decode s] returns a copy of [s] in which every substring matched by
    [entity_regex] has been replaced with the result of [decode_entity]
    applied to that substring.  Text outside the matches is kept as is. *)
let decode s =
  (* [Str.global_substitute] hands the subject string to the callback;
     [Str.matched_string] extracts the text of the current match from it. *)
  let replace_match subject = decode_entity (Str.matched_string subject) in
  Str.global_substitute entity_regex replace_match s