disallow keywords as identifiers
[hiphop-php.git] / hphp / hack / src / parser / html_entities.ml
blob1fb13f49d1954a2eb1af52e8dcba3836faf808d2
(*
 * Copyright (c) 2017, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the "hack" directory of this source tree.
 *
 *)
(**
 * HTML5 special entity decoding
 *
 * HHVM decodes certain HTML entities present in input strings before
 * generating bytecode. In order to generate bytecode identical to HHVM's,
 * this module performs the same HTML entity decoding as HHVM.
 * Mimics: zend-html.cpp
 *
 * The list of entities tested was taken from
 * https://dev.w3.org/html5/html-author/charref on 09/27/2017.
 *)
21 open Core_kernel
23 module B = Bytes
(** Matches one candidate HTML entity: an ampersand, one or more characters
    other than [';'], then the terminating [';'] (for example ["&amp;"] or
    ["&#x41;"]). Every match is therefore at least three characters long. *)
let entity_regex = Str.regexp "&[^;]+;"
(** A contiguous run of code points together with their entity names.

    [table] is positional: its [i]-th element is the entity name of code
    point [start_char + i]. The empty string marks a code point that has no
    named entity and is skipped when the lookup table is built. *)
type html_entity_map = {
  start_char: int;
  table: string list;
}
(** [set pos n b] writes the byte whose code is [n] at index [pos] of [b] and
    returns [b], so that successive writes can be chained with [|>].

    @raise Invalid_argument if [n] is outside [0..255] (from [char_of_int])
    or if [pos] is not a valid index of [b]. *)
let set pos n b =
  B.set b pos (char_of_int n);
  b

(** [utf32_to_utf8 k] encodes the code point [k] as a byte string, using the
    original (pre-RFC 3629) UTF-8 layout of one to six bytes:

    - [k < 0x80]      : 1 byte,  lead [k]
    - [k < 0x800]     : 2 bytes, lead [0xc0 | (k >> 6)]
    - [k < 0x10000]   : 3 bytes, lead [0xe0 | (k >> 12)]
    - [k < 0x200000]  : 4 bytes, lead [0xf0 | (k >> 18)]
    - [k < 0x4000000] : 5 bytes, lead [0xf8 | (k >> 24)]
    - otherwise       : 6 bytes, lead [0xfc | (k >> 30)]

    Every byte after the lead is [0x80 | ((k >> shift) & 0x3f)]. No check is
    made that [k] is a valid Unicode scalar value.

    The result is returned as an immutable [string] because callers store it
    next to string literals in the entity table.

    @raise Invalid_argument if a computed byte does not fit in [0..255],
    which happens when [k] is negative or too large for the six-byte form. *)
let utf32_to_utf8 k =
  (* Continuation byte carrying the six bits of [k] that start at [shift]. *)
  let cont shift = 0x80 lor ((k lsr shift) land 0x3f) in
  let encoded =
    if k < 0x80 then
      B.create 1
      |> set 0 k
    else if k < 0x800 then
      B.create 2
      |> set 0 (0xc0 lor (k lsr 6))
      |> set 1 (cont 0)
    else if k < 0x10000 then
      B.create 3
      |> set 0 (0xe0 lor (k lsr 12))
      |> set 1 (cont 6)
      |> set 2 (cont 0)
    else if k < 0x200000 then
      B.create 4
      |> set 0 (0xf0 lor (k lsr 18))
      |> set 1 (cont 12)
      |> set 2 (cont 6)
      |> set 3 (cont 0)
    else if k < 0x4000000 then
      B.create 5
      |> set 0 (0xf8 lor (k lsr 24))
      |> set 1 (cont 18)
      |> set 2 (cont 12)
      |> set 3 (cont 6)
      |> set 4 (cont 0)
    else
      B.create 6
      |> set 0 (0xfc lor (k lsr 30))
      |> set 1 (cont 24)
      |> set 2 (cont 18)
      |> set 3 (cont 12)
      |> set 4 (cont 6)
      |> set 5 (cont 0)
  in
  B.to_string encoded
(** Lookup table from an entity name (without the surrounding [&] and [;])
    to the string it decodes to.

    It is built once, at module initialisation, from two sources:
    - the positional tables below, where the [i]-th name of a table encodes
      code point [start_char + i] and empty names are skipped;
    - the [predefined] list of explicit name/value pairs.

    When a name occurs more than once the binding inserted last is the one
    kept, which matches what a lookup saw when the bindings were stacked
    with [Hashtbl.add]. *)
let decode_table =
  let ent_iso_8859_1 = [
    "nbsp"; "iexcl"; "cent"; "pound"; "curren"; "yen"; "brvbar";
    "sect"; "uml"; "copy"; "ordf"; "laquo"; "not"; "shy"; "reg";
    "macr"; "deg"; "plusmn"; "sup2"; "sup3"; "acute"; "micro";
    "para"; "middot"; "cedil"; "sup1"; "ordm"; "raquo"; "frac14";
    "frac12"; "frac34"; "iquest"; "Agrave"; "Aacute"; "Acirc";
    "Atilde"; "Auml"; "Aring"; "AElig"; "Ccedil"; "Egrave";
    "Eacute"; "Ecirc"; "Euml"; "Igrave"; "Iacute"; "Icirc";
    "Iuml"; "ETH"; "Ntilde"; "Ograve"; "Oacute"; "Ocirc"; "Otilde";
    "Ouml"; "times"; "Oslash"; "Ugrave"; "Uacute"; "Ucirc"; "Uuml";
    "Yacute"; "THORN"; "szlig"; "agrave"; "aacute"; "acirc";
    "atilde"; "auml"; "aring"; "aelig"; "ccedil"; "egrave";
    "eacute"; "ecirc"; "euml"; "igrave"; "iacute"; "icirc";
    "iuml"; "eth"; "ntilde"; "ograve"; "oacute"; "ocirc"; "otilde";
    "ouml"; "divide"; "oslash"; "ugrave"; "uacute"; "ucirc";
    "uuml"; "yacute"; "thorn"; "yuml"
  ] in
  let ent_uni_338_402 = [
    (* 338 (0x0152) *)
    "OElig"; "oelig"; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 352 (0x0160) *)
    "Scaron"; "scaron"; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 376 (0x0178) *)
    "Yuml"; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 400 (0x0190) *)
    ""; ""; "fnof"
  ] in
  let ent_uni_spacing = [
    (* 710 *)
    "circ";
    (* 711 - 730 *)
    ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
    (* 731 - 732 *)
    ""; "tilde"
  ] in
  let ent_uni_greek = [
    (* 913 *)
    "Alpha"; "Beta"; "Gamma"; "Delta"; "Epsilon"; "Zeta"; "Eta"; "Theta";
    "Iota"; "Kappa"; "Lambda"; "Mu"; "Nu"; "Xi"; "Omicron"; "Pi"; "Rho";
    ""; "Sigma"; "Tau"; "Upsilon"; "Phi"; "Chi"; "Psi"; "Omega";
    (* 938 - 944 are not mapped *)
    ""; ""; ""; ""; ""; ""; "";
    "alpha"; "beta"; "gamma"; "delta"; "epsilon"; "zeta"; "eta"; "theta";
    "iota"; "kappa"; "lambda"; "mu"; "nu"; "xi"; "omicron"; "pi"; "rho";
    "sigmaf"; "sigma"; "tau"; "upsilon"; "phi"; "chi"; "psi"; "omega";
    (* 970 - 976 are not mapped *)
    ""; ""; ""; ""; ""; ""; "";
    "thetasym"; "upsih";
    ""; ""; "";
    "piv"
  ] in
  let ent_uni_punct = [
    (* 8194 *)
    "ensp"; "emsp"; ""; ""; ""; ""; "";
    "thinsp"; ""; ""; "zwnj"; "zwj"; "lrm"; "rlm";
    ""; ""; ""; "ndash"; "mdash"; ""; ""; "";
    (* 8216 *)
    "lsquo"; "rsquo"; "sbquo"; ""; "ldquo"; "rdquo"; "bdquo"; "";
    "dagger"; "Dagger"; "bull"; ""; ""; ""; "hellip";
    ""; ""; ""; ""; ""; ""; ""; ""; ""; "permil"; "";
    (* 8242 *)
    "prime"; "Prime"; ""; ""; ""; ""; ""; "lsaquo"; "rsaquo"; "";
    ""; ""; "oline"; ""; ""; ""; ""; "";
    "frasl"
  ] in
  let ent_uni_euro = [
    "euro"
  ] in
  let ent_uni_8465_8501 = [
    (* 8465 *)
    "image"; ""; ""; ""; ""; ""; "";
    (* 8472 *)
    "weierp"; ""; ""; "";
    (* 8476 *)
    "real"; ""; ""; ""; ""; "";
    (* 8482 *)
    "trade"; ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
    (* 8501 *)
    "alefsym";
  ] in
  let ent_uni_8592_9002 = [
    (* 8592 (0x2190) *)
    "larr"; "uarr"; "rarr"; "darr"; "harr"; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8608 (0x21a0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8624 (0x21b0) *)
    ""; ""; ""; ""; ""; "crarr"; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8640 (0x21c0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8656 (0x21d0) *)
    "lArr"; "uArr"; "rArr"; "dArr"; "hArr"; "vArr"; ""; "";
    ""; ""; "lAarr"; "rAarr"; ""; "rarrw"; ""; "";
    (* 8672 (0x21e0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8704 (0x2200) *)
    "forall"; "comp"; "part"; "exist"; "nexist"; "empty"; ""; "nabla";
    "isin"; "notin"; "epsis"; "ni"; "notni"; "bepsi"; ""; "prod";
    (* 8720 (0x2210) *)
    "coprod"; "sum"; "minus"; "mnplus"; "plusdo"; ""; "setmn"; "lowast";
    "compfn"; ""; "radic"; ""; ""; "prop"; "infin"; "ang90";
    (* 8736 (0x2220) *)
    "ang"; "angmsd"; "angsph"; "mid"; "nmid"; "par"; "npar"; "and";
    "or"; "cap"; "cup"; "int"; ""; ""; "conint"; "";
    (* 8752 (0x2230) *)
    ""; ""; ""; ""; "there4"; "becaus"; ""; "";
    ""; ""; ""; ""; "sim"; "bsim"; ""; "";
    (* 8768 (0x2240) *)
    (* NOTE(review): "asymp" is listed at both 8776 and 8781 in this row; the
       later position (8781) is the one a lookup returns. Confirm against
       zend-html.cpp whether that is the intended mapping. *)
    "wreath"; "nsim"; ""; "sime"; "nsime"; "cong"; ""; "ncong";
    "asymp"; "nap"; "ape"; ""; "bcong"; "asymp"; "bump"; "bumpe";
    (* 8784 (0x2250) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8800 (0x2260) *)
    "ne"; "equiv"; ""; ""; "le"; "ge"; "lE"; "gE";
    "lnE"; "gnE"; "Lt"; "Gt"; "twixt"; ""; "nlt"; "ngt";
    (* 8816 (0x2270) *)
    "nles"; "nges"; "lsim"; "gsim"; ""; ""; "lg"; "gl";
    ""; ""; "pr"; "sc"; "cupre"; "sscue"; "prsim"; "scsim";
    (* 8832 (0x2280) *)
    "npr"; "nsc"; "sub"; "sup"; "nsub"; "nsup"; "sube"; "supe";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8848 (0x2290) *)
    ""; ""; ""; ""; ""; "oplus"; ""; "otimes";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8864 (0x22a0) *)
    ""; ""; ""; ""; ""; "perp"; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8880 (0x22b0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8896 (0x22c0) *)
    ""; ""; ""; ""; ""; "sdot"; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8912 (0x22d0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8928 (0x22e0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8944 (0x22f0) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8960 (0x2300) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    "lceil"; "rceil"; "lfloor"; "rfloor"; ""; ""; ""; "";
    (* 8976 (0x2310) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; ""; ""; ""; ""; ""; ""; "";
    (* 8992 (0x2320) *)
    ""; ""; ""; ""; ""; ""; ""; "";
    ""; "lang"; "rang"
  ] in
  let ent_uni_9674 = [
    (* 9674 *)
    "loz"
  ] in
  let ent_uni_9824_9830 = [
    (* 9824 *)
    "spades"; ""; ""; "clubs"; ""; "hearts"; "diams"
  ] in
  let utf_entity_maps = [
    { start_char = 0xa0; table = ent_iso_8859_1 };
    { start_char = 338; table = ent_uni_338_402 };
    { start_char = 710; table = ent_uni_spacing };
    { start_char = 913; table = ent_uni_greek };
    { start_char = 8194; table = ent_uni_punct };
    { start_char = 8364; table = ent_uni_euro };
    { start_char = 8465; table = ent_uni_8465_8501 };
    { start_char = 8592; table = ent_uni_8592_9002 };
    { start_char = 9674; table = ent_uni_9674 };
    { start_char = 9824; table = ent_uni_9824_9830 };
  ] in
  (* Sized up front for the few hundred names inserted below. *)
  let decode_table = Caml.Hashtbl.create 512 in
  List.iter utf_entity_maps ~f: begin fun { start_char; table } ->
    List.iteri table ~f: begin fun i entity ->
      (* An empty name is a placeholder for an unmapped code point. *)
      if String.length entity <> 0 then
        (* [replace] keeps a single binding per name; the last one wins. *)
        Caml.Hashtbl.replace decode_table entity
          (utf32_to_utf8 (start_char + i))
    end
  end;
  let predefined = [
    "quot", "\"";
    "lt", "<";
    "gt", ">";
    "amp", "&";
    "apos", "\'";
    "cloud", utf32_to_utf8 0x2601;
    "umbrella", utf32_to_utf8 0x2602;
    "snowman", utf32_to_utf8 0x2603;
    "snowflake", utf32_to_utf8 0x2745;
    "comet", utf32_to_utf8 0x2604;
    "thunderstorm", utf32_to_utf8 0x2608
  ] in
  List.iter predefined ~f:(fun (k, v) ->
    Caml.Hashtbl.replace decode_table k v);
  decode_table
(** [decode_entity s] decodes one entity [s] of the form ["&...;"].

    - Numeric form (["&#...;"]): the text between ["&#"] and [";"] is
      prefixed with ["0"] and parsed with [int_of_string], then encoded with
      [utf32_to_utf8]. The prefix turns ["&#65;"] into ["065"] (decimal) and
      ["&#x41;"] into ["0x41"] (hexadecimal). Any other syntax accepted by
      [int_of_string] after a leading ["0"] is accepted as well. A value that
      cannot be parsed or encoded yields the empty string.
    - Named form: the name between [&] and [;] is looked up in
      [decode_table]; an unknown name leaves [s] unchanged.

    [s] is expected to be a match of [entity_regex], i.e. at least three
    characters long; the assertions below check that precondition. *)
let decode_entity s =
  let len = String.length s in
  (* check if entity has shape &#...
     - if yes - this is value of utf32 codepoint
     that should be converted to utf8 *)
  if Char.equal (String.get s 1) '#'
  then begin
    assert (len >= 3);
    try
      "0" ^ String.sub s 2 (len - 3)
      |> int_of_string
      |> utf32_to_utf8
    with Failure _ | Invalid_argument _ ->
      (* malformed entity - return empty string.
         [Failure] comes from [int_of_string]; [Invalid_argument] from
         [char_of_int] when an encoded byte does not fit in 0..255. *)
      ""
  end
  else begin
    assert (len >= 2);
    (* strip & and ; from the value and lookup result in decode table *)
    String.sub s 1 (len - 2)
    |> Caml.Hashtbl.find_opt decode_table
    |> Option.value ~default:s
  end
(** [decode s] returns a copy of [s] in which every substring matched by
    [entity_regex] is replaced by the result of [decode_entity] on that
    substring. Text outside the matches is left as is. *)
let decode s =
  Str.global_substitute entity_regex
    (* The callback receives the whole subject string; the text of the
       current match is extracted from it with [Str.matched_string]. *)
    (fun subject -> decode_entity (Str.matched_string subject))
    s