Move the job of converting provided coeffects to ambient coeffects from callee to...
[hiphop-php.git] / hphp / zend / zend-html.cpp
blob42e07e464d20a1b215a0664b0da54ae7b499274d
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
#include "hphp/zend/zend-html.h"

#include <atomic>
#include <climits>

#include <unicode/uchar.h>
#include <unicode/utf8.h>

#include "hphp/util/lock.h"
#include "hphp/util/functional.h"
#include "hphp/util/hash-map.h"
26 namespace HPHP {
28 ///////////////////////////////////////////////////////////////////////////////
29 // UTF-8 entity tables
31 using namespace entity_charset_enum;
/*
 * Entity names for code page 866, indexed by (byte - 0x80); entity_map pairs
 * this table with the byte range 0x80..0xff. A nullptr slot means the byte
 * has no named entity and is skipped when the lookup maps are built.
 */
static entity_table_t ent_cp_866[] = {
  /* 0x80 */ "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  /* 0x88 */ "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  /* 0x90 */ "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  /* 0x98 */ "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  /* 0xa0 */ "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  /* 0xa8 */ "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  /* 0xb0 */ "blk14", "blk12", "blk34", "boxv", "boxvl", "boxvL", "boxVl", "boxDl",
  /* 0xb8 */ "boxdL", "boxVL", "boxV", "boxDL", "boxUL", "boxUl", "boxuL", "boxdl",
  /* 0xc0 */ "boxur", "boxhu", "boxhd", "boxvr", "boxh", "boxvh", "boxvR", "boxVr",
  /* 0xc8 */ "boxUR", "boxDR", "boxHU", "boxHD", "boxVR", "boxH", "boxVH", "boxHu",
  /* 0xd0 */ "boxhU", "boxHd", "boxhD", "boxUr", "boxuR", "boxdR", "boxDr", "boxVh",
  /* 0xd8 */ "boxvH", "boxul", "boxdr", "block", "lhblk", nullptr, nullptr, "uhblk",
  /* 0xe0 */ "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  /* 0xe8 */ "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
  /* 0xf0 */ "IOcy", "iocy", "Jukcy", "jukcy", "YIcy", "yicy", "Ubrcy", "ubrcy",
  /* 0xf8 */ "deg", nullptr, "middot", "Sqrt", "numero", "curren", nullptr, "nbsp"
};
/*
 * Entity names for code page 1251, indexed by (byte - 0x80); entity_map pairs
 * this table with the byte range 0x80..0xff. nullptr marks a byte without a
 * named entity.
 */
static entity_table_t ent_cp_1251[] = {
  /* 0x80 */ "DJcy", "GJcy", "sbquo", "gjcy", "bdquo", "hellip", "dagger", "Dagger",
  /* 0x88 */ "euro", "permil", "LJcy", "lsaquo", "NJcy", "KJcy", "TSHcy", "DZcy",
  /* 0x90 */ "djcy", "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
  /* 0x98 */ nullptr, "trade", "ljcy", "rsaquo", "njcy", "kjcy", "tshcy", "dzcy",
  /* 0xa0 */ "nbsp", "Ubrcy", "ubrcy", "Jsercy", "curren", nullptr, "brvbar", "sect",
  /* 0xa8 */ "IOcy", "copy", "Jukcy", "laquo", "not", "shy", "reg", "YIcy",
  /* 0xb0 */ "deg", "pm", "Iukcy", "iukcy", nullptr, "micro", "para", "middot",
  /* 0xb8 */ "iocy", "numero", "jukcy", "raquo", "jsercy", "DScy", "dscy", "yicy",
  /* 0xc0 */ "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  /* 0xc8 */ "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  /* 0xd0 */ "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  /* 0xd8 */ "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  /* 0xe0 */ "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  /* 0xe8 */ "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  /* 0xf0 */ "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  /* 0xf8 */ "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy"
};
71 /* codepage 1252 is a Windows extension to iso-8859-1. */
72 static entity_table_t ent_cp_1252[] = {
73 "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
74 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
75 nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
76 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
77 "oelig", nullptr, nullptr, "Yuml"
80 static entity_table_t ent_iso_8859_1[] = {
81 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
82 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
83 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
84 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
85 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
86 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
87 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
88 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
89 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
90 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
91 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
92 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
93 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
94 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
95 "uuml", "yacute", "thorn", "yuml"
/*
 * Entity names for the upper half of iso-8859-5, indexed by (byte - 0xa0);
 * entity_map pairs this table with the byte range 0xa0..0xff.
 */
static entity_table_t ent_iso_8859_5[] = {
  /* 0xa0 */ "nbsp", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy",
  /* 0xa8 */ "Jsercy", "LJcy", "NJcy", "TSHcy", "KJcy", "shy", "Ubrcy", "DZcy",
  /* 0xb0 */ "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  /* 0xb8 */ "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  /* 0xc0 */ "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  /* 0xc8 */ "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  /* 0xd0 */ "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  /* 0xd8 */ "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  /* 0xe0 */ "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  /* 0xe8 */ "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
  /* 0xf0 */ "numero", "iocy", "djcy", "gjcy", "jukcy", "dscy", "iukcy", "yicy",
  /* 0xf8 */ "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "sect", "ubrcy", "dzcy"
};
113 static entity_table_t ent_iso_8859_15[] = {
114 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
115 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
116 "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
117 "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
118 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
119 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
120 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
121 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
122 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
123 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
124 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
125 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
126 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
127 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
128 "uuml", "yacute", "thorn", "yuml"
/*
 * Entity names for code points 338..402, indexed by (code point - 338);
 * entity_map uses this table for UTF-8. nullptr marks an unnamed code point.
 */
static entity_table_t ent_uni_338_402[] = {
  /* 338 (0x0152) */
  "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 352 (0x0160) */
  "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 376 (0x0178) */
  "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 400 (0x0190) */
  nullptr, nullptr, "fnof"
};
/*
 * Entity names for code points 710..732, indexed by (code point - 710);
 * entity_map uses this table for UTF-8. Only the first and last slots are
 * named.
 */
static entity_table_t ent_uni_spacing[] = {
  /* 710 */
  "circ",
  /* 711 - 730 */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 731 - 732 */
  nullptr, "tilde"
};
/*
 * Entity names for code points 913..982, indexed by (code point - 913);
 * entity_map uses this table for UTF-8. nullptr marks an unnamed code point.
 */
static entity_table_t ent_uni_greek[] = {
  /* 913 */
  "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
  "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
  nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
  /* 938 - 944 are not mapped */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
  "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
  "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
  /* 970 - 976 are not mapped */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "thetasym", "upsih",
  nullptr, nullptr, nullptr,
  "piv"
};
/*
 * Entity names for code points 8194..8260, indexed by (code point - 8194);
 * entity_map uses this table for UTF-8. nullptr marks an unnamed code point.
 */
static entity_table_t ent_uni_punct[] = {
  /* 8194 */
  "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
  "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
  nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
  /* 8216 */
  "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
  "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
  /* 8242 */
  "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
  nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
  "frasl"
};
/* Single-entry table: entity_map maps code point 8364 to it for UTF-8. */
static entity_table_t ent_uni_euro[] = {
  "euro"
};
/*
 * Entity names for code points 8465..8501, indexed by (code point - 8465);
 * entity_map uses this table for UTF-8. nullptr marks an unnamed code point.
 */
static entity_table_t ent_uni_8465_8501[] = {
  /* 8465 */
  "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8472 */
  "weierp", nullptr, nullptr, nullptr,
  /* 8476 */
  "real", nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8482 */
  "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8501 */
  "alefsym",
};
/*
 * Entity names for code points 8592..9002, indexed by (code point - 8592);
 * entity_map uses this table for UTF-8. Rows are grouped 16 code points per
 * comment; nullptr marks an unnamed code point.
 * NOTE(review): "asymp" appears twice in the 8768 group; the lookup maps are
 * keyed by name, so the later code point wins when they are built.
 */
static entity_table_t ent_uni_8592_9002[] = {
  /* 8592 (0x2190) */
  "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8608 (0x21a0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8624 (0x21b0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8640 (0x21c0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8656 (0x21d0) */
  "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
  nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
  /* 8672 (0x21e0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8704 (0x2200) */
  "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
  "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
  /* 8720 (0x2210) */
  "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
  "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
  /* 8736 (0x2220) */
  "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
  "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
  /* 8752 (0x2230) */
  nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
  /* 8768 (0x2240) */
  "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
  "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
  /* 8784 (0x2250) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8800 (0x2260) */
  "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
  "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
  /* 8816 (0x2270) */
  "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
  nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
  /* 8832 (0x2280) */
  "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8848 (0x2290) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8864 (0x22a0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8880 (0x22b0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8896 (0x22c0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8912 (0x22d0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8928 (0x22e0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8944 (0x22f0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8960 (0x2300) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
  /* 8976 (0x2310) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8992 (0x2320) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, "lang", "rang"
};
/* Single-entry table: entity_map maps code point 9674 to it for UTF-8. */
static entity_table_t ent_uni_9674[] = {
  /* 9674 */
  "loz"
};
/*
 * Entity names for code points 9824..9830, indexed by (code point - 9824);
 * entity_map uses this table for UTF-8. nullptr marks an unnamed code point.
 */
static entity_table_t ent_uni_9824_9830[] = {
  /* 9824 */
  "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
};
/*
 * Master list tying each charset to its entity-name tables. Each row is
 * { charset, first character, last character (inclusive), table }, where
 * table[0] names the first character. A charset may appear in several rows.
 * The list ends with a cs_terminator row, which init_entity_table() uses as
 * its stop condition; html_get_entity_map() returns this array.
 */
static const struct html_entity_map entity_map[] = {
  { cs_cp866, 0x80, 0xff, ent_cp_866 },
  { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
  { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
  { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_8859_5, 0xa0, 0xff, ent_iso_8859_5 },
  { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
  { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_utf_8, 338, 402, ent_uni_338_402 },
  { cs_utf_8, 710, 732, ent_uni_spacing },
  { cs_utf_8, 913, 982, ent_uni_greek },
  { cs_utf_8, 8194, 8260, ent_uni_punct },
  { cs_utf_8, 8364, 8364, ent_uni_euro },
  { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
  { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
  { cs_utf_8, 9674, 9674, ent_uni_9674 },
  { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
  { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
  /* Missing support for these at the moment
  { cs_koi8r, 0xa3, 0xff, ent_koi8r },
  { cs_macroman, 0x0b, 0xff, ent_macroman },
  */
  { cs_terminator }
};
/*
 * Accepted charset names and the entity_charset each one selects.
 * determine_charset() compares names case-insensitively and requires an
 * exact length match. The list ends with a row whose codeset is nullptr.
 */
static const struct {
  const char *codeset;
  entity_charset charset;
} charset_map[] = {
  { "ISO-8859-1", cs_8859_1 },
  { "ISO8859-1", cs_8859_1 },
  { "ISO-8859-5", cs_8859_5 },
  { "ISO8859-5", cs_8859_5 },
  { "ISO-8859-15", cs_8859_15 },
  { "ISO8859-15", cs_8859_15 },
  { "utf-8", cs_utf_8 },
  { "cp866", cs_cp866 },
  { "866", cs_cp866 },
  { "ibm866", cs_cp866 },
  { "cp1251", cs_cp1251 },
  { "Windows-1251", cs_cp1251 },
  { "win-1251", cs_cp1251 },
  { "cp1252", cs_cp1252 },
  { "Windows-1252", cs_cp1252 },
  { "1252", cs_cp1252 },
  { "BIG5", cs_big5 },
  { "950", cs_big5 },
  { "GB2312", cs_gb2312 },
  { "936", cs_gb2312 },
  { "BIG5-HKSCS", cs_big5hkscs },
  { "Shift_JIS", cs_sjis },
  { "SJIS", cs_sjis },
  { "932", cs_sjis },
  { "EUCJP", cs_eucjp },
  /* Missing support for these at the moment
  { "EUC-JP", cs_eucjp },
  { "KOI8-R", cs_koi8r },
  { "koi8-ru", cs_koi8r },
  { "koi8r", cs_koi8r },
  { "MacRoman", cs_macroman },
  */
  { nullptr }
};
366 ///////////////////////////////////////////////////////////////////////////////
368 entity_charset determine_charset(const char *charset_hint) {
369 entity_charset charset = cs_unknown;
371 if (charset_hint == nullptr) {
372 // default to utf-8
373 return cs_utf_8;
376 size_t len = strlen(charset_hint);
378 /* now walk the charset map and look for the codeset */
379 for (int i = 0; charset_map[i].codeset; i++) {
380 if (len == strlen(charset_map[i].codeset) &&
381 strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
382 charset = charset_map[i].charset;
383 break;
387 return charset;
/*
 * Encode `k` into `buf` using the UTF-8 bit layout and NUL-terminate it.
 *
 * Returns the number of bytes written, not counting the terminator. Values
 * below 0x80 (including negative ones) are stored as a single byte. Larger
 * values use 2..6 bytes, so the 5- and 6-byte forms are produced for values
 * of 0x200000 and above. `buf` must have room for 7 bytes.
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  /* Lead-byte marker for each sequence length (index == byte count). */
  static const unsigned char lead_marker[7] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  };

  int nbytes;
  if (k < 0x80) {
    nbytes = 1;
  } else if (k < 0x800) {
    nbytes = 2;
  } else if (k < 0x10000) {
    nbytes = 3;
  } else if (k < 0x200000) {
    nbytes = 4;
  } else if (k < 0x4000000) {
    nbytes = 5;
  } else {
    nbytes = 6;
  }

  if (nbytes == 1) {
    buf[0] = k;
  } else {
    /* Fill continuation bytes from the tail, six payload bits at a time. */
    int bits = k;
    for (int i = nbytes - 1; i > 0; i--) {
      buf[i] = 0x80 | (bits & 0x3f);
      bits >>= 6;
    }
    /* Whatever is left belongs in the lead byte. */
    buf[0] = lead_marker[nbytes] | bits;
  }

  buf[nbytes] = '\0';
  return nbytes;
}
432 using HtmlEntityMap = hphp_const_char_map<std::string>;
434 static volatile bool EntityMapInited = false;
435 static Mutex EntityMapMutex;
436 static HtmlEntityMap EntityMap[cs_end];
437 static HtmlEntityMap XHPEntityMap[cs_end];
439 static void init_entity_table() {
440 for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
441 const html_entity_map &em = entity_map[i];
442 const entity_charset charset = entity_map[i].charset;
444 int index = 0;
445 for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
446 const char *entity = em.table[index];
447 if (entity == nullptr) {
448 continue;
450 unsigned char buf[10];
451 switch (charset) {
452 case cs_8859_1:
453 case cs_cp1252:
454 case cs_8859_15:
455 case cs_cp1251:
456 case cs_8859_5:
457 case cs_cp866:
458 case cs_koi8r:
459 buf[0] = ch;
460 buf[1] = '\0';
461 break;
463 case cs_utf_8:
464 utf32_to_utf8(buf, ch);
465 break;
467 default:
468 continue;
470 EntityMap[charset][entity] = (const char *)buf;
471 XHPEntityMap[charset][entity] = (const char *)buf;
474 EntityMap[charset]["quot"] = "\"";
475 EntityMap[charset]["lt"] = "<";
476 EntityMap[charset]["gt"] = ">";
477 EntityMap[charset]["amp"] = "&";
479 XHPEntityMap[charset]["quot"] = "\"";
480 XHPEntityMap[charset]["lt"] = "<";
481 XHPEntityMap[charset]["gt"] = ">";
482 XHPEntityMap[charset]["amp"] = "&";
483 // XHP-specific entities
484 XHPEntityMap[charset]["apos"] = "\'";
485 XHPEntityMap[charset]["cloud"] = u8"\u2601";
486 XHPEntityMap[charset]["umbrella"] = u8"\u2602";
487 XHPEntityMap[charset]["snowman"] = u8"\u2603";
488 XHPEntityMap[charset]["snowflake"] = u8"\u2745";
489 XHPEntityMap[charset]["comet"] = u8"\u2604";
490 XHPEntityMap[charset]["thunderstorm"] = u8"\u2608";
493 // the first element is an empty table
494 EntityMap[cs_terminator]["quot"] = "\"";
495 EntityMap[cs_terminator]["lt"] = "<";
496 EntityMap[cs_terminator]["gt"] = ">";
497 EntityMap[cs_terminator]["amp"] = "&";
498 // XHP-specific entities
499 XHPEntityMap[cs_terminator]["apos"] = "\'";
500 XHPEntityMap[cs_terminator]["cloud"] = u8"\u2601";
501 XHPEntityMap[cs_terminator]["umbrella"] = u8"\u2602";
502 XHPEntityMap[cs_terminator]["snowman"] = u8"\u2603";
503 XHPEntityMap[cs_terminator]["snowflake"] = u8"\u2745";
504 XHPEntityMap[cs_terminator]["comet"] = u8"\u2604";
505 XHPEntityMap[cs_terminator]["thunderstorm"] = u8"\u2608";
508 ///////////////////////////////////////////////////////////////////////////////
509 inline static bool decode_entity(char *entity, int *len,
510 bool decode_double_quote,
511 bool decode_single_quote,
512 entity_charset charset, bool all,
513 bool xhp = false) {
514 // entity is 16 bytes, allocated statically below
515 // default in PHP
516 assert(entity && *entity);
517 if (entity[0] == '#') {
518 int code;
519 if (entity[1] == 'x' || entity[1] == 'X') {
520 if (!isxdigit(entity[2])) return false;
521 code = strtol(entity + 2, nullptr, 16);
522 } else {
523 if (!isdigit(entity[1])) return false;
524 code = strtol(entity + 1, nullptr, 10);
527 // since we don't support multibyte chars other than utf-8
528 int l = 1;
530 if (code == 39 && decode_single_quote) {
531 entity[0] = code;
532 entity[1] = '\0';
533 *len = l;
534 return true;
537 if (!all && (code != '&') &&
538 (code != '<') && (code != '>') &&
539 (code != '"') && (code != '\'')) {
540 // htmlspecialchars_decode() does not parse numeric
541 // entities other than & < > " '
542 return false;
545 switch (charset) {
546 case cs_utf_8:
548 unsigned char buf[10];
549 int size = utf32_to_utf8(buf, code);
550 memcpy(entity, buf, size + 1);
551 l = size;
552 break;
555 case cs_8859_1:
556 case cs_8859_5:
557 case cs_8859_15:
558 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
559 return false;
560 } else {
561 if (code == 39) {
562 return false;
564 entity[0] = code;
565 entity[1] = '\0';
567 break;
569 case cs_cp1252:
570 case cs_cp1251:
571 case cs_cp866:
572 if (code > 0xff) {
573 return false;
575 entity[0] = code;
576 entity[1] = '\0';
577 break;
579 case cs_big5:
580 case cs_big5hkscs:
581 case cs_sjis:
582 case cs_eucjp:
583 if (code >= 0x80) {
584 return false;
586 entity[0] = code;
587 entity[1] = '\0';
588 break;
590 case cs_gb2312:
591 if (code >= 0x81) {
592 return false;
594 entity[0] = code;
595 entity[1] = '\0';
596 break;
598 default:
599 return false;
600 break;
602 *len = l;
603 return true;
604 } else {
605 HtmlEntityMap *entityMap;
607 if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
608 return false;
611 if (all) {
612 entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
613 } else {
614 entityMap = xhp ? &XHPEntityMap[cs_terminator]
615 : &EntityMap[cs_terminator];
617 HtmlEntityMap::const_iterator iter = entityMap->find(entity);
618 if (iter != entityMap->end()) {
619 memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
620 *len = iter->second.length();
621 return true;
625 return false;
628 inline static bool encode_entity(char* buf, int* buflen,
629 const char* entity, bool utf8) {
630 entity_charset charset = cs_utf_8;
631 if (!utf8){ charset = cs_8859_1; }
633 HtmlEntityMap *entityMap = &EntityMap[charset];
635 for(HtmlEntityMap::const_iterator iter = entityMap->begin();
636 iter != entityMap->end(); iter++) {
637 if (strcmp(iter->second.c_str(), entity) == 0) {
638 memcpy(buf, iter->first, strlen(iter->first));
639 *buflen = strlen(iter->first);
640 return true;
643 return false;
646 char *string_html_encode(const char *input, int &len,
647 const int64_t qsBitmask, bool utf8,
648 bool dEncode, bool htmlEnt) {
649 assert(input);
651 * Though seems to be wasting memory a lot, we have to realize most of the
652 * time this function is called with small strings, or fragments of HTMLs.
653 * Allocating/deallocating anything less than 1K is trivial these days, and
654 * we want avoid string copying as much as possible. Of course, the return
655 * char * is really sent back at large, occupying unnessary space for
656 * potentially longer time than we need, we have to realize the two closest
657 * solutions are not that much better, either:
659 * 1. pre-calculate size by iterating through the string once: too time
660 * consuming;
661 * 2. take a guess and double buffer size when over: still wasting, and
662 * it may not save that much.
664 * Note: Amount of allocation per character to be encoded may have to be
665 * increased as larger HTML Entities are implemented.
667 char *ret = (char *)malloc(len * 14uL + 1);
668 if (!ret) {
669 return nullptr;
671 char *q = ret;
672 for (const char *p = input, *end = input + len; p < end; p++) {
673 unsigned char c = *p;
674 char entity[5];
675 int codeLength = 0;
676 switch (c) {
677 case '"':
678 if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE)) {
679 *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
680 } else {
681 *q++ = c;
683 break;
684 case '\'':
685 if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE)) {
686 *q++ = '&';
687 if ((qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_XML1))) {
688 *q++ = 'a'; *q++ = 'p'; *q++ = 'o'; *q++ = 's';
689 } else {
690 *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9';
692 *q++ = ';';
693 } else {
694 *q++ = c;
696 break;
697 case '<':
698 *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
699 break;
700 case '>':
701 *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
702 break;
703 case '&':
704 if (!dEncode) {
705 p++;
707 html_get_entity_map();
709 bool found = false;
710 for (const char *t = p; *t; t++) {
711 if (*t == ';') {
712 int l = t - p;
713 if (l > 0) {
714 char sbuf[16] = {0};
715 char *buf;
716 if (l > 10) {
717 buf = (char* )malloc(l + 1);
718 } else {
719 buf = sbuf;
721 memcpy(buf, p, l);
722 buf[l] = '\0';
723 if (decode_entity(buf, &l, true, true,
724 cs_utf_8, true)) {
725 found = true;
726 *q++ = '&';
727 for(const char *s = p; s <= t; s++) {
728 *q++ = *s;
730 p = t;
732 if (buf != sbuf) {
733 free(buf);
736 break;
739 if (!found) {
740 p--;
741 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
743 } else {
744 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
746 break;
747 case static_cast<unsigned char>('\xc2'):
748 if (htmlEnt && utf8 && p != end && *(p+1) == '\xa0') {
749 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
750 p++;
751 break;
754 // fallthrough
755 default: {
756 if (LIKELY(c < 0x80)) {
757 *q++ = c;
758 break;
759 } else if (htmlEnt && !utf8 && (c - 160) < sizeof(ent_iso_8859_1) - 1) {
761 * https://github.com/facebook/hhvm/issues/2186
762 * If not UTF8, and we are converting to HTML entities, use known
763 * entity equivalent of the character, if possible.
764 * Since we only support ISO-8859-1 or UTF8 right now, and they use
765 * the same mapping array, use it.
766 * Start at 0xA0 = 160
768 *q++ = '&';
769 const char *s = ent_iso_8859_1[c - 160];
770 int len = strlen(s);
771 for (int n = 0; n < len; n++) {
772 *q++ = *s++;
774 *q++ = ';';
775 break;
778 bool should_skip =
779 qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
780 bool should_replace =
781 qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
783 if (!utf8 && should_skip) {
784 *q++ = c;
785 break;
788 auto avail = end - p;
789 auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
790 auto utf8_lead = [](unsigned char c) {
791 return c < 0x80 || (c >= 0xC2 && c <= 0xF4);
794 // This has to be a macro since it needs to be able to break away from
795 // the for loop we're in.
796 // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
797 // \uFFFD is Unicode Replacement Character (U+FFFD)
798 #define UTF8_ERROR_IF_LEN(cond, len) \
799 if (cond) { \
800 p += (len) - 1; \
801 if (should_skip) { break; } \
802 else if (should_replace) { strcpy(q, u8"\uFFFD"); q += 3; break; } \
803 else { goto exit_error; } \
806 #define UTF8_ERROR_IF(cond) UTF8_ERROR_IF_LEN(cond, 1)
808 if (utf8) {
809 if (c < 0xc2) {
810 UTF8_ERROR_IF(true);
811 } else if (c < 0xe0) {
812 UTF8_ERROR_IF(avail < 2);
813 UTF8_ERROR_IF_LEN(!utf8_trail(*(p + 1)), utf8_lead(*(p + 1)) ? 1 : 2);
815 uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
816 UTF8_ERROR_IF_LEN(tc < 0x80, 2); // non-shortest form
818 codeLength = 2;
819 entity[0] = *p;
820 entity[1] = *(p + 1);
821 entity[2] = '\0';
822 } else if (c < 0xf0) {
823 if (avail < 3 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2))) {
824 UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
825 UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
826 UTF8_ERROR_IF_LEN(true, 3);
829 uint32_t tc = ((c & 0x0f) << 12) |
830 ((*(p+1) & 0x3f) << 6) |
831 (*(p+2) & 0x3f);
832 UTF8_ERROR_IF_LEN(tc < 0x800, 3); // non-shortest form
833 UTF8_ERROR_IF_LEN(tc >= 0xd800 && tc <= 0xdfff, 3); // surrogate
835 codeLength = 3;
836 entity[0] = *p;
837 entity[1] = *(p + 1);
838 entity[2] = *(p + 2);
839 entity[3] = '\0';
840 } else if (c < 0xf5) {
841 if (avail < 4 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2)) ||
842 !utf8_trail(*(p + 3))) {
843 UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
844 UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
845 UTF8_ERROR_IF_LEN(avail < 4 || utf8_lead(*(p + 3)), 3);
846 UTF8_ERROR_IF_LEN(true, 4);
849 uint32_t tc = ((c & 0x07) << 18) |
850 ((*(p+1) & 0x3f) << 12) |
851 ((*(p+2) & 0x3f) << 6) |
852 (*(p+3) & 0x3f);
854 // non-shortest form or outside range
855 UTF8_ERROR_IF_LEN(tc < 0x10000 || tc > 0x10ffff, 4);
857 codeLength = 4;
858 entity[0] = *p;
859 entity[1] = *(p + 1);
860 entity[2] = *(p + 2);
861 entity[3] = *(p + 3);
862 entity[4] = '\0';
863 } else {
864 UTF8_ERROR_IF(true);
866 } else {
867 codeLength = 1;
868 entity[0] = *p;
869 entity[1] = '\0';
872 if (htmlEnt) {
873 html_get_entity_map();
875 char buf[16] = {0};
876 buf[0] = c;
877 int len = 1;
879 if (encode_entity(buf, &len, const_cast<char*>(entity), utf8)) {
880 *q++ = '&';
881 const char *s = buf;
882 for (int n = 0; n < len; n++) {
883 *q++ = *s++;
885 *q++ = ';';
886 } else {
887 memcpy(q, p, codeLength);
888 q += codeLength;
890 } else {
891 memcpy(q, p, codeLength);
892 q += codeLength;
894 p += codeLength - 1;
896 break;
902 #undef UTF8_ERROR_IF
903 #undef UTF8_ERROR_IF_LEN
905 if (q - ret > INT_MAX) {
906 goto exit_error;
908 *q = 0;
909 len = q - ret;
910 return ret;
912 exit_error:
913 free(ret);
914 return nullptr;
917 char *string_html_encode_extra(const char *input, int &len,
918 StringHtmlEncoding flags,
919 const AsciiMap *asciiMap) {
920 assert(input);
922 * Though seems to be wasting memory a lot, we have to realize most of the
923 * time this function is called with small strings, or fragments of HTMLs.
924 * Allocating/deallocating anything less than 1K is trivial these days, and
925 * we want avoid string copying as much as possible. Of course, the return
926 * char * is really sent back at large, occupying unnessary space for
927 * potentially longer time than we need, we have to realize the two closest
928 * solutions are not that much better, either:
930 * 1. pre-calculate size by iterating through the string once: too time
931 * consuming;
932 * 2. take a guess and double buffer size when over: still wasting, and
933 * it may not save that much.
935 char *ret = (char *)malloc(len * 8uL + 1);
936 if (!ret) {
937 return nullptr;
939 char *q = ret;
940 const char *rep = u8"\ufffd";
941 int32_t srcPosBytes;
942 for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
943 unsigned char c = input[srcPosBytes];
944 if (c && c < 128) {
945 srcPosBytes++; // Optimize US-ASCII case
946 if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
947 switch (c) {
948 case '"':
949 *q++ = '&'; *q++ = 'q'; *q++ = 'u';
950 *q++ = 'o'; *q++ = 't'; *q++ = ';';
951 break;
952 case '\'':
953 *q++ = '&'; *q++ = '#'; *q++ = '0';
954 *q++ = '3'; *q++ = '9'; *q++ = ';';
955 break;
956 case '<':
957 *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
958 break;
959 case '>':
960 *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
961 break;
962 case '&':
963 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
964 break;
965 default:
966 *q++ = '&'; *q++ = '#';
967 *q++ = c >= 100 ? '1' : '0';
968 *q++ = ((c / 10) % 10) + '0';
969 *q++ = (c % 10) + '0';
970 *q++ = ';';
971 break;
973 } else {
974 *q++ = c;
976 } else if (flags & STRING_HTML_ENCODE_UTF8) {
977 UChar32 curCodePoint;
978 U8_NEXT(input, srcPosBytes, len, curCodePoint);
979 if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
980 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
981 } else if (curCodePoint <= 0) {
982 if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
983 if (flags & STRING_HTML_ENCODE_HIGH) {
984 *q++ = '&'; *q++ = '#'; *q++ = 'x';
985 *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
986 *q++ = ';';
987 } else {
988 const char *r = rep;
989 while (*r) *q++ = *r++;
992 } else if (flags & STRING_HTML_ENCODE_HIGH) {
993 q += sprintf(q, "&#x%x;", curCodePoint);
994 } else {
995 int32_t pos = 0;
996 U8_APPEND_UNSAFE(q, pos, curCodePoint);
997 q += pos;
999 } else {
1000 srcPosBytes++; // Optimize US-ASCII case
1001 if (c == 0xa0) {
1002 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
1003 } else if (flags & STRING_HTML_ENCODE_HIGH) {
1004 *q++ = '&'; *q++ = '#';
1005 *q++ = c >= 200 ? '2' : '1';
1006 *q++ = ((c / 10) % 10) + '0';
1007 *q++ = (c % 10) + '0';
1008 *q++ = ';';
1009 } else {
1010 *q++ = c;
1014 if (q - ret > INT_MAX) {
1015 free(ret);
1016 return nullptr;
1018 *q = 0;
1019 len = q - ret;
1020 return ret;
1023 char *string_html_decode(const char *input, int &len,
1024 bool decode_double_quote, bool decode_single_quote,
1025 const char *charset_hint, bool all,
1026 bool xhp /* = false */) {
1027 assert(input);
1029 if (!EntityMapInited) {
1030 Lock lock(EntityMapMutex);
1031 if (!EntityMapInited) {
1032 init_entity_table();
1033 EntityMapInited = true;
1037 entity_charset charset = determine_charset(charset_hint);
1038 if (charset == cs_unknown) {
1039 return nullptr;
1042 char *ret = (char *)malloc(len + 1);
1043 char *q = ret;
1044 for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
1045 char ch = *p;
1046 if (ch != '&') {
1047 *q++ = ch;
1048 continue;
1050 p++;
1052 bool found = false;
1053 for (const char *t = p; *t; t++) {
1054 if (*t == ';') {
1055 int l = t - p;
1056 if (l > 0) {
1057 char sbuf[16] = {0};
1058 char *buf;
1059 if (l > 10) {
1060 buf = (char* )malloc(l + 1);
1061 } else {
1062 buf = sbuf;
1064 memcpy(buf, p, l);
1065 buf[l] = '\0';
1066 if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
1067 charset, all, xhp)) {
1068 memcpy(q, buf, l);
1069 found = true;
1070 p = t;
1071 q += l;
1073 if (buf != sbuf) {
1074 free(buf);
1077 break;
1080 if (!found) {
1081 p--;
1082 *q++ = '&'; // not an entity
1085 *q = '\0';
1086 len = q - ret;
1087 return ret;
1090 const html_entity_map* html_get_entity_map() {
1091 if (!EntityMapInited) {
1092 Lock lock(EntityMapMutex);
1093 if (!EntityMapInited) {
1094 init_entity_table();
1095 EntityMapInited = true;
1098 return entity_map;
1101 ///////////////////////////////////////////////////////////////////////////////