2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
17 #include "hphp/zend/zend-html.h"
#include <atomic>
19 #include <unicode/uchar.h>
20 #include <unicode/utf8.h>
22 #include "hphp/util/lock.h"
23 #include "hphp/util/functional.h"
24 #include "hphp/util/hash-map.h"
28 ///////////////////////////////////////////////////////////////////////////////
29 // UTF-8 entity tables
31 using namespace entity_charset_enum
;
33 static entity_table_t ent_cp_866
[] = {
34 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
35 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
36 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
37 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
38 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
39 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
40 "blk14", "blk12", "blk34", "boxv", "boxvl", "boxvL", "boxVl", "boxDl",
41 "boxdL", "boxVL", "boxV", "boxDL", "boxUL", "boxUl", "boxuL", "boxdl",
42 "boxur", "boxhu", "boxhd", "boxvr", "boxh", "boxvh", "boxvR", "boxVr",
43 "boxUR", "boxDR", "boxHU", "boxHD", "boxVR", "boxH", "boxVH", "boxHu",
44 "boxhU", "boxHd", "boxhD", "boxUr", "boxuR", "boxdR", "boxDr", "boxVh",
45 "boxvH", "boxul", "boxdr", "block", "lhblk", nullptr, nullptr, "uhblk",
46 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
47 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
48 "IOcy", "iocy", "Jukcy", "jukcy", "YIcy", "yicy", "Ubrcy", "ubrcy",
49 "deg", nullptr, "middot", "Sqrt", "numero", "curren", nullptr, "nbsp"
52 static entity_table_t ent_cp_1251
[] = {
53 "DJcy", "GJcy", "sbquo", "gjcy", "bdquo", "hellip", "dagger", "Dagger",
54 "euro", "permil", "LJcy", "lsaquo", "NJcy", "KJcy", "TSHcy", "DZcy",
55 "djcy", "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
56 nullptr, "trade", "ljcy", "rsaquo", "njcy", "kjcy", "tshcy", "dzcy",
57 "nbsp", "Ubrcy", "ubrcy", "Jsercy", "curren", nullptr, "brvbar", "sect",
58 "IOcy", "copy", "Jukcy", "laquo", "not", "shy", "reg", "YIcy",
59 "deg", "pm", "Iukcy", "iukcy", nullptr, "micro", "para", "middot",
60 "iocy", "numero", "jukcy", "raquo", "jsercy", "DScy", "dscy", "yicy",
61 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
62 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
63 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
64 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
65 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
66 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
67 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
68 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy"
71 /* codepage 1252 is a Windows extension to iso-8859-1. */
72 static entity_table_t ent_cp_1252
[] = {
73 "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
74 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
75 nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
76 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
77 "oelig", nullptr, nullptr, "Yuml"
80 static entity_table_t ent_iso_8859_1
[] = {
81 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
82 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
83 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
84 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
85 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
86 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
87 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
88 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
89 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
90 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
91 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
92 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
93 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
94 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
95 "uuml", "yacute", "thorn", "yuml"
98 static entity_table_t ent_iso_8859_5
[] = {
99 "nbsp", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy",
100 "Jsercy", "LJcy", "NJcy", "TSHcy", "KJcy", "shy", "Ubrcy", "DZcy",
101 "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
102 "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
103 "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
104 "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
105 "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
106 "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
107 "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
108 "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
109 "numero", "iocy", "djcy", "gjcy", "jukcy", "dscy", "iukcy", "yicy",
110 "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "sect", "ubrcy", "dzcy"
113 static entity_table_t ent_iso_8859_15
[] = {
114 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
115 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
116 "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
117 "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
118 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
119 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
120 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
121 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
122 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
123 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
124 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
125 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
126 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
127 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
128 "uuml", "yacute", "thorn", "yuml"
131 static entity_table_t ent_uni_338_402
[] = {
133 "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
134 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
136 "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
137 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
138 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
140 "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
141 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
142 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
144 nullptr, nullptr, "fnof"
147 static entity_table_t ent_uni_spacing
[] = {
151 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
152 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
157 static entity_table_t ent_uni_greek
[] = {
159 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
160 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
161 nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
162 /* 938 - 944 are not mapped */
163 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
164 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
165 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
166 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
167 /* 970 - 976 are not mapped */
168 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
170 nullptr, nullptr, nullptr,
174 static entity_table_t ent_uni_punct
[] = {
176 "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
177 "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
178 nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
180 "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
181 "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
182 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
184 "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
185 nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
189 static entity_table_t ent_uni_euro
[] = {
193 static entity_table_t ent_uni_8465_8501
[] = {
195 "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
197 "weierp", nullptr, nullptr, nullptr,
199 "real", nullptr, nullptr, nullptr, nullptr, nullptr,
201 "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
202 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
207 static entity_table_t ent_uni_8592_9002
[] = {
209 "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
210 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
212 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
213 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
215 nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
216 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
218 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
219 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
221 "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
222 nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
224 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
225 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
226 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
227 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
229 "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
230 "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
232 "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
233 "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
235 "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
236 "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
238 nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
239 nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
241 "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
242 "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
244 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
245 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
247 "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
248 "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
250 "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
251 nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
253 "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
254 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
256 nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
257 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
259 nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
260 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
262 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
263 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
265 nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
266 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
268 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
269 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
271 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
272 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
274 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
275 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
277 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
278 "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
280 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
281 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
283 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
284 nullptr, "lang", "rang"
287 static entity_table_t ent_uni_9674
[] = {
292 static entity_table_t ent_uni_9824_9830
[] = {
294 "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
297 static const struct html_entity_map entity_map
[] = {
298 { cs_cp866
, 0x80, 0xff, ent_cp_866
},
299 { cs_cp1251
, 0x80, 0xff, ent_cp_1251
},
300 { cs_cp1252
, 0x80, 0x9f, ent_cp_1252
},
301 { cs_cp1252
, 0xa0, 0xff, ent_iso_8859_1
},
302 { cs_8859_1
, 0xa0, 0xff, ent_iso_8859_1
},
303 { cs_8859_5
, 0xa0, 0xff, ent_iso_8859_5
},
304 { cs_8859_15
, 0xa0, 0xff, ent_iso_8859_15
},
305 { cs_utf_8
, 0xa0, 0xff, ent_iso_8859_1
},
306 { cs_utf_8
, 338, 402, ent_uni_338_402
},
307 { cs_utf_8
, 710, 732, ent_uni_spacing
},
308 { cs_utf_8
, 913, 982, ent_uni_greek
},
309 { cs_utf_8
, 8194, 8260, ent_uni_punct
},
310 { cs_utf_8
, 8364, 8364, ent_uni_euro
},
311 { cs_utf_8
, 8465, 8501, ent_uni_8465_8501
},
312 { cs_utf_8
, 8592, 9002, ent_uni_8592_9002
},
313 { cs_utf_8
, 9674, 9674, ent_uni_9674
},
314 { cs_utf_8
, 9824, 9830, ent_uni_9824_9830
},
315 { cs_big5
, 0xa0, 0xff, ent_iso_8859_1
},
316 { cs_gb2312
, 0xa0, 0xff, ent_iso_8859_1
},
317 { cs_big5hkscs
, 0xa0, 0xff, ent_iso_8859_1
},
318 { cs_sjis
, 0xa0, 0xff, ent_iso_8859_1
},
319 { cs_eucjp
, 0xa0, 0xff, ent_iso_8859_1
},
320 /* Missing support for these at the moment
321 { cs_koi8r, 0xa3, 0xff, ent_koi8r },
322 { cs_macroman, 0x0b, 0xff, ent_macroman },
327 static const struct {
329 entity_charset charset
;
331 { "ISO-8859-1", cs_8859_1
},
332 { "ISO8859-1", cs_8859_1
},
333 { "ISO-8859-5", cs_8859_5
},
334 { "ISO8859-5", cs_8859_5
},
335 { "ISO-8859-15", cs_8859_15
},
336 { "ISO8859-15", cs_8859_15
},
337 { "utf-8", cs_utf_8
},
338 { "cp866", cs_cp866
},
340 { "ibm866", cs_cp866
},
341 { "cp1251", cs_cp1251
},
342 { "Windows-1251", cs_cp1251
},
343 { "win-1251", cs_cp1251
},
344 { "cp1252", cs_cp1252
},
345 { "Windows-1252", cs_cp1252
},
346 { "1252", cs_cp1252
},
349 { "GB2312", cs_gb2312
},
350 { "936", cs_gb2312
},
351 { "BIG5-HKSCS", cs_big5hkscs
},
352 { "Shift_JIS", cs_sjis
},
355 { "EUCJP", cs_eucjp
},
356 /* Missing support for these at the moment
357 { "EUC-JP", cs_eucjp },
358 { "KOI8-R", cs_koi8r },
359 { "koi8-ru", cs_koi8r },
360 { "koi8r", cs_koi8r },
361 { "MacRoman", cs_macroman },
366 ///////////////////////////////////////////////////////////////////////////////
// Maps a charset name to its entity_charset value.
//
// charset_hint: NUL-terminated charset name, or nullptr.
//
// The lookup walks charset_map until it reaches an entry whose codeset is
// null. An entry matches when its name has the same length as the hint and
// compares equal ignoring case; `charset` starts as cs_unknown and is
// overwritten with the matching entry's value.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably `charset` is returned after the loop -- confirm.
368 entity_charset
determine_charset(const char *charset_hint
) {
369 entity_charset charset
= cs_unknown
;
// NOTE(review): the body of this null-hint branch is not visible here, so
// the value produced for a null hint must be confirmed against the full file.
371 if (charset_hint
== nullptr) {
// Measured once so entries of a different length are rejected before any
// character comparison.
376 size_t len
= strlen(charset_hint
);
378 /* now walk the charset map and look for the codeset */
379 for (int i
= 0; charset_map
[i
].codeset
; i
++) {
// Equal length plus a length-bounded case-insensitive compare amounts to a
// whole-string, case-insensitive match.
380 if (len
== strlen(charset_map
[i
].codeset
) &&
381 strncasecmp(charset_hint
, charset_map
[i
].codeset
, len
) == 0) {
382 charset
= charset_map
[i
].charset
;
/*
 * Encodes the code point `k` into `buf` using the UTF-8 bit layout and
 * appends a terminating NUL.
 *
 * buf: destination; must have room for at least 7 bytes (up to six encoded
 *      bytes plus the terminator). Callers in this file pass a 10-byte array.
 * k:   code point to encode.
 *
 * Returns the number of encoded bytes written, not counting the terminator.
 * decode_entity() depends on both the return value and the terminator when
 * it copies `size + 1` bytes out of `buf`.
 *
 * Caveats kept from the original behaviour:
 *  - values of 0x200000 and above use the legacy 5- and 6-byte forms, and
 *    values between 0x110000 and 0x1FFFFF still get a 4-byte form; none of
 *    these are valid UTF-8 under RFC 3629, so callers that accept arbitrary
 *    numeric input should range-check first;
 *  - a negative `k` takes the single-byte branch and stores only its low
 *    eight bits.
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  int retval = 0;

  if (k < 0x80) {
    /* ASCII (and, see above, negative inputs): one byte, stored as-is. */
    buf[0] = (unsigned char)k;
    retval = 1;
  } else if (k < 0x800) {
    buf[0] = (unsigned char)(0xc0 | (k >> 6));
    buf[1] = (unsigned char)(0x80 | (k & 0x3f));
    retval = 2;
  } else if (k < 0x10000) {
    buf[0] = (unsigned char)(0xe0 | (k >> 12));
    buf[1] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
    buf[2] = (unsigned char)(0x80 | (k & 0x3f));
    retval = 3;
  } else if (k < 0x200000) {
    buf[0] = (unsigned char)(0xf0 | (k >> 18));
    buf[1] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
    buf[2] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
    buf[3] = (unsigned char)(0x80 | (k & 0x3f));
    retval = 4;
  } else if (k < 0x4000000) {
    buf[0] = (unsigned char)(0xf8 | (k >> 24));
    buf[1] = (unsigned char)(0x80 | ((k >> 18) & 0x3f));
    buf[2] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
    buf[3] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
    buf[4] = (unsigned char)(0x80 | (k & 0x3f));
    retval = 5;
  } else {
    buf[0] = (unsigned char)(0xfc | (k >> 30));
    buf[1] = (unsigned char)(0x80 | ((k >> 24) & 0x3f));
    buf[2] = (unsigned char)(0x80 | ((k >> 18) & 0x3f));
    buf[3] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
    buf[4] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
    buf[5] = (unsigned char)(0x80 | (k & 0x3f));
    retval = 6;
  }
  /* Terminate so the result can be used directly as a C string. */
  buf[retval] = '\0';

  return retval;
}
432 using HtmlEntityMap
= hphp_const_char_map
<std::string
>;
434 static volatile bool EntityMapInited
= false;
435 static Mutex EntityMapMutex
;
436 static HtmlEntityMap EntityMap
[cs_end
];
437 static HtmlEntityMap XHPEntityMap
[cs_end
];
// Fills EntityMap and XHPEntityMap from the static entity_map rows.
//
// For every row up to the cs_terminator sentinel, each code in
// [basechar, endchar] with a non-null name is stored in both maps for that
// row's charset. Afterwards the basic entities (quot, lt, gt, amp) and the
// XHP-only names are added.
// NOTE(review): several lines of this function (including the declaration of
// `index` and the branch that selects how `buf` is filled) are not visible in
// this excerpt.
439 static void init_entity_table() {
440 for (unsigned int i
= 0; entity_map
[i
].charset
!= cs_terminator
; i
++) {
441 const html_entity_map
&em
= entity_map
[i
];
442 const entity_charset charset
= entity_map
[i
].charset
;
// `ch` is the character code and `index` the matching slot in em.table; the
// two advance together.
445 for (int ch
= em
.basechar
; ch
<= em
.endchar
; ch
++, index
++) {
446 const char *entity
= em
.table
[index
];
// Null table slots mark codes that have no named entity.
447 if (entity
== nullptr) {
450 unsigned char buf
[10];
// NOTE(review): the condition guarding this UTF-8 encoding step is not
// visible here -- presumably it applies to the UTF-8 charset only; confirm.
464 utf32_to_utf8(buf
, ch
);
// The value is copied into a std::string, so `buf` may go out of scope.
470 EntityMap
[charset
][entity
] = (const char *)buf
;
471 XHPEntityMap
[charset
][entity
] = (const char *)buf
;
// Basic entities, added for the current charset in both maps.
474 EntityMap
[charset
]["quot"] = "\"";
475 EntityMap
[charset
]["lt"] = "<";
476 EntityMap
[charset
]["gt"] = ">";
477 EntityMap
[charset
]["amp"] = "&";
479 XHPEntityMap
[charset
]["quot"] = "\"";
480 XHPEntityMap
[charset
]["lt"] = "<";
481 XHPEntityMap
[charset
]["gt"] = ">";
482 XHPEntityMap
[charset
]["amp"] = "&";
483 // XHP-specific entities
484 XHPEntityMap
[charset
]["apos"] = "\'";
485 XHPEntityMap
[charset
]["cloud"] = (const char *)u8
"\u2601";
486 XHPEntityMap
[charset
]["umbrella"] = (const char *)u8
"\u2602";
487 XHPEntityMap
[charset
]["snowman"] = (const char *)u8
"\u2603";
488 XHPEntityMap
[charset
]["snowflake"] = (const char *)u8
"\u2745";
489 XHPEntityMap
[charset
]["comet"] = (const char *)u8
"\u2604";
490 XHPEntityMap
[charset
]["thunderstorm"] = (const char *)u8
"\u2608";
// The cs_terminator slot is the map decode_entity() selects when `all` is
// false, so it carries only the basic entities.
// NOTE(review): unlike the per-charset code above, XHPEntityMap[cs_terminator]
// receives only the XHP-specific names below and never quot/lt/gt/amp --
// confirm whether XHP decoding with all == false is meant to skip those.
493 // the first element is an empty table
494 EntityMap
[cs_terminator
]["quot"] = "\"";
495 EntityMap
[cs_terminator
]["lt"] = "<";
496 EntityMap
[cs_terminator
]["gt"] = ">";
497 EntityMap
[cs_terminator
]["amp"] = "&";
498 // XHP-specific entities
499 XHPEntityMap
[cs_terminator
]["apos"] = "\'";
500 XHPEntityMap
[cs_terminator
]["cloud"] = (const char *)u8
"\u2601";
501 XHPEntityMap
[cs_terminator
]["umbrella"] = (const char *)u8
"\u2602";
502 XHPEntityMap
[cs_terminator
]["snowman"] = (const char *)u8
"\u2603";
503 XHPEntityMap
[cs_terminator
]["snowflake"] = (const char *)u8
"\u2745";
504 XHPEntityMap
[cs_terminator
]["comet"] = (const char *)u8
"\u2604";
505 XHPEntityMap
[cs_terminator
]["thunderstorm"] = (const char *)u8
"\u2608";
508 ///////////////////////////////////////////////////////////////////////////////
// Decodes one entity body (the text between '&' and ';') in place.
//
// entity: in/out buffer; on success it is overwritten with the decoded bytes
//         followed by a NUL.
// len:    out; receives the decoded length on success.
// decode_double_quote / decode_single_quote: whether quot / code 39 may be
//         decoded.
// charset: target charset for the decoded bytes.
// all:    when false only the basic entities are considered.
// NOTE(review): the final parameter (`xhp`, used below) and the return
// statements of the success paths are not visible in this excerpt.
509 inline static bool decode_entity(char *entity
, int *len
,
510 bool decode_double_quote
,
511 bool decode_single_quote
,
512 entity_charset charset
, bool all
,
514 // entity is 16 bytes, allocated statically below
516 assert(entity
&& *entity
);
// Numeric character reference: "#NNN" (decimal) or "#xHHH" (hex).
517 if (entity
[0] == '#') {
// NOTE(review): `entity` is a plain `char *`; passing a negative char value
// to isxdigit()/isdigit() is undefined behaviour in C and C++. Consider
// casting the argument to unsigned char.
519 if (entity
[1] == 'x' || entity
[1] == 'X') {
520 if (!isxdigit(entity
[2])) return false;
// NOTE(review): strtol returns long and the declaration of `code` is not
// visible here; verify that out-of-range values cannot be narrowed silently.
521 code
= strtol(entity
+ 2, nullptr, 16);
523 if (!isdigit(entity
[1])) return false;
524 code
= strtol(entity
+ 1, nullptr, 10);
527 // since we don't support multibyte chars other than utf-8
// 39 is the code of the single quote character.
530 if (code
== 39 && decode_single_quote
) {
537 if (!all
&& (code
!= '&') &&
538 (code
!= '<') && (code
!= '>') &&
539 (code
!= '"') && (code
!= '\'')) {
540 // htmlspecialchars_decode() does not parse numeric
541 // entities other than & < > " '
// UTF-8 output: encode into a scratch buffer, then copy the bytes and the
// terminator (size + 1) back over the entity text.
548 unsigned char buf
[10];
549 int size
= utf32_to_utf8(buf
, code
);
550 memcpy(entity
, buf
, size
+ 1);
// NOTE(review): the branch this range test belongs to is not visible; it
// singles out 0x80-0x9f and everything above 0xff.
558 if ((code
>= 0x80 && code
< 0xa0) || code
> 0xff) {
// Named entity: look the name up in the per-charset map.
605 HtmlEntityMap
*entityMap
;
// NOTE(review): only the first four characters are compared, so any name
// that merely begins with "quot" (in any letter case) also matches.
607 if (strncasecmp(entity
, "quot", 4) == 0 && !decode_double_quote
) {
// Two candidate maps: the full table for `charset`, or the cs_terminator
// slot that holds only the basic set. NOTE(review): the condition choosing
// between them is not visible here -- presumably `all`; confirm.
612 entityMap
= xhp
? &XHPEntityMap
[charset
] : &EntityMap
[charset
];
614 entityMap
= xhp
? &XHPEntityMap
[cs_terminator
]
615 : &EntityMap
[cs_terminator
];
617 HtmlEntityMap::const_iterator iter
= entityMap
->find(entity
);
618 if (iter
!= entityMap
->end()) {
// Overwrite the name with its replacement bytes, including the NUL.
619 memcpy(entity
, iter
->second
.c_str(), iter
->second
.length() + 1);
620 *len
= iter
->second
.length();
// Reverse lookup: finds the entity name whose replacement bytes equal
// `entity`.
//
// buf:    out; receives the entity name. Only the name's characters are
//         copied -- no NUL terminator is written.
// buflen: out; receives the length of the name.
// entity: NUL-terminated character bytes to look up.
// utf8:   selects the cs_utf_8 table when true, cs_8859_1 otherwise.
// NOTE(review): the return statements are not visible in this excerpt.
628 inline static bool encode_entity(char* buf
, int* buflen
,
629 const char* entity
, bool utf8
) {
630 entity_charset charset
= cs_utf_8
;
631 if (!utf8
){ charset
= cs_8859_1
; }
633 HtmlEntityMap
*entityMap
= &EntityMap
[charset
];
// The map is keyed by name, so matching on the value needs a linear scan.
635 for(HtmlEntityMap::const_iterator iter
= entityMap
->begin();
636 iter
!= entityMap
->end(); iter
++) {
637 if (strcmp(iter
->second
.c_str(), entity
) == 0) {
638 memcpy(buf
, iter
->first
, strlen(iter
->first
));
639 *buflen
= strlen(iter
->first
);
646 char *string_html_encode(const char *input
, int &len
,
647 const int64_t qsBitmask
, bool utf8
,
648 bool dEncode
, bool htmlEnt
) {
651 * Though seems to be wasting memory a lot, we have to realize most of the
652 * time this function is called with small strings, or fragments of HTMLs.
653 * Allocating/deallocating anything less than 1K is trivial these days, and
654 * we want to avoid string copying as much as possible. Of course, the return
655 * char * is really sent back at large, occupying unnecessary space for
656 * potentially longer time than we need, we have to realize the two closest
657 * solutions are not that much better, either:
659 * 1. pre-calculate size by iterating through the string once: too time
661 * 2. take a guess and double buffer size when over: still wasting, and
662 * it may not save that much.
664 * Note: Amount of allocation per character to be encoded may have to be
665 * increased as larger HTML Entities are implemented.
667 char *ret
= (char *)malloc(len
* 14uL + 1);
672 for (const char *p
= input
, *end
= input
+ len
; p
< end
; p
++) {
673 unsigned char c
= *p
;
678 if (qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE
)) {
679 *q
++ = '&'; *q
++ = 'q'; *q
++ = 'u'; *q
++ = 'o'; *q
++ = 't'; *q
++ = ';';
685 if (qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE
)) {
687 if ((qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_XML1
))) {
688 *q
++ = 'a'; *q
++ = 'p'; *q
++ = 'o'; *q
++ = 's';
690 *q
++ = '#'; *q
++ = '0'; *q
++ = '3'; *q
++ = '9';
698 *q
++ = '&'; *q
++ = 'l'; *q
++ = 't'; *q
++ = ';';
701 *q
++ = '&'; *q
++ = 'g'; *q
++ = 't'; *q
++ = ';';
707 html_get_entity_map();
710 for (const char *t
= p
; *t
; t
++) {
717 buf
= (char* )malloc(l
+ 1);
723 if (decode_entity(buf
, &l
, true, true,
727 for(const char *s
= p
; s
<= t
; s
++) {
741 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
744 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
747 case static_cast<unsigned char>('\xc2'):
748 if (htmlEnt
&& utf8
&& p
!= end
&& *(p
+1) == '\xa0') {
749 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
756 if (LIKELY(c
< 0x80)) {
} else if (htmlEnt && !utf8 && c >= 160 &&
           static_cast<size_t>(c - 160) <
             sizeof(ent_iso_8859_1) / sizeof(ent_iso_8859_1[0])) {
  // The bound is the table's element count. The previous form compared
  // against the table's size in bytes and rejected c < 160 only because the
  // negative difference wrapped when converted to an unsigned type; the
  // accepted byte values are unchanged.
761 * https://github.com/facebook/hhvm/issues/2186
762 * If not UTF8, and we are converting to HTML entities, use known
763 * entity equivalent of the character, if possible.
764 * Since we only support ISO-8859-1 or UTF8 right now, and they use
765 * the same mapping array, use it.
766 * Start at 0xA0 = 160
769 const char *s
= ent_iso_8859_1
[c
- 160];
771 for (int n
= 0; n
< len
; n
++) {
779 qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE
);
780 bool should_replace
=
781 qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE
);
783 if (!utf8
&& should_skip
) {
788 auto avail
= end
- p
;
789 auto utf8_trail
= [](unsigned char c
) { return c
>= 0x80 && c
<= 0xbf; };
790 auto utf8_lead
= [](unsigned char c
) {
791 return c
< 0x80 || (c
>= 0xC2 && c
<= 0xF4);
794 // This has to be a macro since it needs to be able to break away from
795 // the for loop we're in.
796 // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
797 // \uFFFD is Unicode Replacement Character (U+FFFD)
798 #define UTF8_ERROR_IF_LEN(cond, len) \
801 if (should_skip) { break; } \
802 else if (should_replace) { strcpy(q, (const char *)u8"\uFFFD"); q += 3; break; } \
803 else { goto exit_error; } \
806 #define UTF8_ERROR_IF(cond) UTF8_ERROR_IF_LEN(cond, 1)
811 } else if (c
< 0xe0) {
812 UTF8_ERROR_IF(avail
< 2);
813 UTF8_ERROR_IF_LEN(!utf8_trail(*(p
+ 1)), utf8_lead(*(p
+ 1)) ? 1 : 2);
815 uint16_t tc
= ((c
& 0x1f) << 6) | (p
[1] & 0x3f);
816 UTF8_ERROR_IF_LEN(tc
< 0x80, 2); // non-shortest form
820 entity
[1] = *(p
+ 1);
822 } else if (c
< 0xf0) {
823 if (avail
< 3 || !utf8_trail(*(p
+ 1)) || !utf8_trail(*(p
+ 2))) {
824 UTF8_ERROR_IF_LEN(avail
< 2 || utf8_lead(*(p
+ 1)), 1);
825 UTF8_ERROR_IF_LEN(avail
< 3 || utf8_lead(*(p
+ 2)), 2);
826 UTF8_ERROR_IF_LEN(true, 3);
829 uint32_t tc
= ((c
& 0x0f) << 12) |
830 ((*(p
+1) & 0x3f) << 6) |
832 UTF8_ERROR_IF_LEN(tc
< 0x800, 3); // non-shortest form
833 UTF8_ERROR_IF_LEN(tc
>= 0xd800 && tc
<= 0xdfff, 3); // surrogate
837 entity
[1] = *(p
+ 1);
838 entity
[2] = *(p
+ 2);
840 } else if (c
< 0xf5) {
841 if (avail
< 4 || !utf8_trail(*(p
+ 1)) || !utf8_trail(*(p
+ 2)) ||
842 !utf8_trail(*(p
+ 3))) {
843 UTF8_ERROR_IF_LEN(avail
< 2 || utf8_lead(*(p
+ 1)), 1);
844 UTF8_ERROR_IF_LEN(avail
< 3 || utf8_lead(*(p
+ 2)), 2);
845 UTF8_ERROR_IF_LEN(avail
< 4 || utf8_lead(*(p
+ 3)), 3);
846 UTF8_ERROR_IF_LEN(true, 4);
849 uint32_t tc
= ((c
& 0x07) << 18) |
850 ((*(p
+1) & 0x3f) << 12) |
851 ((*(p
+2) & 0x3f) << 6) |
854 // non-shortest form or outside range
855 UTF8_ERROR_IF_LEN(tc
< 0x10000 || tc
> 0x10ffff, 4);
859 entity
[1] = *(p
+ 1);
860 entity
[2] = *(p
+ 2);
861 entity
[3] = *(p
+ 3);
873 html_get_entity_map();
879 if (encode_entity(buf
, &len
, const_cast<char*>(entity
), utf8
)) {
882 for (int n
= 0; n
< len
; n
++) {
887 memcpy(q
, p
, codeLength
);
891 memcpy(q
, p
, codeLength
);
903 #undef UTF8_ERROR_IF_LEN
905 if (q
- ret
> INT_MAX
) {
917 char *string_html_encode_extra(const char *input
, int &len
,
918 StringHtmlEncoding flags
,
919 const AsciiMap
*asciiMap
) {
922 * Though seems to be wasting memory a lot, we have to realize most of the
923 * time this function is called with small strings, or fragments of HTMLs.
924 * Allocating/deallocating anything less than 1K is trivial these days, and
925 * we want to avoid string copying as much as possible. Of course, the return
926 * char * is really sent back at large, occupying unnecessary space for
927 * potentially longer time than we need, we have to realize the two closest
928 * solutions are not that much better, either:
930 * 1. pre-calculate size by iterating through the string once: too time
932 * 2. take a guess and double buffer size when over: still wasting, and
933 * it may not save that much.
935 char *ret
= (char *)malloc(len
* 8uL + 1);
940 const char *rep
= (const char *)u8
"\ufffd";
942 for (srcPosBytes
= 0; srcPosBytes
< len
; /* incremented in-loop */) {
943 unsigned char c
= input
[srcPosBytes
];
945 srcPosBytes
++; // Optimize US-ASCII case
946 if ((asciiMap
->map
[c
& 64 ? 1 : 0] >> (c
& 63)) & 1) {
949 *q
++ = '&'; *q
++ = 'q'; *q
++ = 'u';
950 *q
++ = 'o'; *q
++ = 't'; *q
++ = ';';
953 *q
++ = '&'; *q
++ = '#'; *q
++ = '0';
954 *q
++ = '3'; *q
++ = '9'; *q
++ = ';';
957 *q
++ = '&'; *q
++ = 'l'; *q
++ = 't'; *q
++ = ';';
960 *q
++ = '&'; *q
++ = 'g'; *q
++ = 't'; *q
++ = ';';
963 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
966 *q
++ = '&'; *q
++ = '#';
967 *q
++ = c
>= 100 ? '1' : '0';
968 *q
++ = ((c
/ 10) % 10) + '0';
969 *q
++ = (c
% 10) + '0';
} else if (flags & STRING_HTML_ENCODE_UTF8) {
  // U8_NEXT decodes one UTF-8 sequence starting at srcPosBytes, advances
  // srcPosBytes past it and stores the resulting code point (negative on
  // malformed input) in curCodePoint.
  UChar32 curCodePoint;
  U8_NEXT(input, srcPosBytes, len, curCodePoint);
  // No-break space is code point U+00A0. 0xC2A0 is its UTF-8 byte pair, not
  // a value U8_NEXT can produce for it, so the previous comparison never
  // matched a real NBSP and instead matched the unrelated U+C2A0.
  if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xA0) {
    *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
981 } else if (curCodePoint
<= 0) {
982 if (flags
& STRING_HTML_ENCODE_UTF8IZE_REPLACE
) {
983 if (flags
& STRING_HTML_ENCODE_HIGH
) {
984 *q
++ = '&'; *q
++ = '#'; *q
++ = 'x';
985 *q
++ = 'f'; *q
++ = 'f'; *q
++ = 'f'; *q
++ = 'd';
989 while (*r
) *q
++ = *r
++;
992 } else if (flags
& STRING_HTML_ENCODE_HIGH
) {
993 q
+= sprintf(q
, "&#x%x;", curCodePoint
);
996 U8_APPEND_UNSAFE(q
, pos
, curCodePoint
);
1000 srcPosBytes
++; // Optimize US-ASCII case
1002 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
1003 } else if (flags
& STRING_HTML_ENCODE_HIGH
) {
1004 *q
++ = '&'; *q
++ = '#';
1005 *q
++ = c
>= 200 ? '2' : '1';
1006 *q
++ = ((c
/ 10) % 10) + '0';
1007 *q
++ = (c
% 10) + '0';
1014 if (q
- ret
> INT_MAX
) {
// Decodes HTML entities found in `input` into a newly malloc'ed buffer.
//
// input, len: source bytes and their count.
// decode_double_quote, decode_single_quote, all, xhp: passed unchanged to
//         decode_entity().
// charset_hint: charset name, resolved through determine_charset().
// NOTE(review): the return statements, the update of `len`, and the cleanup
// paths are not visible in this excerpt.
1023 char *string_html_decode(const char *input
, int &len
,
1024 bool decode_double_quote
, bool decode_single_quote
,
1025 const char *charset_hint
, bool all
,
1026 bool xhp
/* = false */) {
// One-time table setup: the flag is tested, EntityMapMutex is taken, and the
// flag is tested again before init_entity_table() runs.
// html_get_entity_map() performs the same sequence.
1029 if (!EntityMapInited
) {
1030 Lock
lock(EntityMapMutex
);
1031 if (!EntityMapInited
) {
1032 init_entity_table();
1033 EntityMapInited
= true;
1037 entity_charset charset
= determine_charset(charset_hint
);
// NOTE(review): the handling of an unrecognised charset is not visible here.
1038 if (charset
== cs_unknown
) {
// Result buffer of len + 1 bytes.
// NOTE(review): no check of the malloc result is visible in this excerpt.
1042 char *ret
= (char *)malloc(len
+ 1);
// The `p - input < len` clause keeps the walk going across embedded NUL
// bytes until len bytes have been visited.
1044 for (const char *p
= input
; *p
|| UNLIKELY(p
- input
< len
); p
++) {
// Forward scan from p; unlike the outer loop it stops at the first NUL byte.
1053 for (const char *t
= p
; *t
; t
++) {
// Zero-filled stack buffer for the candidate entity text.
1057 char sbuf
[16] = {0};
// Heap buffer of l + 1 bytes.
// NOTE(review): the condition that selects this over sbuf, and the matching
// free, are outside the visible lines.
1060 buf
= (char* )malloc(l
+ 1);
// decode_entity() rewrites buf in place and stores the decoded length in l.
1066 if (decode_entity(buf
, &l
, decode_double_quote
, decode_single_quote
,
1067 charset
, all
, xhp
)) {
1082 *q
++ = '&'; // not an entity
// Makes sure the entity lookup maps have been built.
//
// The flag is tested first without the lock; if it is unset, EntityMapMutex
// is taken and the flag is tested again so that init_entity_table() runs only
// once even when several callers arrive together.
// NOTE(review): the return statement is not visible in this excerpt; given
// the return type it presumably hands back the static entity_map array --
// confirm.
1090 const html_entity_map
* html_get_entity_map() {
1091 if (!EntityMapInited
) {
1092 Lock
lock(EntityMapMutex
);
1093 if (!EntityMapInited
) {
1094 init_entity_table();
// Set only after the tables are complete.
1095 EntityMapInited
= true;
1101 ///////////////////////////////////////////////////////////////////////////////