Remove deprecated slice() and kvzip() methods
[hiphop-php.git] / hphp / zend / zend-html.cpp
bloba354d85ee9656871fc7055f156d515fbf75fd96c
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
17 #include "hphp/zend/zend-html.h"
19 #include <unicode/uchar.h>
20 #include <unicode/utf8.h>
22 #include "hphp/util/lock.h"
24 namespace HPHP {
26 ///////////////////////////////////////////////////////////////////////////////
27 // UTF-8 entity tables
29 using namespace entity_charset_enum;
31 /* codepage 1252 is a Windows extension to iso-8859-1. */
32 static entity_table_t ent_cp_1252[] = {
33 "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
34 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
35 nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
36 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
37 "oelig", nullptr, nullptr, "Yuml"
40 static entity_table_t ent_iso_8859_1[] = {
41 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
42 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
43 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
44 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
45 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
46 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
47 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
48 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
49 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
50 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
51 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
52 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
53 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
54 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
55 "uuml", "yacute", "thorn", "yuml"
58 static entity_table_t ent_iso_8859_15[] = {
59 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
60 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
61 "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
62 "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
63 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
64 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
65 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
66 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
67 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
68 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
69 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
70 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
71 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
72 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
73 "uuml", "yacute", "thorn", "yuml"
76 static entity_table_t ent_uni_338_402[] = {
77 /* 338 (0x0152) */
78 "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
79 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
80 /* 352 (0x0160) */
81 "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
82 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
83 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
84 /* 376 (0x0178) */
85 "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
86 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
87 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
88 /* 400 (0x0190) */
89 nullptr, nullptr, "fnof"
92 static entity_table_t ent_uni_spacing[] = {
93 /* 710 */
94 "circ",
95 /* 711 - 730 */
96 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
97 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
98 /* 731 - 732 */
99 nullptr, "tilde"
102 static entity_table_t ent_uni_greek[] = {
103 /* 913 */
104 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
105 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
106 nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
107 /* 938 - 944 are not mapped */
108 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
109 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
110 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
111 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
112 /* 970 - 976 are not mapped */
113 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
114 "thetasym", "upsih",
115 nullptr, nullptr, nullptr,
116 "piv"
119 static entity_table_t ent_uni_punct[] = {
120 /* 8194 */
121 "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
122 "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
123 nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
124 /* 8216 */
125 "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
126 "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
127 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
128 /* 8242 */
129 "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
130 nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
131 "frasl"
134 static entity_table_t ent_uni_euro[] = {
135 "euro"
138 static entity_table_t ent_uni_8465_8501[] = {
139 /* 8465 */
140 "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
141 /* 8472 */
142 "weierp", nullptr, nullptr, nullptr,
143 /* 8476 */
144 "real", nullptr, nullptr, nullptr, nullptr, nullptr,
145 /* 8482 */
146 "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
147 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
148 /* 8501 */
149 "alefsym",
152 static entity_table_t ent_uni_8592_9002[] = {
153 /* 8592 (0x2190) */
154 "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
155 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
156 /* 8608 (0x21a0) */
157 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
158 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
159 /* 8624 (0x21b0) */
160 nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
161 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
162 /* 8640 (0x21c0) */
163 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
164 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
165 /* 8656 (0x21d0) */
166 "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
167 nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
168 /* 8672 (0x21e0) */
169 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
170 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
171 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
172 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
173 /* 8704 (0x2200) */
174 "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
175 "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
176 /* 8720 (0x2210) */
177 "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
178 "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
179 /* 8736 (0x2220) */
180 "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
181 "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
182 /* 8752 (0x2230) */
183 nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
184 nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
185 /* 8768 (0x2240) */
186 "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
187 "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
188 /* 8784 (0x2250) */
189 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
190 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
191 /* 8800 (0x2260) */
192 "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
193 "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
194 /* 8816 (0x2270) */
195 "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
196 nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
197 /* 8832 (0x2280) */
198 "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
199 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
200 /* 8848 (0x2290) */
201 nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
202 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
203 /* 8864 (0x22a0) */
204 nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
205 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
206 /* 8880 (0x22b0) */
207 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
208 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
209 /* 8896 (0x22c0) */
210 nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
211 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
212 /* 8912 (0x22d0) */
213 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
214 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
215 /* 8928 (0x22e0) */
216 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
217 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
218 /* 8944 (0x22f0) */
219 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
220 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
221 /* 8960 (0x2300) */
222 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
223 "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
224 /* 8976 (0x2310) */
225 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
226 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
227 /* 8992 (0x2320) */
228 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
229 nullptr, "lang", "rang"
232 static entity_table_t ent_uni_9674[] = {
233 /* 9674 */
234 "loz"
237 static entity_table_t ent_uni_9824_9830[] = {
238 /* 9824 */
239 "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
242 static const struct html_entity_map entity_map[] = {
243 { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
244 { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
245 { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
246 { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
247 { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
248 { cs_utf_8, 338, 402, ent_uni_338_402 },
249 { cs_utf_8, 710, 732, ent_uni_spacing },
250 { cs_utf_8, 913, 982, ent_uni_greek },
251 { cs_utf_8, 8194, 8260, ent_uni_punct },
252 { cs_utf_8, 8364, 8364, ent_uni_euro },
253 { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
254 { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
255 { cs_utf_8, 9674, 9674, ent_uni_9674 },
256 { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
257 { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
258 { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
259 { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
260 { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
261 { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
262 /* Missing support for these at the moment
263 { cs_koi8r, 0xa3, 0xff, ent_koi8r },
264 { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
265 { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
266 { cs_cp866, 0xc0, 0xff, ent_cp_866 },
267 { cs_macroman, 0x0b, 0xff, ent_macroman },
269 { cs_terminator }
272 static const struct {
273 const char *codeset;
274 entity_charset charset;
275 } charset_map[] = {
276 { "ISO-8859-1", cs_8859_1 },
277 { "ISO8859-1", cs_8859_1 },
278 { "ISO-8859-15", cs_8859_15 },
279 { "ISO8859-15", cs_8859_15 },
280 { "utf-8", cs_utf_8 },
281 { "cp1252", cs_cp1252 },
282 { "Windows-1252", cs_cp1252 },
283 { "1252", cs_cp1252 },
284 { "BIG5", cs_big5 },
285 { "950", cs_big5 },
286 { "GB2312", cs_gb2312 },
287 { "936", cs_gb2312 },
288 { "BIG5-HKSCS", cs_big5hkscs },
289 { "Shift_JIS", cs_sjis },
290 { "SJIS", cs_sjis },
291 { "932", cs_sjis },
292 { "EUCJP", cs_eucjp },
293 /* Missing support for these at the moment
294 { "EUC-JP", cs_eucjp },
295 { "KOI8-R", cs_koi8r },
296 { "koi8-ru", cs_koi8r },
297 { "koi8r", cs_koi8r },
298 { "cp1251", cs_cp1251 },
299 { "Windows-1251", cs_cp1251 },
300 { "win-1251", cs_cp1251 },
301 { "iso8859-5", cs_8859_5 },
302 { "iso-8859-5", cs_8859_5 },
303 { "cp866", cs_cp866 },
304 { "866", cs_cp866 },
305 { "ibm866", cs_cp866 },
306 { "MacRoman", cs_macroman },
308 { nullptr }
311 ///////////////////////////////////////////////////////////////////////////////
313 entity_charset determine_charset(const char *charset_hint) {
314 entity_charset charset = cs_unknown;
316 if (charset_hint == nullptr) {
317 // default to utf-8
318 return cs_utf_8;
321 size_t len = strlen(charset_hint);
323 /* now walk the charset map and look for the codeset */
324 for (int i = 0; charset_map[i].codeset; i++) {
325 if (len == strlen(charset_map[i].codeset) &&
326 strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
327 charset = charset_map[i].charset;
328 break;
332 return charset;
/**
 * Writes the UTF-8 style encoding of code point k into buf, followed by a
 * terminating NUL byte.
 *
 * The legacy 5- and 6-byte forms are still produced for values of
 * 0x200000 and above. Values below 0x80 (which includes negative values)
 * are stored as a single byte.
 *
 * @param buf  destination; must have room for up to 7 bytes.
 * @param k    value to encode.
 * @return number of bytes written, not counting the terminator (1..6).
 */
static int utf32_to_utf8(unsigned char *buf, int k) {
  /* Lead-byte marker, indexed by total sequence length. */
  static const unsigned char lead_marker[] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  };

  if (k < 0x80) {
    buf[0] = k;
    buf[1] = '\0';
    return 1;
  }

  int nbytes;
  if (k < 0x800) {
    nbytes = 2;
  } else if (k < 0x10000) {
    nbytes = 3;
  } else if (k < 0x200000) {
    nbytes = 4;
  } else if (k < 0x4000000) {
    nbytes = 5;
  } else {
    nbytes = 6;
  }

  /* Fill continuation bytes from the tail, six payload bits at a time. */
  int rest = k;
  for (int i = nbytes - 1; i > 0; --i) {
    buf[i] = 0x80 | (rest & 0x3f);
    rest >>= 6;
  }
  /* Whatever bits remain belong in the lead byte. */
  buf[0] = lead_marker[nbytes] | rest;
  buf[nbytes] = '\0';

  return nbytes;
}
377 using HtmlEntityMap = hphp_hash_map<const char*,std::string,cstr_hash,eqstr>;
379 static volatile bool EntityMapInited = false;
380 static Mutex EntityMapMutex;
381 static HtmlEntityMap EntityMap[cs_end];
382 static HtmlEntityMap XHPEntityMap[cs_end];
384 static void init_entity_table() {
385 for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
386 const html_entity_map &em = entity_map[i];
387 const entity_charset charset = entity_map[i].charset;
389 int index = 0;
390 for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
391 const char *entity = em.table[index];
392 if (entity == nullptr) {
393 continue;
395 unsigned char buf[10];
396 switch (charset) {
397 case cs_8859_1:
398 case cs_cp1252:
399 case cs_8859_15:
400 case cs_cp1251:
401 case cs_8859_5:
402 case cs_cp866:
403 case cs_koi8r:
404 buf[0] = ch;
405 buf[1] = '\0';
406 break;
408 case cs_utf_8:
409 utf32_to_utf8(buf, ch);
410 break;
412 default:
413 continue;
415 EntityMap[charset][entity] = (const char *)buf;
416 XHPEntityMap[charset][entity] = (const char *)buf;
419 EntityMap[charset]["quot"] = "\"";
420 EntityMap[charset]["lt"] = "<";
421 EntityMap[charset]["gt"] = ">";
422 EntityMap[charset]["amp"] = "&";
424 XHPEntityMap[charset]["quot"] = "\"";
425 XHPEntityMap[charset]["lt"] = "<";
426 XHPEntityMap[charset]["gt"] = ">";
427 XHPEntityMap[charset]["amp"] = "&";
428 // XHP-specific entities
429 XHPEntityMap[charset]["apos"] = "\'";
430 XHPEntityMap[charset]["cloud"] = "\u2601";
431 XHPEntityMap[charset]["umbrella"] = "\u2602";
432 XHPEntityMap[charset]["snowman"] = "\u2603";
433 XHPEntityMap[charset]["snowflake"] = "\u2745";
434 XHPEntityMap[charset]["comet"] = "\u2604";
435 XHPEntityMap[charset]["thunderstorm"] = "\u2608";
438 // the first element is an empty table
439 EntityMap[cs_terminator]["quot"] = "\"";
440 EntityMap[cs_terminator]["lt"] = "<";
441 EntityMap[cs_terminator]["gt"] = ">";
442 EntityMap[cs_terminator]["amp"] = "&";
443 // XHP-specific entities
444 XHPEntityMap[cs_terminator]["apos"] = "\'";
445 XHPEntityMap[cs_terminator]["cloud"] = "\u2601";
446 XHPEntityMap[cs_terminator]["umbrella"] = "\u2602";
447 XHPEntityMap[cs_terminator]["snowman"] = "\u2603";
448 XHPEntityMap[cs_terminator]["snowflake"] = "\u2745";
449 XHPEntityMap[cs_terminator]["comet"] = "\u2604";
450 XHPEntityMap[cs_terminator]["thunderstorm"] = "\u2608";
453 ///////////////////////////////////////////////////////////////////////////////
454 inline static bool decode_entity(char *entity, int *len,
455 bool decode_double_quote,
456 bool decode_single_quote,
457 entity_charset charset, bool all,
458 bool xhp = false) {
459 // entity is 16 bytes, allocated statically below
460 // default in PHP
461 assert(entity && *entity);
462 if (entity[0] == '#') {
463 int code;
464 if (entity[1] == 'x' || entity[1] == 'X') {
465 code = strtol(entity + 2, nullptr, 16);
466 } else {
467 code = strtol(entity + 1, nullptr, 10);
470 // since we don't support multibyte chars other than utf-8
471 int l = 1;
473 if (code == 39 && decode_single_quote) {
474 entity[0] = code;
475 entity[1] = '\0';
476 *len = l;
477 return true;
480 switch (charset) {
481 case cs_utf_8:
483 unsigned char buf[10];
484 int size = utf32_to_utf8(buf, code);
485 memcpy(entity, buf, size + 1);
486 l = size;
487 break;
490 case cs_8859_1:
491 case cs_8859_5:
492 case cs_8859_15:
493 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
494 return false;
495 } else {
496 if (code == 39) {
497 return false;
499 entity[0] = code;
500 entity[1] = '\0';
502 break;
504 case cs_cp1252:
505 case cs_cp1251:
506 case cs_cp866:
507 if (code > 0xff) {
508 return false;
510 entity[0] = code;
511 entity[1] = '\0';
512 break;
514 case cs_big5:
515 case cs_big5hkscs:
516 case cs_sjis:
517 case cs_eucjp:
518 if (code >= 0x80) {
519 return false;
521 entity[0] = code;
522 entity[1] = '\0';
523 break;
525 case cs_gb2312:
526 if (code >= 0x81) {
527 return false;
529 entity[0] = code;
530 entity[1] = '\0';
531 break;
533 default:
534 return false;
535 break;
537 *len = l;
538 return true;
539 } else {
540 HtmlEntityMap *entityMap;
542 if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
543 return false;
546 if (all) {
547 entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
548 } else {
549 entityMap = xhp ? &XHPEntityMap[cs_terminator]
550 : &EntityMap[cs_terminator];
552 HtmlEntityMap::const_iterator iter = entityMap->find(entity);
553 if (iter != entityMap->end()) {
554 memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
555 *len = iter->second.length();
556 return true;
560 return false;
563 inline static bool encode_entity(char* buf, int* buflen,
564 const char* entity, bool utf8) {
565 entity_charset charset = cs_utf_8;
566 if (!utf8){ charset = cs_8859_1; }
568 HtmlEntityMap *entityMap = &EntityMap[charset];
570 for(HtmlEntityMap::const_iterator iter = entityMap->begin();
571 iter != entityMap->end(); iter++) {
572 if (strcmp(iter->second.c_str(), entity) == 0) {
573 memcpy(buf, iter->first, strlen(iter->first));
574 *buflen = strlen(iter->first);
575 return true;
578 return false;
581 char *string_html_encode(const char *input, int &len,
582 const int64_t qsBitmask, bool utf8,
583 bool dEncode, bool htmlEnt) {
584 assert(input);
586 * Though seems to be wasting memory a lot, we have to realize most of the
587 * time this function is called with small strings, or fragments of HTMLs.
588 * Allocating/deallocating anything less than 1K is trivial these days, and
589 * we want avoid string copying as much as possible. Of course, the return
590 * char * is really sent back at large, occupying unnessary space for
591 * potentially longer time than we need, we have to realize the two closest
592 * solutions are not that much better, either:
594 * 1. pre-calculate size by iterating through the string once: too time
595 * consuming;
596 * 2. take a guess and double buffer size when over: still wasting, and
597 * it may not save that much.
599 * Note: Amount of allocation per character to be encoded may have to be
600 * increased as larger HTML Entities are implemented.
602 char *ret = (char *)malloc(len * 14uL + 1);
603 if (!ret) {
604 return nullptr;
606 char *q = ret;
607 for (const char *p = input, *end = input + len; p < end; p++) {
608 unsigned char c = *p;
609 char entity[5];
610 int codeLength = 0;
611 switch (c) {
612 case '"':
613 if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE)) {
614 *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
615 } else {
616 *q++ = c;
618 break;
619 case '\'':
620 if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE)) {
621 *q++ = '&'; *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9'; *q++ = ';';
622 } else {
623 *q++ = c;
625 break;
626 case '<':
627 *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
628 break;
629 case '>':
630 *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
631 break;
632 case '&':
633 if (!dEncode) {
634 p++;
636 html_get_entity_map();
638 bool found = false;
639 for (const char *t = p; *t; t++) {
640 if (*t == ';') {
641 int l = t - p;
642 if (l > 0) {
643 char sbuf[16] = {0};
644 char *buf;
645 if (l > 10) {
646 buf = (char* )malloc(l + 1);
647 } else {
648 buf = sbuf;
650 memcpy(buf, p, l);
651 buf[l] = '\0';
652 if (decode_entity(buf, &l, true, true,
653 cs_utf_8, true)) {
654 found = true;
655 *q++ = '&';
656 for(const char *s = p; s <= t; s++) {
657 *q++ = *s;
659 p = t;
661 if (buf != sbuf) {
662 free(buf);
665 break;
668 if (!found) {
669 p--;
670 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
672 } else {
673 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
675 break;
676 case static_cast<unsigned char>('\xc2'):
677 if (htmlEnt && utf8 && p != end && *(p+1) == '\xa0') {
678 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
679 p++;
680 break;
683 // fallthrough
684 default: {
685 if (LIKELY(c < 0x80)) {
686 *q++ = c;
687 break;
690 bool should_skip =
691 qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
692 bool should_replace =
693 qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
695 if (!utf8 && should_skip) {
696 break;
699 auto avail = end - p;
700 auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
702 // This has to be a macro since it needs to be able to break away from
703 // the for loop we're in.
704 // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
705 // \uFFFD is Unicode Replacement Character (U+FFFD)
706 #define UTF8_ERROR_IF(cond) \
707 if (cond) { \
708 if (should_skip) { break; } \
709 else if (should_replace) { strcpy(q, "\uFFFD"); q += 3; break; } \
710 else { goto exit_error; } \
713 if (utf8) {
714 if (c < 0xc2) {
715 UTF8_ERROR_IF(true);
716 } else if (c < 0xe0) {
717 UTF8_ERROR_IF(avail < 2 || !utf8_trail(*(p + 1)));
719 uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
720 UTF8_ERROR_IF(tc < 0x80); // non-shortest form
722 codeLength = 2;
723 entity[0] = *p;
724 entity[1] = *(p + 1);
725 entity[2] = '\0';
726 } else if (c < 0xf0) {
727 UTF8_ERROR_IF(avail < 3);
728 for (int i = 1; i < 3; ++i) {
729 UTF8_ERROR_IF(!utf8_trail(*(p + i)));
732 uint32_t tc = ((c & 0x0f) << 12) |
733 ((*(p+1) & 0x3f) << 6) |
734 (*(p+2) & 0x3f);
735 UTF8_ERROR_IF(tc < 0x800); // non-shortest form
736 UTF8_ERROR_IF(tc >= 0xd800 && tc <= 0xdfff); // surrogate
738 codeLength = 3;
739 entity[0] = *p;
740 entity[1] = *(p + 1);
741 entity[2] = *(p + 2);
742 entity[3] = '\0';
743 } else if (c < 0xf5) {
744 UTF8_ERROR_IF(avail < 4);
745 for (int i = 1; i < 4; ++i) {
746 UTF8_ERROR_IF(!utf8_trail(*(p + i)));
749 uint32_t tc = ((c & 0x07) << 18) |
750 ((*(p+1) & 0x3f) << 12) |
751 ((*(p+2) & 0x3f) << 6) |
752 (*(p+3) & 0x3f);
754 // non-shortest form or outside range
755 UTF8_ERROR_IF(tc < 0x10000 || tc > 0x10ffff);
757 codeLength = 4;
758 entity[0] = *p;
759 entity[1] = *(p + 1);
760 entity[2] = *(p + 2);
761 entity[3] = *(p + 3);
762 entity[4] = '\0';
763 } else {
764 UTF8_ERROR_IF(true);
766 } else {
767 codeLength = 1;
768 entity[0] = *p;
769 entity[1] = '\0';
772 if (htmlEnt) {
773 html_get_entity_map();
775 char buf[16] = {0};
776 buf[0] = c;
777 int len = 1;
779 if (encode_entity(buf, &len, const_cast<char*>(entity), utf8)) {
780 *q++ = '&';
781 const char *s = buf;
782 for (int n = 0; n < len; n++) {
783 *q++ = *s++;
785 *q++ = ';';
786 } else {
787 memcpy(q, p, codeLength);
788 q += codeLength;
790 } else {
791 memcpy(q, p, codeLength);
792 q += codeLength;
794 p += codeLength - 1;
796 break;
802 #undef UTF8_ERROR_IF
804 if (q - ret > INT_MAX) {
805 goto exit_error;
807 *q = 0;
808 len = q - ret;
809 return ret;
811 exit_error:
812 free(ret);
813 return nullptr;
816 char *string_html_encode_extra(const char *input, int &len,
817 StringHtmlEncoding flags,
818 const AsciiMap *asciiMap) {
819 assert(input);
821 * Though seems to be wasting memory a lot, we have to realize most of the
822 * time this function is called with small strings, or fragments of HTMLs.
823 * Allocating/deallocating anything less than 1K is trivial these days, and
824 * we want avoid string copying as much as possible. Of course, the return
825 * char * is really sent back at large, occupying unnessary space for
826 * potentially longer time than we need, we have to realize the two closest
827 * solutions are not that much better, either:
829 * 1. pre-calculate size by iterating through the string once: too time
830 * consuming;
831 * 2. take a guess and double buffer size when over: still wasting, and
832 * it may not save that much.
834 char *ret = (char *)malloc(len * 8uL + 1);
835 if (!ret) {
836 return nullptr;
838 char *q = ret;
839 const char *rep = "\ufffd";
840 int32_t srcPosBytes;
841 for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
842 unsigned char c = input[srcPosBytes];
843 if (c && c < 128) {
844 srcPosBytes++; // Optimize US-ASCII case
845 if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
846 switch (c) {
847 case '"':
848 *q++ = '&'; *q++ = 'q'; *q++ = 'u';
849 *q++ = 'o'; *q++ = 't'; *q++ = ';';
850 break;
851 case '\'':
852 *q++ = '&'; *q++ = '#'; *q++ = '0';
853 *q++ = '3'; *q++ = '9'; *q++ = ';';
854 break;
855 case '<':
856 *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
857 break;
858 case '>':
859 *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
860 break;
861 case '&':
862 *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
863 break;
864 default:
865 *q++ = '&'; *q++ = '#';
866 *q++ = c >= 100 ? '1' : '0';
867 *q++ = ((c / 10) % 10) + '0';
868 *q++ = (c % 10) + '0';
869 *q++ = ';';
870 break;
872 } else {
873 *q++ = c;
875 } else if (flags & STRING_HTML_ENCODE_UTF8) {
876 UChar32 curCodePoint;
877 U8_NEXT(input, srcPosBytes, len, curCodePoint);
878 if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
879 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
880 } else if (curCodePoint <= 0) {
881 if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
882 if (flags & STRING_HTML_ENCODE_HIGH) {
883 *q++ = '&'; *q++ = '#'; *q++ = 'x';
884 *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
885 *q++ = ';';
886 } else {
887 const char *r = rep;
888 while (*r) *q++ = *r++;
891 } else if (flags & STRING_HTML_ENCODE_HIGH) {
892 q += sprintf(q, "&#x%x;", curCodePoint);
893 } else {
894 int32_t pos = 0;
895 U8_APPEND_UNSAFE(q, pos, curCodePoint);
896 q += pos;
898 } else {
899 srcPosBytes++; // Optimize US-ASCII case
900 if (c == 0xa0) {
901 *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
902 } else if (flags & STRING_HTML_ENCODE_HIGH) {
903 *q++ = '&'; *q++ = '#';
904 *q++ = c >= 200 ? '2' : '1';
905 *q++ = ((c / 10) % 10) + '0';
906 *q++ = (c % 10) + '0';
907 *q++ = ';';
908 } else {
909 *q++ = c;
913 if (q - ret > INT_MAX) {
914 free(ret);
915 return nullptr;
917 *q = 0;
918 len = q - ret;
919 return ret;
922 char *string_html_decode(const char *input, int &len,
923 bool decode_double_quote, bool decode_single_quote,
924 const char *charset_hint, bool all,
925 bool xhp /* = false */) {
926 assert(input);
928 if (!EntityMapInited) {
929 Lock lock(EntityMapMutex);
930 if (!EntityMapInited) {
931 init_entity_table();
932 EntityMapInited = true;
936 entity_charset charset = determine_charset(charset_hint);
937 if (charset == cs_unknown) {
938 return nullptr;
941 char *ret = (char *)malloc(len + 1);
942 char *q = ret;
943 for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
944 char ch = *p;
945 if (ch != '&') {
946 *q++ = ch;
947 continue;
949 p++;
951 bool found = false;
952 for (const char *t = p; *t; t++) {
953 if (*t == ';') {
954 int l = t - p;
955 if (l > 0) {
956 char sbuf[16] = {0};
957 char *buf;
958 if (l > 10) {
959 buf = (char* )malloc(l + 1);
960 } else {
961 buf = sbuf;
963 memcpy(buf, p, l);
964 buf[l] = '\0';
965 if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
966 charset, all, xhp)) {
967 memcpy(q, buf, l);
968 found = true;
969 p = t;
970 q += l;
972 if (buf != sbuf) {
973 free(buf);
976 break;
979 if (!found) {
980 p--;
981 *q++ = '&'; // not an entity
984 *q = '\0';
985 len = q - ret;
986 return ret;
989 const html_entity_map* html_get_entity_map() {
990 if (!EntityMapInited) {
991 Lock lock(EntityMapMutex);
992 if (!EntityMapInited) {
993 init_entity_table();
994 EntityMapInited = true;
997 return entity_map;
1000 ///////////////////////////////////////////////////////////////////////////////