2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
17 #include "hphp/zend/zend-html.h"
19 #include <unicode/uchar.h>
20 #include <unicode/utf8.h>
22 #include "hphp/util/lock.h"
26 ///////////////////////////////////////////////////////////////////////////////
27 // UTF-8 entity tables
29 using namespace entity_charset_enum
;
31 /* codepage 1252 is a Windows extension to iso-8859-1. */
// 32 consecutive name slots; entity_map pairs this table with cs_cp1252 over
// 0x80..0x9f. A nullptr slot carries no entity name (init_entity_table tests
// each slot against nullptr).
32 static entity_table_t ent_cp_1252
[] = {
33 "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
34 "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
35 nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
36 "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
37 "oelig", nullptr, nullptr, "Yuml"
// Entity names for the range 0xa0..0xff, one slot per code point starting at
// "nbsp". entity_map reuses this table for cs_cp1252, cs_8859_1, cs_utf_8,
// cs_big5, cs_gb2312, cs_big5hkscs, cs_sjis and cs_eucjp.
40 static entity_table_t ent_iso_8859_1
[] = {
41 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
42 "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
43 "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
44 "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
45 "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
46 "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
47 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
48 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
49 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
50 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
51 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
52 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
53 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
54 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
55 "uuml", "yacute", "thorn", "yuml"
// Entity names that entity_map pairs with cs_8859_15 over 0xa0..0xff.
// Differs from ent_iso_8859_1 in a handful of slots (e.g. "euro", "Scaron",
// "OElig"); the two nullptr slots are annotated in place as Zcaron / zcaron.
58 static entity_table_t ent_iso_8859_15
[] = {
59 "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
60 "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
61 "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
62 "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
63 "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
64 "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
65 "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
66 "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
67 "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
68 "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
69 "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
70 "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
71 "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
72 "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
73 "uuml", "yacute", "thorn", "yuml"
// Sparse name table that entity_map pairs with cs_utf_8 over code points
// 338..402; the first slot ("OElig") corresponds to 338 and the last ("fnof")
// to 402. Unnamed code points in between are nullptr.
76 static entity_table_t ent_uni_338_402
[] = {
78 "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
79 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
81 "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
82 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
83 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
85 "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
86 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
87 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
89 nullptr, nullptr, "fnof"
// Name table that entity_map pairs with cs_utf_8 over code points 710..732.
// NOTE(review): only all-nullptr rows are visible here and the embedded line
// numbers jump (92 -> 96, 97 -> 102), so the rows holding the named slots are
// presumably missing from this copy -- confirm against the full source.
92 static entity_table_t ent_uni_spacing
[] = {
96 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
97 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
// Greek letter names that entity_map pairs with cs_utf_8 over code points
// 913..982 ("Alpha" is the slot for 913). Gaps in the range are nullptr; the
// in-table comments mark the unmapped stretches.
102 static entity_table_t ent_uni_greek
[] = {
104 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
105 "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
106 nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
107 /* 938 - 944 are not mapped */
108 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
109 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
110 "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
111 "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
112 /* 970 - 976 are not mapped */
113 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
115 nullptr, nullptr, nullptr,
// Punctuation and spacing names that entity_map pairs with cs_utf_8 over code
// points 8194..8260 ("ensp" is the slot for 8194). Unnamed slots are nullptr.
119 static entity_table_t ent_uni_punct
[] = {
121 "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
122 "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
123 nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
125 "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
126 "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
127 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
129 "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
130 nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
// Single-slot table: entity_map pairs it with cs_utf_8 for code point 8364
// only (basechar == endchar). The initializer contents are not visible in
// this copy.
134 static entity_table_t ent_uni_euro
[] = {
// Name table that entity_map pairs with cs_utf_8 over code points 8465..8501
// ("image" is the slot for 8465). Unnamed slots are nullptr.
138 static entity_table_t ent_uni_8465_8501
[] = {
140 "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
142 "weierp", nullptr, nullptr, nullptr,
144 "real", nullptr, nullptr, nullptr, nullptr, nullptr,
146 "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
147 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
// Arrow and mathematical-operator names that entity_map pairs with cs_utf_8
// over code points 8592..9002 ("larr" is the slot for 8592, "rang" the last).
// Rows are written eight slots at a time; unnamed slots are nullptr.
// NOTE(review): "asymp" appears twice in the same row below (first and sixth
// slot). init_entity_table stores EntityMap[charset][name] for every named
// slot in order, so the later slot would overwrite the earlier one when
// decoding by name -- confirm which code point is intended.
152 static entity_table_t ent_uni_8592_9002
[] = {
154 "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
155 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
157 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
158 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
160 nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
161 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
163 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
164 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
166 "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
167 nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
169 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
170 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
171 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
172 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
174 "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
175 "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
177 "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
178 "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
180 "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
181 "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
183 nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
184 nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
186 "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
187 "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
189 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
190 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
192 "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
193 "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
195 "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
196 nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
198 "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
199 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
201 nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
202 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
204 nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
205 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
207 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
208 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
210 nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
211 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
213 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
214 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
216 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
217 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
219 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
220 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
222 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
223 "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
225 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
226 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
228 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
229 nullptr, "lang", "rang"
// Single-slot table: entity_map pairs it with cs_utf_8 for code point 9674
// only (basechar == endchar). The initializer contents are not visible in
// this copy.
232 static entity_table_t ent_uni_9674
[] = {
// Card-suit names that entity_map pairs with cs_utf_8 over code points
// 9824..9830 (seven slots, "spades" first, "diams" last).
237 static entity_table_t ent_uni_9824_9830
[] = {
239 "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
// Master list tying each charset to the name tables above.
// Each row reads {charset, first code point, last code point, name table};
// init_entity_table consumes the rows through the fields charset, basechar,
// endchar and table, treating the range as inclusive -- presumably in that
// declaration order (html_entity_map is declared outside this file view).
// init_entity_table stops at a row whose charset is cs_terminator; that row
// is not visible in this copy.
242 static const struct html_entity_map entity_map
[] = {
243 { cs_cp1252
, 0x80, 0x9f, ent_cp_1252
},
244 { cs_cp1252
, 0xa0, 0xff, ent_iso_8859_1
},
245 { cs_8859_1
, 0xa0, 0xff, ent_iso_8859_1
},
246 { cs_8859_15
, 0xa0, 0xff, ent_iso_8859_15
},
247 { cs_utf_8
, 0xa0, 0xff, ent_iso_8859_1
},
248 { cs_utf_8
, 338, 402, ent_uni_338_402
},
249 { cs_utf_8
, 710, 732, ent_uni_spacing
},
250 { cs_utf_8
, 913, 982, ent_uni_greek
},
251 { cs_utf_8
, 8194, 8260, ent_uni_punct
},
252 { cs_utf_8
, 8364, 8364, ent_uni_euro
},
253 { cs_utf_8
, 8465, 8501, ent_uni_8465_8501
},
254 { cs_utf_8
, 8592, 9002, ent_uni_8592_9002
},
255 { cs_utf_8
, 9674, 9674, ent_uni_9674
},
256 { cs_utf_8
, 9824, 9830, ent_uni_9824_9830
},
257 { cs_big5
, 0xa0, 0xff, ent_iso_8859_1
},
258 { cs_gb2312
, 0xa0, 0xff, ent_iso_8859_1
},
259 { cs_big5hkscs
, 0xa0, 0xff, ent_iso_8859_1
},
260 { cs_sjis
, 0xa0, 0xff, ent_iso_8859_1
},
261 { cs_eucjp
, 0xa0, 0xff, ent_iso_8859_1
},
262 /* Missing support for these at the moment
263 { cs_koi8r, 0xa3, 0xff, ent_koi8r },
264 { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
265 { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
266 { cs_cp866, 0xc0, 0xff, ent_cp_866 },
267 { cs_macroman, 0x0b, 0xff, ent_macroman },
// Table of accepted charset names and the entity_charset each one selects.
// determine_charset reads it as charset_map[i].codeset / .charset and stops
// at the first entry whose codeset is null; names are compared ignoring case.
// Several names may map to the same charset (e.g. "cp1252", "Windows-1252"
// and "1252"). The codeset field declaration, the array name line and the
// terminating entry are not visible in this copy.
272 static const struct {
274 entity_charset charset
;
276 { "ISO-8859-1", cs_8859_1
},
277 { "ISO8859-1", cs_8859_1
},
278 { "ISO-8859-15", cs_8859_15
},
279 { "ISO8859-15", cs_8859_15
},
280 { "utf-8", cs_utf_8
},
281 { "cp1252", cs_cp1252
},
282 { "Windows-1252", cs_cp1252
},
283 { "1252", cs_cp1252
},
286 { "GB2312", cs_gb2312
},
287 { "936", cs_gb2312
},
288 { "BIG5-HKSCS", cs_big5hkscs
},
289 { "Shift_JIS", cs_sjis
},
292 { "EUCJP", cs_eucjp
},
293 /* Missing support for these at the moment
294 { "EUC-JP", cs_eucjp },
295 { "KOI8-R", cs_koi8r },
296 { "koi8-ru", cs_koi8r },
297 { "koi8r", cs_koi8r },
298 { "cp1251", cs_cp1251 },
299 { "Windows-1251", cs_cp1251 },
300 { "win-1251", cs_cp1251 },
301 { "iso8859-5", cs_8859_5 },
302 { "iso-8859-5", cs_8859_5 },
303 { "cp866", cs_cp866 },
305 { "ibm866", cs_cp866 },
306 { "MacRoman", cs_macroman },
311 ///////////////////////////////////////////////////////////////////////////////
// Resolves a charset name to an entity_charset value.
//
// The result starts out as cs_unknown and is replaced by the charset of a
// charset_map entry whose codeset has the same length as charset_hint and
// compares equal under strncasecmp, i.e. ignoring case.
// A null charset_hint is tested for before strlen() is called on it; the body
// of that branch and the final return are not visible in this copy.
313 entity_charset
determine_charset(const char *charset_hint
) {
314 entity_charset charset
= cs_unknown
;
316 if (charset_hint
== nullptr) {
321 size_t len
= strlen(charset_hint
);
323 /* now walk the charset map and look for the codeset */
// The walk ends at the first entry whose codeset pointer is null.
324 for (int i
= 0; charset_map
[i
].codeset
; i
++) {
// Requiring equal lengths first means strncasecmp over len bytes compares
// the two names in full rather than as a prefix.
325 if (len
== strlen(charset_map
[i
].codeset
) &&
326 strncasecmp(charset_hint
, charset_map
[i
].codeset
, len
) == 0) {
327 charset
= charset_map
[i
].charset
;
// Writes the multi-byte (UTF-8 style) encoding of the value k into buf.
//
// Visible branches: k < 0x800 writes 2 bytes, k < 0x10000 writes 3,
// k < 0x200000 writes 4, k < 0x4000000 writes 5, and the last visible group
// writes 6 bytes with lead byte 0xfc. Every continuation byte is
// 0x80 | (six bits of k).
// Callers in this file pass a 10-byte buffer and use the int result as a byte
// count; decode_entity copies (result + 1) bytes, so the sequence is
// presumably NUL-terminated -- the lines that set the result and any
// terminator are not visible in this copy; confirm against the full source.
335 static int utf32_to_utf8(unsigned char *buf
, int k
) {
341 } else if (k
< 0x800) {
// 110xxxxx 10xxxxxx
342 buf
[0] = 0xc0 | (k
>> 6);
343 buf
[1] = 0x80 | (k
& 0x3f);
345 } else if (k
< 0x10000) {
// 1110xxxx 10xxxxxx 10xxxxxx
346 buf
[0] = 0xe0 | (k
>> 12);
347 buf
[1] = 0x80 | ((k
>> 6) & 0x3f);
348 buf
[2] = 0x80 | (k
& 0x3f);
350 } else if (k
< 0x200000) {
// 11110xxx followed by three continuation bytes
351 buf
[0] = 0xf0 | (k
>> 18);
352 buf
[1] = 0x80 | ((k
>> 12) & 0x3f);
353 buf
[2] = 0x80 | ((k
>> 6) & 0x3f);
354 buf
[3] = 0x80 | (k
& 0x3f);
356 } else if (k
< 0x4000000) {
// 111110xx followed by four continuation bytes
357 buf
[0] = 0xf8 | (k
>> 24);
358 buf
[1] = 0x80 | ((k
>> 18) & 0x3f);
359 buf
[2] = 0x80 | ((k
>> 12) & 0x3f);
360 buf
[3] = 0x80 | ((k
>> 6) & 0x3f);
361 buf
[4] = 0x80 | (k
& 0x3f);
// 1111110x followed by five continuation bytes
364 buf
[0] = 0xfc | (k
>> 30);
365 buf
[1] = 0x80 | ((k
>> 24) & 0x3f);
366 buf
[2] = 0x80 | ((k
>> 18) & 0x3f);
367 buf
[3] = 0x80 | ((k
>> 12) & 0x3f);
368 buf
[4] = 0x80 | ((k
>> 6) & 0x3f);
369 buf
[5] = 0x80 | (k
& 0x3f);
// Lookup tables from entity name (C string key) to its replacement bytes
// (std::string), one map per charset index -- both arrays are sized cs_end.
// XHPEntityMap additionally receives the XHP-specific names in
// init_entity_table.
// EntityMapInited and EntityMapMutex guard the one-time fill: callers in this
// file test the flag, take the mutex, test the flag again and then set it.
// NOTE(review): the flag is a volatile bool that is read outside the lock;
// volatile does not order memory between threads in C++ -- consider
// std::atomic<bool> (or a call_once style guard).
377 using HtmlEntityMap
= hphp_hash_map
<const char*,std::string
,cstr_hash
,eqstr
>;
379 static volatile bool EntityMapInited
= false;
380 static Mutex EntityMapMutex
;
381 static HtmlEntityMap EntityMap
[cs_end
];
382 static HtmlEntityMap XHPEntityMap
[cs_end
];
// Fills EntityMap and XHPEntityMap from entity_map.
//
// For every entity_map row up to the cs_terminator row it walks the code
// points basechar..endchar (inclusive) in step with the row's name table.
// Each slot is tested against nullptr (presumably skipped when null); for
// named slots the contents of buf are stored under that name in both maps
// (the visible code fills buf via utf32_to_utf8(buf, ch)).
// For the row's charset it then adds quot/lt/gt/amp to both maps and the
// XHP-only names (apos, cloud, umbrella, snowman, snowflake, comet,
// thunderstorm) to XHPEntityMap. Rows sharing a charset repeat these
// assignments with identical values.
// Finally the cs_terminator slot of each map is populated (see below).
384 static void init_entity_table() {
385 for (unsigned int i
= 0; entity_map
[i
].charset
!= cs_terminator
; i
++) {
386 const html_entity_map
&em
= entity_map
[i
];
387 const entity_charset charset
= entity_map
[i
].charset
;
// ch is the code point, index the matching position in em.table.
390 for (int ch
= em
.basechar
; ch
<= em
.endchar
; ch
++, index
++) {
391 const char *entity
= em
.table
[index
];
392 if (entity
== nullptr) {
395 unsigned char buf
[10];
409 utf32_to_utf8(buf
, ch
);
// buf is converted to std::string on assignment, so it is read up to its
// first NUL byte.
415 EntityMap
[charset
][entity
] = (const char *)buf
;
416 XHPEntityMap
[charset
][entity
] = (const char *)buf
;
419 EntityMap
[charset
]["quot"] = "\"";
420 EntityMap
[charset
]["lt"] = "<";
421 EntityMap
[charset
]["gt"] = ">";
422 EntityMap
[charset
]["amp"] = "&";
424 XHPEntityMap
[charset
]["quot"] = "\"";
425 XHPEntityMap
[charset
]["lt"] = "<";
426 XHPEntityMap
[charset
]["gt"] = ">";
427 XHPEntityMap
[charset
]["amp"] = "&";
428 // XHP-specific entities
429 XHPEntityMap
[charset
]["apos"] = "\'";
430 XHPEntityMap
[charset
]["cloud"] = "\u2601";
431 XHPEntityMap
[charset
]["umbrella"] = "\u2602";
432 XHPEntityMap
[charset
]["snowman"] = "\u2603";
433 XHPEntityMap
[charset
]["snowflake"] = "\u2745";
434 XHPEntityMap
[charset
]["comet"] = "\u2604";
435 XHPEntityMap
[charset
]["thunderstorm"] = "\u2608";
438 // the first element is an empty table
439 EntityMap
[cs_terminator
]["quot"] = "\"";
440 EntityMap
[cs_terminator
]["lt"] = "<";
441 EntityMap
[cs_terminator
]["gt"] = ">";
442 EntityMap
[cs_terminator
]["amp"] = "&";
// NOTE(review): unlike the per-charset section above, XHPEntityMap at
// cs_terminator is given only the XHP-specific names here and not
// quot/lt/gt/amp, so a lookup through XHPEntityMap[cs_terminator] would not
// find those four -- confirm whether that is intended.
443 // XHP-specific entities
444 XHPEntityMap
[cs_terminator
]["apos"] = "\'";
445 XHPEntityMap
[cs_terminator
]["cloud"] = "\u2601";
446 XHPEntityMap
[cs_terminator
]["umbrella"] = "\u2602";
447 XHPEntityMap
[cs_terminator
]["snowman"] = "\u2603";
448 XHPEntityMap
[cs_terminator
]["snowflake"] = "\u2745";
449 XHPEntityMap
[cs_terminator
]["comet"] = "\u2604";
450 XHPEntityMap
[cs_terminator
]["thunderstorm"] = "\u2608";
453 ///////////////////////////////////////////////////////////////////////////////
// Decodes a single entity body in place.
//
// entity is a writable, non-empty buffer (asserted); on success its contents
// are overwritten with the decoded bytes. On the named path *len receives the
// decoded length.
// Numeric form (entity[0] == '#'): the value is parsed with strtol, base 16
// when followed by 'x'/'X' and base 10 otherwise, then passed through checks
// on 39 (single quote, gated by decode_single_quote) and on the ranges
// 0x80..0x9f / above 0xff; one visible path encodes it with utf32_to_utf8 and
// copies size + 1 bytes back into entity.
// Named form: "quot" is special-cased against decode_double_quote, then the
// name is looked up in XHPEntityMap or EntityMap (selected by xhp), either in
// the slot for charset or in the cs_terminator slot.
// The xhp parameter declaration, the branch conditions choosing between the
// two slots, and all return statements are not visible in this copy.
454 inline static bool decode_entity(char *entity
, int *len
,
455 bool decode_double_quote
,
456 bool decode_single_quote
,
457 entity_charset charset
, bool all
,
459 // entity is 16 bytes, allocated statically below
461 assert(entity
&& *entity
);
462 if (entity
[0] == '#') {
464 if (entity
[1] == 'x' || entity
[1] == 'X') {
// strtol is called without an end pointer, so trailing non-digit characters
// are not detected here.
465 code
= strtol(entity
+ 2, nullptr, 16);
467 code
= strtol(entity
+ 1, nullptr, 10);
470 // since we don't support multibyte chars other than utf-8
473 if (code
== 39 && decode_single_quote
) {
483 unsigned char buf
[10];
484 int size
= utf32_to_utf8(buf
, code
);
// size + 1 bytes are copied -- presumably the encoded bytes plus a trailing
// NUL written by utf32_to_utf8; confirm.
485 memcpy(entity
, buf
, size
+ 1);
493 if ((code
>= 0x80 && code
< 0xa0) || code
> 0xff) {
540 HtmlEntityMap
*entityMap
;
// The comparison is case-insensitive and limited to the first 4 characters.
542 if (strncasecmp(entity
, "quot", 4) == 0 && !decode_double_quote
) {
547 entityMap
= xhp
? &XHPEntityMap
[charset
] : &EntityMap
[charset
];
549 entityMap
= xhp
? &XHPEntityMap
[cs_terminator
]
550 : &EntityMap
[cs_terminator
];
552 HtmlEntityMap::const_iterator iter
= entityMap
->find(entity
);
553 if (iter
!= entityMap
->end()) {
// The replacement string and its terminating NUL overwrite the entity name.
554 memcpy(entity
, iter
->second
.c_str(), iter
->second
.length() + 1);
555 *len
= iter
->second
.length();
// Reverse lookup: finds the entity name whose replacement string equals
// entity and copies that name into buf.
//
// The search runs over EntityMap[cs_utf_8], or EntityMap[cs_8859_1] when utf8
// is false, visiting every element and comparing its value with strcmp.
// On a match strlen(name) bytes are copied into buf (the memcpy itself writes
// no terminating NUL) and *buflen is set to that length.
// The return statements are not visible in this copy.
563 inline static bool encode_entity(char* buf
, int* buflen
,
564 const char* entity
, bool utf8
) {
565 entity_charset charset
= cs_utf_8
;
566 if (!utf8
){ charset
= cs_8859_1
; }
568 HtmlEntityMap
*entityMap
= &EntityMap
[charset
];
// Linear scan: the map is keyed by name, so searching by value has to visit
// the elements one by one.
570 for(HtmlEntityMap::const_iterator iter
= entityMap
->begin();
571 iter
!= entityMap
->end(); iter
++) {
572 if (strcmp(iter
->second
.c_str(), entity
) == 0) {
573 memcpy(buf
, iter
->first
, strlen(iter
->first
));
574 *buflen
= strlen(iter
->first
);
// HTML-escapes input[0 .. len) into a freshly malloc'ed buffer of
// len * 14 + 1 bytes and returns it.
//
// Visible behaviour (many case labels and branch lines are absent from this
// copy, so the conditions below are partly inferred -- confirm):
//  - "&quot;" is written under the ENT_BM_DOUBLE bit of qsBitmask and
//    "&#039;" under the ENT_BM_SINGLE bit;
//  - "&lt;", "&gt;" and "&amp;" are written for their characters;
//  - one '&' path calls html_get_entity_map(), scans ahead, tries
//    decode_entity on a copy and then emits either the original bytes p..t or
//    "&amp;" (presumably the dEncode handling of already-encoded entities);
//  - byte 0xc2 followed by 0xa0 becomes "&nbsp;" when htmlEnt and utf8 are set;
//  - bytes >= 0x80 in utf8 mode are validated as 2-, 3- or 4-byte sequences
//    (trail bytes, shortest form, surrogates, <= 0x10ffff). On an error
//    ENT_BM_IGNORE drops the byte, ENT_BM_SUBSTITUTE writes U+FFFD, otherwise
//    control jumps to exit_error;
//  - valid multi-byte sequences are offered to encode_entity and otherwise
//    copied through with memcpy.
// A final check compares the output length against INT_MAX.
581 char *string_html_encode(const char *input
, int &len
,
582 const int64_t qsBitmask
, bool utf8
,
583 bool dEncode
, bool htmlEnt
) {
586 * Though seems to be wasting memory a lot, we have to realize most of the
587 * time this function is called with small strings, or fragments of HTMLs.
588 * Allocating/deallocating anything less than 1K is trivial these days, and
589 * we want avoid string copying as much as possible. Of course, the return
590 * char * is really sent back at large, occupying unnessary space for
591 * potentially longer time than we need, we have to realize the two closest
592 * solutions are not that much better, either:
594 * 1. pre-calculate size by iterating through the string once: too time
596 * 2. take a guess and double buffer size when over: still wasting, and
597 * it may not save that much.
599 * Note: Amount of allocation per character to be encoded may have to be
600 * increased as larger HTML Entities are implemented.
602 char *ret
= (char *)malloc(len
* 14uL + 1);
607 for (const char *p
= input
, *end
= input
+ len
; p
< end
; p
++) {
608 unsigned char c
= *p
;
613 if (qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE
)) {
614 *q
++ = '&'; *q
++ = 'q'; *q
++ = 'u'; *q
++ = 'o'; *q
++ = 't'; *q
++ = ';';
620 if (qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE
)) {
621 *q
++ = '&'; *q
++ = '#'; *q
++ = '0'; *q
++ = '3'; *q
++ = '9'; *q
++ = ';';
627 *q
++ = '&'; *q
++ = 'l'; *q
++ = 't'; *q
++ = ';';
630 *q
++ = '&'; *q
++ = 'g'; *q
++ = 't'; *q
++ = ';';
636 html_get_entity_map();
// This scan stops at a NUL byte rather than at end.
639 for (const char *t
= p
; *t
; t
++) {
646 buf
= (char* )malloc(l
+ 1);
652 if (decode_entity(buf
, &l
, true, true,
656 for(const char *s
= p
; s
<= t
; s
++) {
670 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
673 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
// NOTE(review): inside this loop p < end always holds, so `p != end` never
// prevents *(p + 1) from reading input[len] when 0xc2 is the last byte;
// `p + 1 != end` may have been intended -- confirm.
676 case static_cast<unsigned char>('\xc2'):
677 if (htmlEnt
&& utf8
&& p
!= end
&& *(p
+1) == '\xa0') {
678 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
685 if (LIKELY(c
< 0x80)) {
691 qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE
);
692 bool should_replace
=
693 qsBitmask
& static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE
);
695 if (!utf8
&& should_skip
) {
// avail counts the bytes remaining from p, including the lead byte.
699 auto avail
= end
- p
;
700 auto utf8_trail
= [](unsigned char c
) { return c
>= 0x80 && c
<= 0xbf; };
702 // This has to be a macro since it needs to be able to break away from
703 // the for loop we're in.
704 // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
705 // \uFFFD is Unicode Replacement Character (U+FFFD)
706 #define UTF8_ERROR_IF(cond) \
708 if (should_skip) { break; } \
709 else if (should_replace) { strcpy(q, "\uFFFD"); q += 3; break; } \
710 else { goto exit_error; } \
716 } else if (c
< 0xe0) {
717 UTF8_ERROR_IF(avail
< 2 || !utf8_trail(*(p
+ 1)));
719 uint16_t tc
= ((c
& 0x1f) << 6) | (p
[1] & 0x3f);
720 UTF8_ERROR_IF(tc
< 0x80); // non-shortest form
724 entity
[1] = *(p
+ 1);
726 } else if (c
< 0xf0) {
727 UTF8_ERROR_IF(avail
< 3);
728 for (int i
= 1; i
< 3; ++i
) {
729 UTF8_ERROR_IF(!utf8_trail(*(p
+ i
)));
732 uint32_t tc
= ((c
& 0x0f) << 12) |
733 ((*(p
+1) & 0x3f) << 6) |
735 UTF8_ERROR_IF(tc
< 0x800); // non-shortest form
736 UTF8_ERROR_IF(tc
>= 0xd800 && tc
<= 0xdfff); // surrogate
740 entity
[1] = *(p
+ 1);
741 entity
[2] = *(p
+ 2);
743 } else if (c
< 0xf5) {
744 UTF8_ERROR_IF(avail
< 4);
745 for (int i
= 1; i
< 4; ++i
) {
746 UTF8_ERROR_IF(!utf8_trail(*(p
+ i
)));
749 uint32_t tc
= ((c
& 0x07) << 18) |
750 ((*(p
+1) & 0x3f) << 12) |
751 ((*(p
+2) & 0x3f) << 6) |
754 // non-shortest form or outside range
755 UTF8_ERROR_IF(tc
< 0x10000 || tc
> 0x10ffff);
759 entity
[1] = *(p
+ 1);
760 entity
[2] = *(p
+ 2);
761 entity
[3] = *(p
+ 3);
773 html_get_entity_map();
// entity holds the raw bytes of the validated sequence; encode_entity looks
// for an entity name with exactly that replacement string.
779 if (encode_entity(buf
, &len
, const_cast<char*>(entity
), utf8
)) {
782 for (int n
= 0; n
< len
; n
++) {
787 memcpy(q
, p
, codeLength
);
791 memcpy(q
, p
, codeLength
);
804 if (q
- ret
> INT_MAX
) {
// HTML-escapes input[0 .. len) into a freshly malloc'ed buffer of
// len * 8 + 1 bytes, driven by flags and an ASCII bitmap.
//
// Visible behaviour (case labels and several branch lines are absent from
// this copy -- confirm details against the full source):
//  - a byte selected by asciiMap is written as &quot;, &#039;, &lt;, &gt;,
//    &amp; or as a three-digit decimal reference "&#NNN";
//  - with STRING_HTML_ENCODE_UTF8 the next code point is read with U8_NEXT;
//    invalid input (curCodePoint <= 0) is replaced under
//    STRING_HTML_ENCODE_UTF8IZE_REPLACE, by "&#xfffd" when
//    STRING_HTML_ENCODE_HIGH is also set and otherwise by the bytes of rep
//    (U+FFFD). Valid code points are written as "&#x<hex>;" under
//    STRING_HTML_ENCODE_HIGH or re-encoded with U8_APPEND_UNSAFE;
//  - without the UTF8 flag a high byte is written as "&nbsp;" or, under
//    STRING_HTML_ENCODE_HIGH, as a three-digit decimal reference.
// A final check compares the output length against INT_MAX.
816 char *string_html_encode_extra(const char *input
, int &len
,
817 StringHtmlEncoding flags
,
818 const AsciiMap
*asciiMap
) {
821 * Though seems to be wasting memory a lot, we have to realize most of the
822 * time this function is called with small strings, or fragments of HTMLs.
823 * Allocating/deallocating anything less than 1K is trivial these days, and
824 * we want avoid string copying as much as possible. Of course, the return
825 * char * is really sent back at large, occupying unnessary space for
826 * potentially longer time than we need, we have to realize the two closest
827 * solutions are not that much better, either:
829 * 1. pre-calculate size by iterating through the string once: too time
831 * 2. take a guess and double buffer size when over: still wasting, and
832 * it may not save that much.
834 char *ret
= (char *)malloc(len
* 8uL + 1);
839 const char *rep
= "\ufffd";
841 for (srcPosBytes
= 0; srcPosBytes
< len
; /* incremented in-loop */) {
842 unsigned char c
= input
[srcPosBytes
];
844 srcPosBytes
++; // Optimize US-ASCII case
// asciiMap->map is indexed as two words: bit 6 of c selects the word and the
// low six bits select the bit within it.
845 if ((asciiMap
->map
[c
& 64 ? 1 : 0] >> (c
& 63)) & 1) {
848 *q
++ = '&'; *q
++ = 'q'; *q
++ = 'u';
849 *q
++ = 'o'; *q
++ = 't'; *q
++ = ';';
852 *q
++ = '&'; *q
++ = '#'; *q
++ = '0';
853 *q
++ = '3'; *q
++ = '9'; *q
++ = ';';
856 *q
++ = '&'; *q
++ = 'l'; *q
++ = 't'; *q
++ = ';';
859 *q
++ = '&'; *q
++ = 'g'; *q
++ = 't'; *q
++ = ';';
862 *q
++ = '&'; *q
++ = 'a'; *q
++ = 'm'; *q
++ = 'p'; *q
++ = ';';
// Three decimal digits: hundreds ('0' or '1'), tens, ones.
865 *q
++ = '&'; *q
++ = '#';
866 *q
++ = c
>= 100 ? '1' : '0';
867 *q
++ = ((c
/ 10) % 10) + '0';
868 *q
++ = (c
% 10) + '0';
875 } else if (flags
& STRING_HTML_ENCODE_UTF8
) {
876 UChar32 curCodePoint
;
877 U8_NEXT(input
, srcPosBytes
, len
, curCodePoint
);
// NOTE(review): U8_NEXT yields a decoded code point, for which NO-BREAK SPACE
// is 0xA0; 0xC2A0 looks like its UTF-8 byte pair (C2 A0) written as one
// number, which would instead match code point U+C2A0 -- confirm.
878 if ((flags
& STRING_HTML_ENCODE_NBSP
) && curCodePoint
== 0xC2A0) {
879 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
880 } else if (curCodePoint
<= 0) {
881 if (flags
& STRING_HTML_ENCODE_UTF8IZE_REPLACE
) {
882 if (flags
& STRING_HTML_ENCODE_HIGH
) {
883 *q
++ = '&'; *q
++ = '#'; *q
++ = 'x';
884 *q
++ = 'f'; *q
++ = 'f'; *q
++ = 'f'; *q
++ = 'd';
888 while (*r
) *q
++ = *r
++;
891 } else if (flags
& STRING_HTML_ENCODE_HIGH
) {
892 q
+= sprintf(q
, "&#x%x;", curCodePoint
);
895 U8_APPEND_UNSAFE(q
, pos
, curCodePoint
);
899 srcPosBytes
++; // Optimize US-ASCII case
901 *q
++ = '&'; *q
++ = 'n'; *q
++ = 'b'; *q
++ = 's'; *q
++ = 'p'; *q
++ = ';';
902 } else if (flags
& STRING_HTML_ENCODE_HIGH
) {
// Three decimal digits: hundreds ('1' or '2'), tens, ones.
903 *q
++ = '&'; *q
++ = '#';
904 *q
++ = c
>= 200 ? '2' : '1';
905 *q
++ = ((c
/ 10) % 10) + '0';
906 *q
++ = (c
% 10) + '0';
913 if (q
- ret
> INT_MAX
) {
// Decodes HTML entities found in input into a freshly malloc'ed buffer of
// len + 1 bytes.
//
// Steps visible here:
//  1. make sure the entity maps are initialised -- EntityMapInited is tested,
//     EntityMapMutex is taken, the flag is tested again and then set to true
//     (the initialising call itself is not visible in this copy);
//  2. resolve charset_hint with determine_charset; cs_unknown is tested for
//     (its handling is not visible);
//  3. scan the input; for a candidate entity a copy is made in a malloc'ed
//     buffer and passed to decode_entity together with the quote flags,
//     charset, all and xhp;
//  4. an '&' that does not start an entity is copied through unchanged.
// xhp defaults to false according to the inline comment on the parameter.
922 char *string_html_decode(const char *input
, int &len
,
923 bool decode_double_quote
, bool decode_single_quote
,
924 const char *charset_hint
, bool all
,
925 bool xhp
/* = false */) {
928 if (!EntityMapInited
) {
929 Lock
lock(EntityMapMutex
);
930 if (!EntityMapInited
) {
932 EntityMapInited
= true;
936 entity_charset charset
= determine_charset(charset_hint
);
937 if (charset
== cs_unknown
) {
941 char *ret
= (char *)malloc(len
+ 1);
// The scan continues past embedded NUL bytes for as long as fewer than len
// bytes have been consumed.
943 for (const char *p
= input
; *p
|| UNLIKELY(p
- input
< len
); p
++) {
// The look-ahead for the end of the entity stops at the first NUL byte.
952 for (const char *t
= p
; *t
; t
++) {
959 buf
= (char* )malloc(l
+ 1);
965 if (decode_entity(buf
, &l
, decode_double_quote
, decode_single_quote
,
966 charset
, all
, xhp
)) {
981 *q
++ = '&'; // not an entity
// Ensures the entity maps are initialised, using the same sequence as
// string_html_decode: test EntityMapInited, take EntityMapMutex, test the flag
// again, then set it to true (the initialising call itself is not visible in
// this copy).
// Declared to return a const html_entity_map*; the return statement is not
// visible here. Callers in this file invoke it only for its initialising
// side effect and ignore the result.
989 const html_entity_map
* html_get_entity_map() {
990 if (!EntityMapInited
) {
991 Lock
lock(EntityMapMutex
);
992 if (!EntityMapInited
) {
994 EntityMapInited
= true;
1000 ///////////////////////////////////////////////////////////////////////////////