hphp/zend/zend-html.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com)     |
   6    | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   7    +----------------------------------------------------------------------+
   8    | This source file is subject to version 2.00 of the Zend license,     |
   9    | that is bundled with this package in the file LICENSE, and is        |
  10    | available through the world-wide-web at the following url:           |
  11    | http://www.zend.com/license/2_00.txt.                                |
  12    | If you did not receive a copy of the Zend license and are unable to  |
  13    | obtain it through the world-wide-web, please send a note to          |
  14    | license@zend.com so we can mail you a copy immediately.              |
  15    +----------------------------------------------------------------------+
  16 */
  17 #include "hphp/zend/zend-html.h"
  18
  19 #include <unicode/uchar.h>
  20 #include <unicode/utf8.h>
  21
  22 #include "hphp/util/lock.h"
  23
  24 namespace HPHP {
  25
  26 ///////////////////////////////////////////////////////////////////////////////
  27 // UTF-8 entity tables
  28
  29 using namespace entity_charset_enum;
  30
  31 /* codepage 1252 is a Windows extension to iso-8859-1. */
  32 static entity_table_t ent_cp_1252[] = {
  33   "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
  34   "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
  35   nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
  36   "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
  37   "oelig", nullptr, nullptr, "Yuml"
  38 };
  39
  40 static entity_table_t ent_iso_8859_1[] = {
  41   "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
  42   "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
  43   "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
  44   "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
  45   "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
  46   "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  47   "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  48   "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  49   "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  50   "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  51   "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  52   "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  53   "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  54   "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  55   "uuml", "yacute", "thorn", "yuml"
  56 };
  57
  58 static entity_table_t ent_iso_8859_15[] = {
  59   "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
  60   "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
  61   "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
  62   "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
  63   "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
  64   "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  65   "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  66   "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  67   "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  68   "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  69   "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  70   "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  71   "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  72   "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  73   "uuml", "yacute", "thorn", "yuml"
  74 };
  75
  76 static entity_table_t ent_uni_338_402[] = {
  77   /* 338 (0x0152) */
  78   "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
  79   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  80   /* 352 (0x0160) */
  81   "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  82   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  83   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  84   /* 376 (0x0178) */
  85   "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  86   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  87   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  88   /* 400 (0x0190) */
  89   nullptr, nullptr, "fnof"
  90 };
  91
  92 static entity_table_t ent_uni_spacing[] = {
  93   /* 710 */
  94   "circ",
  95   /* 711 - 730 */
  96   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  97   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  98   /* 731 - 732 */
  99   nullptr, "tilde"
 100 };
 101
 102 static entity_table_t ent_uni_greek[] = {
 103   /* 913 */
 104   "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
 105   "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
 106   nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
 107   /* 938 - 944 are not mapped */
 108   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 109   "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
 110   "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
 111   "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
 112   /* 970 - 976 are not mapped */
 113   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 114   "thetasym", "upsih",
 115   nullptr, nullptr, nullptr,
 116   "piv"
 117 };
 118
 119 static entity_table_t ent_uni_punct[] = {
 120   /* 8194 */
 121   "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
 122   "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
 123   nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
 124   /* 8216 */
 125   "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
 126   "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
 127   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
 128   /* 8242 */
 129   "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
 130   nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
 131   "frasl"
 132 };
 133
 134 static entity_table_t ent_uni_euro[] = {
 135   "euro"
 136 };
 137
 138 static entity_table_t ent_uni_8465_8501[] = {
 139   /* 8465 */
 140   "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 141   /* 8472 */
 142   "weierp", nullptr, nullptr, nullptr,
 143   /* 8476 */
 144   "real", nullptr, nullptr, nullptr, nullptr, nullptr,
 145   /* 8482 */
 146   "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 147   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 148   /* 8501 */
 149   "alefsym",
 150 };
 151
 152 static entity_table_t ent_uni_8592_9002[] = {
 153   /* 8592 (0x2190) */
 154   "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
 155   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 156   /* 8608 (0x21a0) */
 157   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 158   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 159   /* 8624 (0x21b0) */
 160   nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
 161   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 162   /* 8640 (0x21c0) */
 163   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 164   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 165   /* 8656 (0x21d0) */
 166   "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
 167   nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
 168   /* 8672 (0x21e0) */
 169   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 170   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 171   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 172   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 173   /* 8704 (0x2200) */
 174   "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
 175   "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
 176   /* 8720 (0x2210) */
 177   "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
 178   "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
 179   /* 8736 (0x2220) */
 180   "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
 181   "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
 182   /* 8752 (0x2230) */
 183   nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
 184   nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
 185   /* 8768 (0x2240) */
 186   "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
 187   "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
 188   /* 8784 (0x2250) */
 189   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 190   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 191   /* 8800 (0x2260) */
 192   "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
 193   "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
 194   /* 8816 (0x2270) */
 195   "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
 196   nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
 197   /* 8832 (0x2280) */
 198   "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
 199   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 200   /* 8848 (0x2290) */
 201   nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
 202   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 203   /* 8864 (0x22a0) */
 204   nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
 205   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 206   /* 8880 (0x22b0) */
 207   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 208   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 209   /* 8896 (0x22c0) */
 210   nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
 211   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 212   /* 8912 (0x22d0) */
 213   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 214   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 215   /* 8928 (0x22e0) */
 216   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 217   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 218   /* 8944 (0x22f0) */
 219   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 220   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 221   /* 8960 (0x2300) */
 222   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 223   "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
 224   /* 8976 (0x2310) */
 225   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 226   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 227   /* 8992 (0x2320) */
 228   nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 229   nullptr, "lang", "rang"
 230 };
 231
 232 static entity_table_t ent_uni_9674[] = {
 233   /* 9674 */
 234   "loz"
 235 };
 236
 237 static entity_table_t ent_uni_9824_9830[] = {
 238   /* 9824 */
 239   "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
 240 };
 241
 242 static const struct html_entity_map entity_map[] = {
 243   { cs_cp1252,    0x80, 0x9f, ent_cp_1252 },
 244   { cs_cp1252,    0xa0, 0xff, ent_iso_8859_1 },
 245   { cs_8859_1,    0xa0, 0xff, ent_iso_8859_1 },
 246   { cs_8859_15,   0xa0, 0xff, ent_iso_8859_15 },
 247   { cs_utf_8,     0xa0, 0xff, ent_iso_8859_1 },
 248   { cs_utf_8,     338,  402,  ent_uni_338_402 },
 249   { cs_utf_8,     710,  732,  ent_uni_spacing },
 250   { cs_utf_8,     913,  982,  ent_uni_greek },
 251   { cs_utf_8,     8194, 8260, ent_uni_punct },
 252   { cs_utf_8,     8364, 8364, ent_uni_euro },
 253   { cs_utf_8,     8465, 8501, ent_uni_8465_8501 },
 254   { cs_utf_8,     8592, 9002, ent_uni_8592_9002 },
 255   { cs_utf_8,     9674, 9674, ent_uni_9674 },
 256   { cs_utf_8,     9824, 9830, ent_uni_9824_9830 },
 257   { cs_big5,      0xa0, 0xff, ent_iso_8859_1 },
 258   { cs_gb2312,    0xa0, 0xff, ent_iso_8859_1 },
 259   { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
 260   { cs_sjis,      0xa0, 0xff, ent_iso_8859_1 },
 261   { cs_eucjp,     0xa0, 0xff, ent_iso_8859_1 },
 262   /* Missing support for these at the moment
 263   { cs_koi8r,     0xa3, 0xff, ent_koi8r },
 264   { cs_cp1251,    0x80, 0xff, ent_cp_1251 },
 265   { cs_8859_5,    0xc0, 0xff, ent_iso_8859_5 },
 266   { cs_cp866,     0xc0, 0xff, ent_cp_866 },
 267   { cs_macroman,  0x0b, 0xff, ent_macroman },
 268   */
 269   { cs_terminator }
 270 };
 271
 272 static const struct {
 273   const char *codeset;
 274   entity_charset charset;
 275 } charset_map[] = {
 276   { "ISO-8859-1",     cs_8859_1 },
 277   { "ISO8859-1",      cs_8859_1 },
 278   { "ISO-8859-15",    cs_8859_15 },
 279   { "ISO8859-15",     cs_8859_15 },
 280   { "utf-8",          cs_utf_8 },
 281   { "cp1252",         cs_cp1252 },
 282   { "Windows-1252",   cs_cp1252 },
 283   { "1252",           cs_cp1252 },
 284   { "BIG5",           cs_big5 },
 285   { "950",            cs_big5 },
 286   { "GB2312",         cs_gb2312 },
 287   { "936",            cs_gb2312 },
 288   { "BIG5-HKSCS",     cs_big5hkscs },
 289   { "Shift_JIS",      cs_sjis },
 290   { "SJIS",           cs_sjis },
 291   { "932",            cs_sjis },
 292   { "EUCJP",          cs_eucjp },
 293   /* Missing support for these at the moment
 294   { "EUC-JP",         cs_eucjp },
 295   { "KOI8-R",         cs_koi8r },
 296   { "koi8-ru",        cs_koi8r },
 297   { "koi8r",          cs_koi8r },
 298   { "cp1251",         cs_cp1251 },
 299   { "Windows-1251",   cs_cp1251 },
 300   { "win-1251",       cs_cp1251 },
 301   { "iso8859-5",      cs_8859_5 },
 302   { "iso-8859-5",     cs_8859_5 },
 303   { "cp866",          cs_cp866 },
 304   { "866",            cs_cp866 },
 305   { "ibm866",         cs_cp866 },
 306   { "MacRoman",       cs_macroman },
 307   */
 308   { nullptr }
 309 };
 310
 311 ///////////////////////////////////////////////////////////////////////////////
 312
 313 entity_charset determine_charset(const char *charset_hint) {
 314   entity_charset charset = cs_unknown;
 315
 316   if (charset_hint == nullptr) {
 317     // default to utf-8
 318     return cs_utf_8;
 319   }
 320
 321   size_t len = strlen(charset_hint);
 322
 323   /* now walk the charset map and look for the codeset */
 324   for (int i = 0; charset_map[i].codeset; i++) {
 325     if (len == strlen(charset_map[i].codeset) &&
 326       strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
 327       charset = charset_map[i].charset;
 328       break;
 329     }
 330   }
 331
 332   return charset;
 333 }
 334
 335 static int utf32_to_utf8(unsigned char *buf, int k) {
 336   int retval = 0;
 337
 338   if (k < 0x80) {
 339     buf[0] = k;
 340     retval = 1;
 341   } else if (k < 0x800) {
 342     buf[0] = 0xc0 | (k >> 6);
 343     buf[1] = 0x80 | (k & 0x3f);
 344     retval = 2;
 345   } else if (k < 0x10000) {
 346     buf[0] = 0xe0 | (k >> 12);
 347     buf[1] = 0x80 | ((k >> 6) & 0x3f);
 348     buf[2] = 0x80 | (k & 0x3f);
 349     retval = 3;
 350   } else if (k < 0x200000) {
 351     buf[0] = 0xf0 | (k >> 18);
 352     buf[1] = 0x80 | ((k >> 12) & 0x3f);
 353     buf[2] = 0x80 | ((k >> 6) & 0x3f);
 354     buf[3] = 0x80 | (k & 0x3f);
 355     retval = 4;
 356   } else if (k < 0x4000000) {
 357     buf[0] = 0xf8 | (k >> 24);
 358     buf[1] = 0x80 | ((k >> 18) & 0x3f);
 359     buf[2] = 0x80 | ((k >> 12) & 0x3f);
 360     buf[3] = 0x80 | ((k >> 6) & 0x3f);
 361     buf[4] = 0x80 | (k & 0x3f);
 362     retval = 5;
 363   } else {
 364     buf[0] = 0xfc | (k >> 30);
 365     buf[1] = 0x80 | ((k >> 24) & 0x3f);
 366     buf[2] = 0x80 | ((k >> 18) & 0x3f);
 367     buf[3] = 0x80 | ((k >> 12) & 0x3f);
 368     buf[4] = 0x80 | ((k >> 6) & 0x3f);
 369     buf[5] = 0x80 | (k & 0x3f);
 370     retval = 6;
 371   }
 372   buf[retval] = '\0';
 373
 374   return retval;
 375 }
 376
 377 using HtmlEntityMap = hphp_hash_map<const char*,std::string,cstr_hash,eqstr>;
 378
 379 static volatile bool EntityMapInited = false;
 380 static Mutex EntityMapMutex;
 381 static HtmlEntityMap EntityMap[cs_end];
 382 static HtmlEntityMap XHPEntityMap[cs_end];
 383
 384 static void init_entity_table() {
 385   for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
 386     const html_entity_map &em = entity_map[i];
 387     const entity_charset charset = entity_map[i].charset;
 388
 389     int index = 0;
 390     for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
 391       const char *entity = em.table[index];
 392       if (entity == nullptr) {
 393         continue;
 394       }
 395       unsigned char buf[10];
 396       switch (charset) {
 397         case cs_8859_1:
 398         case cs_cp1252:
 399         case cs_8859_15:
 400         case cs_cp1251:
 401         case cs_8859_5:
 402         case cs_cp866:
 403         case cs_koi8r:
 404           buf[0] = ch;
 405           buf[1] = '\0';
 406           break;
 407
 408         case cs_utf_8:
 409           utf32_to_utf8(buf, ch);
 410           break;
 411
 412         default:
 413           continue;
 414       }
 415       EntityMap[charset][entity] = (const char *)buf;
 416       XHPEntityMap[charset][entity] = (const char *)buf;
 417     }
 418
 419     EntityMap[charset]["quot"] = "\"";
 420     EntityMap[charset]["lt"] = "<";
 421     EntityMap[charset]["gt"] = ">";
 422     EntityMap[charset]["amp"] = "&";
 423
 424     XHPEntityMap[charset]["quot"] = "\"";
 425     XHPEntityMap[charset]["lt"] = "<";
 426     XHPEntityMap[charset]["gt"] = ">";
 427     XHPEntityMap[charset]["amp"] = "&";
 428     // XHP-specific entities
 429     XHPEntityMap[charset]["apos"] = "\'";
 430     XHPEntityMap[charset]["cloud"] = "\u2601";
 431     XHPEntityMap[charset]["umbrella"] = "\u2602";
 432     XHPEntityMap[charset]["snowman"] = "\u2603";
 433     XHPEntityMap[charset]["snowflake"] = "\u2745";
 434     XHPEntityMap[charset]["comet"] = "\u2604";
 435     XHPEntityMap[charset]["thunderstorm"] = "\u2608";
 436   }
 437
 438   // the first element is an empty table
 439   EntityMap[cs_terminator]["quot"] = "\"";
 440   EntityMap[cs_terminator]["lt"] = "<";
 441   EntityMap[cs_terminator]["gt"] = ">";
 442   EntityMap[cs_terminator]["amp"] = "&";
 443   // XHP-specific entities
 444   XHPEntityMap[cs_terminator]["apos"] = "\'";
 445   XHPEntityMap[cs_terminator]["cloud"] = "\u2601";
 446   XHPEntityMap[cs_terminator]["umbrella"] = "\u2602";
 447   XHPEntityMap[cs_terminator]["snowman"] = "\u2603";
 448   XHPEntityMap[cs_terminator]["snowflake"] = "\u2745";
 449   XHPEntityMap[cs_terminator]["comet"] = "\u2604";
 450   XHPEntityMap[cs_terminator]["thunderstorm"] = "\u2608";
 451 }
 452
 453 ///////////////////////////////////////////////////////////////////////////////
 454 inline static bool decode_entity(char *entity, int *len,
 455                                  bool decode_double_quote,
 456                                  bool decode_single_quote,
 457                                  entity_charset charset, bool all,
 458                                  bool xhp = false) {
 459   // entity is 16 bytes, allocated statically below
 460   // default in PHP
 461   assert(entity && *entity);
 462   if (entity[0] == '#') {
 463     int code;
 464     if (entity[1] == 'x' || entity[1] == 'X') {
 465       code = strtol(entity + 2, nullptr, 16);
 466     } else {
 467       code = strtol(entity + 1, nullptr, 10);
 468     }
 469
 470     // since we don't support multibyte chars other than utf-8
 471     int l = 1;
 472
 473     if (code == 39 && decode_single_quote) {
 474       entity[0] = code;
 475       entity[1] = '\0';
 476       *len = l;
 477       return true;
 478     }
 479
 480     switch (charset) {
 481       case cs_utf_8:
 482       {
 483         unsigned char buf[10];
 484         int size = utf32_to_utf8(buf, code);
 485         memcpy(entity, buf, size + 1);
 486         l = size;
 487         break;
 488       }
 489
 490       case cs_8859_1:
 491       case cs_8859_5:
 492       case cs_8859_15:
 493         if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
 494           return false;
 495         } else {
 496           if (code == 39) {
 497             return false;
 498           }
 499           entity[0] = code;
 500           entity[1] = '\0';
 501         }
 502         break;
 503
 504       case cs_cp1252:
 505       case cs_cp1251:
 506       case cs_cp866:
 507         if (code > 0xff) {
 508           return false;
 509         }
 510         entity[0] = code;
 511         entity[1] = '\0';
 512         break;
 513
 514       case cs_big5:
 515       case cs_big5hkscs:
 516       case cs_sjis:
 517       case cs_eucjp:
 518         if (code >= 0x80) {
 519           return false;
 520         }
 521         entity[0] = code;
 522         entity[1] = '\0';
 523         break;
 524
 525       case cs_gb2312:
 526         if (code >= 0x81) {
 527           return false;
 528         }
 529         entity[0] = code;
 530         entity[1] = '\0';
 531         break;
 532
 533       default:
 534         return false;
 535         break;
 536     }
 537     *len = l;
 538     return true;
 539   } else {
 540     HtmlEntityMap *entityMap;
 541
 542     if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
 543       return false;
 544     }
 545
 546     if (all) {
 547       entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
 548     } else {
 549       entityMap = xhp ? &XHPEntityMap[cs_terminator]
 550                       : &EntityMap[cs_terminator];
 551     }
 552     HtmlEntityMap::const_iterator iter = entityMap->find(entity);
 553     if (iter != entityMap->end()) {
 554       memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
 555       *len = iter->second.length();
 556       return true;
 557     }
 558   }
 559
 560   return false;
 561 }
 562
 563 inline static bool encode_entity(char* buf, int* buflen,
 564                                  const char* entity, bool utf8) {
 565   entity_charset charset = cs_utf_8;
 566   if (!utf8){ charset = cs_8859_1; }
 567
 568   HtmlEntityMap *entityMap = &EntityMap[charset];
 569
 570   for(HtmlEntityMap::const_iterator iter = entityMap->begin();
 571       iter != entityMap->end(); iter++) {
 572     if (strcmp(iter->second.c_str(), entity) == 0) {
 573       memcpy(buf, iter->first, strlen(iter->first));
 574       *buflen = strlen(iter->first);
 575       return true;
 576     }
 577   }
 578   return false;
 579 }
 580
 581 char *string_html_encode(const char *input, int &len,
 582                          const int64_t qsBitmask, bool utf8,
 583                          bool dEncode, bool htmlEnt) {
 584   assert(input);
 585   /**
 586    * Though seems to be wasting memory a lot, we have to realize most of the
 587    * time this function is called with small strings, or fragments of HTMLs.
 588    * Allocating/deallocating anything less than 1K is trivial these days, and
 589    * we want avoid string copying as much as possible. Of course, the return
 590    * char * is really sent back at large, occupying unnessary space for
 591    * potentially longer time than we need, we have to realize the two closest
 592    * solutions are not that much better, either:
 593    *
 594    * 1. pre-calculate size by iterating through the string once: too time
 595    *    consuming;
 596    * 2. take a guess and double buffer size when over: still wasting, and
 597    *    it may not save that much.
 598    *
 599    * Note: Amount of  allocation per character to be encoded may have to be
 600    * increased as larger HTML Entities are implemented.
 601    */
 602   char *ret = (char *)malloc(len * 14uL + 1);
 603   if (!ret) {
 604     return nullptr;
 605   }
 606   char *q = ret;
 607   for (const char *p = input, *end = input + len; p < end; p++) {
 608     unsigned char c = *p;
 609     char entity[5];
 610     int codeLength = 0;
 611     switch (c) {
 612     case '"':
 613       if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE)) {
 614         *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
 615       } else {
 616         *q++ = c;
 617       }
 618       break;
 619     case '\'':
 620       if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE)) {
 621         *q++ = '&'; *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9'; *q++ = ';';
 622       } else {
 623         *q++ = c;
 624       }
 625       break;
 626     case '<':
 627       *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
 628       break;
 629     case '>':
 630       *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
 631       break;
 632     case '&':
 633       if (!dEncode) {
 634         p++;
 635
 636         html_get_entity_map();
 637
 638         bool found = false;
 639         for (const char *t = p; *t; t++) {
 640           if (*t == ';') {
 641             int l = t - p;
 642             if (l > 0) {
 643               char sbuf[16] = {0};
 644               char *buf;
 645               if (l > 10) {
 646                 buf = (char* )malloc(l + 1);
 647               } else {
 648                 buf = sbuf;
 649               }
 650               memcpy(buf, p, l);
 651               buf[l] = '\0';
 652               if (decode_entity(buf, &l, true, true,
 653                 cs_utf_8, true)) {
 654                 found = true;
 655                 *q++ = '&';
 656                 for(const char *s = p; s <= t; s++) {
 657                   *q++ = *s;
 658                 }
 659                 p = t;
 660               }
 661               if (buf != sbuf) {
 662                 free(buf);
 663               }
 664             }
 665             break;
 666           }
 667         }
 668         if (!found) {
 669           p--;
 670           *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 671         }
 672       } else {
 673         *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 674       }
 675       break;
 676     case static_cast<unsigned char>('\xc2'):
 677       if (htmlEnt && utf8 && p != end && *(p+1) == '\xa0') {
 678         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 679         p++;
 680         break;
 681       }
 682
 683       // fallthrough
 684     default: {
 685       if (LIKELY(c < 0x80)) {
 686         *q++ = c;
 687         break;
 688       }
 689
 690       bool should_skip =
 691         qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
 692       bool should_replace =
 693         qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
 694
 695       if (!utf8 && should_skip) {
 696         break;
 697       }
 698
 699       auto avail = end - p;
 700       auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
 701
 702       // This has to be a macro since it needs to be able to break away from
 703       // the for loop we're in.
 704       // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
 705       // \uFFFD is Unicode Replacement Character (U+FFFD)
 706       #define UTF8_ERROR_IF(cond) \
 707         if (cond) { \
 708           if (should_skip) { break; } \
 709           else if (should_replace) { strcpy(q, "\uFFFD"); q += 3; break; } \
 710           else { goto exit_error; } \
 711         }
 712
 713       if (utf8) {
 714         if (c < 0xc2) {
 715           UTF8_ERROR_IF(true);
 716         } else if (c < 0xe0) {
 717           UTF8_ERROR_IF(avail < 2 || !utf8_trail(*(p + 1)));
 718
 719           uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
 720           UTF8_ERROR_IF(tc < 0x80); // non-shortest form
 721
 722           codeLength = 2;
 723           entity[0] = *p;
 724           entity[1] = *(p + 1);
 725           entity[2] = '\0';
 726         } else if (c < 0xf0) {
 727           UTF8_ERROR_IF(avail < 3);
 728           for (int i = 1; i < 3; ++i) {
 729             UTF8_ERROR_IF(!utf8_trail(*(p + i)));
 730           }
 731
 732           uint32_t tc = ((c & 0x0f) << 12) |
 733                         ((*(p+1) & 0x3f) << 6) |
 734                         (*(p+2) & 0x3f);
 735           UTF8_ERROR_IF(tc < 0x800); // non-shortest form
 736           UTF8_ERROR_IF(tc >= 0xd800 && tc <= 0xdfff); // surrogate
 737
 738           codeLength = 3;
 739           entity[0] = *p;
 740           entity[1] = *(p + 1);
 741           entity[2] = *(p + 2);
 742           entity[3] = '\0';
 743         } else if (c < 0xf5) {
 744           UTF8_ERROR_IF(avail < 4);
 745           for (int i = 1; i < 4; ++i) {
 746             UTF8_ERROR_IF(!utf8_trail(*(p + i)));
 747           }
 748
 749           uint32_t tc = ((c & 0x07) << 18) |
 750                         ((*(p+1) & 0x3f) << 12) |
 751                         ((*(p+2) & 0x3f) << 6) |
 752                         (*(p+3) & 0x3f);
 753
 754           // non-shortest form or outside range
 755           UTF8_ERROR_IF(tc < 0x10000 || tc > 0x10ffff);
 756
 757           codeLength = 4;
 758           entity[0] = *p;
 759           entity[1] = *(p + 1);
 760           entity[2] = *(p + 2);
 761           entity[3] = *(p + 3);
 762           entity[4] = '\0';
 763         } else {
 764           UTF8_ERROR_IF(true);
 765         }
 766       } else {
 767         codeLength = 1;
 768         entity[0] = *p;
 769         entity[1] = '\0';
 770       }
 771
 772       if (htmlEnt) {
 773         html_get_entity_map();
 774
 775         char buf[16] = {0};
 776         buf[0] = c;
 777         int len = 1;
 778
 779         if (encode_entity(buf, &len, const_cast<char*>(entity), utf8)) {
 780           *q++ = '&';
 781           const char *s = buf;
 782           for (int n = 0; n < len; n++) {
 783             *q++ = *s++;
 784           }
 785           *q++ = ';';
 786         } else {
 787           memcpy(q, p, codeLength);
 788           q += codeLength;
 789         }
 790       } else {
 791         memcpy(q, p, codeLength);
 792         q += codeLength;
 793       }
 794       p += codeLength - 1;
 795
 796       break;
 797     }
 798     }
 799
 800   }
 801
 802   #undef UTF8_ERROR_IF
 803
 804   if (q - ret > INT_MAX) {
 805     goto exit_error;
 806   }
 807   *q = 0;
 808   len = q - ret;
 809   return ret;
 810
 811 exit_error:
 812   free(ret);
 813   return nullptr;
 814 }
 815
 816 char *string_html_encode_extra(const char *input, int &len,
 817                                StringHtmlEncoding flags,
 818                                const AsciiMap *asciiMap) {
 819   assert(input);
 820   /**
 821    * Though seems to be wasting memory a lot, we have to realize most of the
 822    * time this function is called with small strings, or fragments of HTMLs.
 823    * Allocating/deallocating anything less than 1K is trivial these days, and
 824    * we want avoid string copying as much as possible. Of course, the return
 825    * char * is really sent back at large, occupying unnessary space for
 826    * potentially longer time than we need, we have to realize the two closest
 827    * solutions are not that much better, either:
 828    *
 829    * 1. pre-calculate size by iterating through the string once: too time
 830    *    consuming;
 831    * 2. take a guess and double buffer size when over: still wasting, and
 832    *    it may not save that much.
 833    */
 834   char *ret = (char *)malloc(len * 8uL + 1);
 835   if (!ret) {
 836     return nullptr;
 837   }
 838   char *q = ret;
 839   const char *rep = "\ufffd";
 840   int32_t srcPosBytes;
 841   for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
 842     unsigned char c = input[srcPosBytes];
 843     if (c && c < 128) {
 844       srcPosBytes++; // Optimize US-ASCII case
 845       if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
 846         switch (c) {
 847           case '"':
 848             *q++ = '&'; *q++ = 'q'; *q++ = 'u';
 849             *q++ = 'o'; *q++ = 't'; *q++ = ';';
 850             break;
 851           case '\'':
 852             *q++ = '&'; *q++ = '#'; *q++ = '0';
 853             *q++ = '3'; *q++ = '9'; *q++ = ';';
 854             break;
 855           case '<':
 856             *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
 857             break;
 858           case '>':
 859             *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
 860             break;
 861           case '&':
 862             *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 863             break;
 864           default:
 865             *q++ = '&'; *q++ = '#';
 866             *q++ = c >= 100 ? '1' : '0';
 867             *q++ = ((c / 10) % 10) + '0';
 868             *q++ = (c % 10) + '0';
 869             *q++ = ';';
 870             break;
 871         }
 872       } else {
 873         *q++ = c;
 874       }
 875     } else if (flags & STRING_HTML_ENCODE_UTF8) {
 876       UChar32 curCodePoint;
 877       U8_NEXT(input, srcPosBytes, len, curCodePoint);
 878       if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
 879         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 880       } else if (curCodePoint <= 0) {
 881         if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
 882           if (flags & STRING_HTML_ENCODE_HIGH) {
 883             *q++ = '&'; *q++ = '#'; *q++ = 'x';
 884             *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
 885             *q++ = ';';
 886           } else {
 887             const char *r = rep;
 888             while (*r) *q++ = *r++;
 889           }
 890         }
 891       } else if (flags & STRING_HTML_ENCODE_HIGH) {
 892         q += sprintf(q, "&#x%x;", curCodePoint);
 893       } else {
 894         int32_t pos = 0;
 895         U8_APPEND_UNSAFE(q, pos, curCodePoint);
 896         q += pos;
 897       }
 898     } else {
 899       srcPosBytes++; // Optimize US-ASCII case
 900       if (c == 0xa0) {
 901         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 902       } else if (flags & STRING_HTML_ENCODE_HIGH) {
 903         *q++ = '&'; *q++ = '#';
 904         *q++ = c >= 200 ? '2' : '1';
 905         *q++ = ((c / 10) % 10) + '0';
 906         *q++ = (c % 10) + '0';
 907         *q++ = ';';
 908       } else {
 909         *q++ = c;
 910       }
 911     }
 912   }
 913   if (q - ret > INT_MAX) {
 914     free(ret);
 915     return nullptr;
 916   }
 917   *q = 0;
 918   len = q - ret;
 919   return ret;
 920 }
 921
 922 char *string_html_decode(const char *input, int &len,
 923                          bool decode_double_quote, bool decode_single_quote,
 924                          const char *charset_hint, bool all,
 925                          bool xhp /* = false */) {
 926   assert(input);
 927
 928   if (!EntityMapInited) {
 929     Lock lock(EntityMapMutex);
 930     if (!EntityMapInited) {
 931       init_entity_table();
 932       EntityMapInited = true;
 933     }
 934   }
 935
 936   entity_charset charset = determine_charset(charset_hint);
 937   if (charset == cs_unknown) {
 938     return nullptr;
 939   }
 940
 941   char *ret = (char *)malloc(len + 1);
 942   char *q = ret;
 943   for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
 944     char ch = *p;
 945     if (ch != '&') {
 946       *q++ = ch;
 947       continue;
 948     }
 949     p++;
 950
 951     bool found = false;
 952     for (const char *t = p; *t; t++) {
 953       if (*t == ';') {
 954         int l = t - p;
 955         if (l > 0) {
 956           char sbuf[16] = {0};
 957           char *buf;
 958           if (l > 10) {
 959             buf = (char* )malloc(l + 1);
 960           } else {
 961             buf = sbuf;
 962           }
 963           memcpy(buf, p, l);
 964           buf[l] = '\0';
 965           if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
 966                             charset, all, xhp)) {
 967             memcpy(q, buf, l);
 968             found = true;
 969             p = t;
 970             q += l;
 971           }
 972           if (buf != sbuf) {
 973             free(buf);
 974           }
 975         }
 976         break;
 977       }
 978     }
 979     if (!found) {
 980       p--;
 981       *q++ = '&'; // not an entity
 982     }
 983   }
 984   *q = '\0';
 985   len = q - ret;
 986   return ret;
 987 }
 988
 989 const html_entity_map* html_get_entity_map() {
 990   if (!EntityMapInited) {
 991     Lock lock(EntityMapMutex);
 992     if (!EntityMapInited) {
 993       init_entity_table();
 994       EntityMapInited = true;
 995     }
 996   }
 997   return entity_map;
 998 }
 999
1000 ///////////////////////////////////////////////////////////////////////////////
1001 }