hphp/zend/zend-html.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   7    +----------------------------------------------------------------------+
   8    | This source file is subject to version 2.00 of the Zend license,     |
   9    | that is bundled with this package in the file LICENSE, and is        |
  10    | available through the world-wide-web at the following url:           |
  11    | http://www.zend.com/license/2_00.txt.                                |
  12    | If you did not receive a copy of the Zend license and are unable to  |
  13    | obtain it through the world-wide-web, please send a note to          |
  14    | license@zend.com so we can mail you a copy immediately.              |
  15    +----------------------------------------------------------------------+
  16 */
  17 #include "hphp/zend/zend-html.h"
  18
  19 #include <unicode/uchar.h>
  20 #include <unicode/utf8.h>
  21
  22 #include "hphp/util/lock.h"
  23 #include "hphp/util/functional.h"
  24 #include "hphp/util/hash-map.h"
  25
  26 namespace HPHP {
  27
  28 ///////////////////////////////////////////////////////////////////////////////
  29 // UTF-8 entity tables
  30
  31 using namespace entity_charset_enum;
  32
// Entity names for the cp866 code range 0x80-0xff (the range entity_map pairs
// with this table). Entry i names code 0x80 + i; nullptr means the code has no
// named entity and is skipped when the lookup maps are built.
static entity_table_t ent_cp_866[] = {
  "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  "blk14", "blk12", "blk34", "boxv", "boxvl", "boxvL", "boxVl", "boxDl",
  "boxdL", "boxVL", "boxV", "boxDL", "boxUL", "boxUl", "boxuL", "boxdl",
  "boxur", "boxhu", "boxhd", "boxvr", "boxh", "boxvh", "boxvR", "boxVr",
  "boxUR", "boxDR", "boxHU", "boxHD", "boxVR", "boxH", "boxVH", "boxHu",
  "boxhU", "boxHd", "boxhD", "boxUr", "boxuR", "boxdR", "boxDr", "boxVh",
  "boxvH", "boxul", "boxdr", "block", "lhblk", nullptr, nullptr, "uhblk",
  "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
  "IOcy", "iocy", "Jukcy", "jukcy", "YIcy", "yicy", "Ubrcy", "ubrcy",
  "deg", nullptr, "middot", "Sqrt", "numero", "curren", nullptr, "nbsp"
};
  51
// Entity names for the cp1251 code range 0x80-0xff (the range entity_map pairs
// with this table). Entry i names code 0x80 + i; nullptr means no named entity.
static entity_table_t ent_cp_1251[] = {
  "DJcy", "GJcy", "sbquo", "gjcy", "bdquo", "hellip", "dagger", "Dagger",
  "euro", "permil", "LJcy", "lsaquo", "NJcy", "KJcy", "TSHcy", "DZcy",
  "djcy", "lsquo", "rsquo", "ldquo", "rdquo", "bull", "ndash", "mdash",
  nullptr, "trade", "ljcy", "rsaquo", "njcy", "kjcy", "tshcy", "dzcy",
  "nbsp", "Ubrcy", "ubrcy", "Jsercy", "curren", nullptr, "brvbar", "sect",
  "IOcy", "copy", "Jukcy", "laquo", "not", "shy", "reg", "YIcy",
  "deg", "pm", "Iukcy", "iukcy", nullptr, "micro", "para", "middot",
  "iocy", "numero", "jukcy", "raquo", "jsercy", "DScy", "dscy", "yicy",
  "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy"
};
  70
/* codepage 1252 is a Windows extension to iso-8859-1. */
// This table covers only the extension range 0x80-0x9f (32 entries); entity_map
// pairs cp1252 codes 0xa0-0xff with ent_iso_8859_1. Entry i names code
// 0x80 + i; nullptr means no named entity.
static entity_table_t ent_cp_1252[] = {
  "euro", nullptr, "sbquo", "fnof", "bdquo", "hellip", "dagger",
  "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
  nullptr, nullptr, nullptr, nullptr, "lsquo", "rsquo", "ldquo", "rdquo",
  "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
  "oelig", nullptr, nullptr, "Yuml"
};
  79
// Entity names for codes 0xa0-0xff. Entry i names code 0xa0 + i. entity_map
// reuses this table for several charsets (ISO-8859-1, cp1252, UTF-8 and the
// CJK charsets listed there).
static entity_table_t ent_iso_8859_1[] = {
  "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
  "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
  "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
  "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
  "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
  "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  "uuml", "yacute", "thorn", "yuml"
};
  97
// Entity names for the ISO-8859-5 code range 0xa0-0xff (the range entity_map
// pairs with this table). Entry i names code 0xa0 + i.
static entity_table_t ent_iso_8859_5[] = {
  "nbsp", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy",
  "Jsercy", "LJcy", "NJcy", "TSHcy", "KJcy", "shy", "Ubrcy", "DZcy",
  "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy",
  "Icy", "Jcy", "Kcy", "Lcy", "Mcy", "Ncy", "Ocy", "Pcy",
  "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy",
  "SHcy", "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy",
  "acy", "bcy", "vcy", "gcy", "dcy", "iecy", "zhcy", "zcy",
  "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy",
  "rcy", "scy", "tcy", "ucy", "fcy", "khcy", "tscy", "chcy",
  "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy",
  "numero", "iocy", "djcy", "gjcy", "jukcy", "dscy", "iukcy", "yicy",
  "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "sect", "ubrcy", "dzcy"
};
 112
// Entity names for the ISO-8859-15 code range 0xa0-0xff (the range entity_map
// pairs with this table). Entry i names code 0xa0 + i. The two nullptr slots
// are the positions annotated Zcaron / zcaron below, which have no entry here.
static entity_table_t ent_iso_8859_15[] = {
  "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
  "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
  "macr", "deg", "plusmn", "sup2", "sup3", nullptr, /* Zcaron */
  "micro", "para", "middot", nullptr, /* zcaron */ "sup1", "ordm",
  "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
  "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
  "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
  "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
  "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
  "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
  "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
  "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
  "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
  "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
  "uuml", "yacute", "thorn", "yuml"
};
 130
// Entity names for code points 338-402 (UTF-8 only, per entity_map). Entry i
// names code point 338 + i; the range is sparse, so most slots are nullptr.
static entity_table_t ent_uni_338_402[] = {
  /* 338 (0x0152) */
  "OElig", "oelig", nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 352 (0x0160) */
  "Scaron", "scaron", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 376 (0x0178) */
  "Yuml", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 400 (0x0190) */
  nullptr, nullptr, "fnof"
};
 146
// Entity names for code points 710-732 (UTF-8 only, per entity_map). Entry i
// names code point 710 + i; only the first and last slots are populated.
static entity_table_t ent_uni_spacing[] = {
  /* 710 */
  "circ",
  /* 711 - 730 */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 731 - 732 */
  nullptr, "tilde"
};
 156
// Entity names for code points 913-982 (UTF-8 only, per entity_map). Entry i
// names code point 913 + i; nullptr means no named entity.
static entity_table_t ent_uni_greek[] = {
  /* 913 */
  "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
  "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
  nullptr, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
  /* 938 - 944 are not mapped */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
  "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
  "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
  /* 970 - 976 are not mapped */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "thetasym", "upsih",
  nullptr, nullptr, nullptr,
  "piv"
};
 173
// Entity names for code points 8194-8260 (UTF-8 only, per entity_map). Entry i
// names code point 8194 + i; nullptr means no named entity.
static entity_table_t ent_uni_punct[] = {
  /* 8194 */
  "ensp", "emsp", nullptr, nullptr, nullptr, nullptr, nullptr,
  "thinsp", nullptr, nullptr, "zwnj", "zwj", "lrm", "rlm",
  nullptr, nullptr, nullptr, "ndash", "mdash", nullptr, nullptr, nullptr,
  /* 8216 */
  "lsquo", "rsquo", "sbquo", nullptr, "ldquo", "rdquo", "bdquo", nullptr,
  "dagger", "Dagger", "bull", nullptr, nullptr, nullptr, "hellip",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "permil", nullptr,
  /* 8242 */
  "prime", "Prime", nullptr, nullptr, nullptr, nullptr, nullptr, "lsaquo", "rsaquo", nullptr,
  nullptr, nullptr, "oline", nullptr, nullptr, nullptr, nullptr, nullptr,
  "frasl"
};
 188
// Single-entry table: entity_map pairs it with code point 8364 for UTF-8.
static entity_table_t ent_uni_euro[] = {
  "euro"
};
 192
// Entity names for code points 8465-8501 (UTF-8 only, per entity_map). Entry i
// names code point 8465 + i; nullptr means no named entity.
static entity_table_t ent_uni_8465_8501[] = {
  /* 8465 */
  "image", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8472 */
  "weierp", nullptr, nullptr, nullptr,
  /* 8476 */
  "real", nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8482 */
  "trade", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8501 */
  "alefsym",
};
 206
// Entity names for code points 8592-9002 (UTF-8 only, per entity_map). Entry i
// names code point 8592 + i. The block comments give the code point of the
// first slot on the following line; nullptr means no named entity.
static entity_table_t ent_uni_8592_9002[] = {
  /* 8592 (0x2190) */
  "larr", "uarr", "rarr", "darr", "harr", nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8608 (0x21a0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8624 (0x21b0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "crarr", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8640 (0x21c0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8656 (0x21d0) */
  "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", nullptr, nullptr,
  nullptr, nullptr, "lAarr", "rAarr", nullptr, "rarrw", nullptr, nullptr,
  /* 8672 (0x21e0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8704 (0x2200) */
  "forall", "comp", "part", "exist", "nexist", "empty", nullptr, "nabla",
  "isin", "notin", "epsis", "ni", "notni", "bepsi", nullptr, "prod",
  /* 8720 (0x2210) */
  "coprod", "sum", "minus", "mnplus", "plusdo", nullptr, "setmn", "lowast",
  "compfn", nullptr, "radic", nullptr, nullptr, "prop", "infin", "ang90",
  /* 8736 (0x2220) */
  "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
  "or", "cap", "cup", "int", nullptr, nullptr, "conint", nullptr,
  /* 8752 (0x2230) */
  nullptr, nullptr, nullptr, nullptr, "there4", "becaus", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, "sim", "bsim", nullptr, nullptr,
  /* 8768 (0x2240) */
  "wreath", "nsim", nullptr, "sime", "nsime", "cong", nullptr, "ncong",
  "asymp", "nap", "ape", nullptr, "bcong", "asymp", "bump", "bumpe",
  /* 8784 (0x2250) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8800 (0x2260) */
  "ne", "equiv", nullptr, nullptr, "le", "ge", "lE", "gE",
  "lnE", "gnE", "Lt", "Gt", "twixt", nullptr, "nlt", "ngt",
  /* 8816 (0x2270) */
  "nles", "nges", "lsim", "gsim", nullptr, nullptr, "lg", "gl",
  nullptr, nullptr, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
  /* 8832 (0x2280) */
  "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8848 (0x2290) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "oplus", nullptr, "otimes",
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8864 (0x22a0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "perp", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8880 (0x22b0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8896 (0x22c0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, "sdot", nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8912 (0x22d0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8928 (0x22e0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8944 (0x22f0) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8960 (0x2300) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  "lceil", "rceil", "lfloor", "rfloor", nullptr, nullptr, nullptr, nullptr,
  /* 8976 (0x2310) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  /* 8992 (0x2320) */
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  nullptr, "lang", "rang"
};
 286
// Single-entry table: entity_map pairs it with code point 9674 for UTF-8.
static entity_table_t ent_uni_9674[] = {
  /* 9674 */
  "loz"
};
 291
// Entity names for code points 9824-9830 (UTF-8 only, per entity_map). Entry i
// names code point 9824 + i; nullptr means no named entity.
static entity_table_t ent_uni_9824_9830[] = {
  /* 9824 */
  "spades", nullptr, nullptr, "clubs", nullptr, "hearts", "diams"
};
 296
// Master list tying each charset to its entity-name tables. Each row is
// { charset, first code, last code, table }, where table[i] names the code
// (first code + i) and the range is inclusive. A charset may appear in several
// rows (UTF-8 is assembled from many sparse ranges). The list ends with a
// cs_terminator row, which is what init_entity_table() stops on.
static const struct html_entity_map entity_map[] = {
  { cs_cp866,     0x80, 0xff, ent_cp_866 },
  { cs_cp1251,    0x80, 0xff, ent_cp_1251 },
  { cs_cp1252,    0x80, 0x9f, ent_cp_1252 },
  { cs_cp1252,    0xa0, 0xff, ent_iso_8859_1 },
  { cs_8859_1,    0xa0, 0xff, ent_iso_8859_1 },
  { cs_8859_5,    0xa0, 0xff, ent_iso_8859_5 },
  { cs_8859_15,   0xa0, 0xff, ent_iso_8859_15 },
  { cs_utf_8,     0xa0, 0xff, ent_iso_8859_1 },
  { cs_utf_8,     338,  402,  ent_uni_338_402 },
  { cs_utf_8,     710,  732,  ent_uni_spacing },
  { cs_utf_8,     913,  982,  ent_uni_greek },
  { cs_utf_8,     8194, 8260, ent_uni_punct },
  { cs_utf_8,     8364, 8364, ent_uni_euro },
  { cs_utf_8,     8465, 8501, ent_uni_8465_8501 },
  { cs_utf_8,     8592, 9002, ent_uni_8592_9002 },
  { cs_utf_8,     9674, 9674, ent_uni_9674 },
  { cs_utf_8,     9824, 9830, ent_uni_9824_9830 },
  { cs_big5,      0xa0, 0xff, ent_iso_8859_1 },
  { cs_gb2312,    0xa0, 0xff, ent_iso_8859_1 },
  { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
  { cs_sjis,      0xa0, 0xff, ent_iso_8859_1 },
  { cs_eucjp,     0xa0, 0xff, ent_iso_8859_1 },
  /* Missing support for these at the moment
  { cs_koi8r,     0xa3, 0xff, ent_koi8r },
  { cs_macroman,  0x0b, 0xff, ent_macroman },
  */
  { cs_terminator }
};
 326
// Charset names recognized by determine_charset(), which compares them to the
// caller's hint case-insensitively and requires equal length. Several aliases
// may map to the same entity_charset. The list ends with a nullptr codeset.
static const struct {
  const char *codeset;
  entity_charset charset;
} charset_map[] = {
  { "ISO-8859-1",     cs_8859_1 },
  { "ISO8859-1",      cs_8859_1 },
  { "ISO-8859-5",     cs_8859_5 },
  { "ISO8859-5",      cs_8859_5 },
  { "ISO-8859-15",    cs_8859_15 },
  { "ISO8859-15",     cs_8859_15 },
  { "utf-8",          cs_utf_8 },
  { "cp866",          cs_cp866 },
  { "866",            cs_cp866 },
  { "ibm866",         cs_cp866 },
  { "cp1251",         cs_cp1251 },
  { "Windows-1251",   cs_cp1251 },
  { "win-1251",       cs_cp1251 },
  { "cp1252",         cs_cp1252 },
  { "Windows-1252",   cs_cp1252 },
  { "1252",           cs_cp1252 },
  { "BIG5",           cs_big5 },
  { "950",            cs_big5 },
  { "GB2312",         cs_gb2312 },
  { "936",            cs_gb2312 },
  { "BIG5-HKSCS",     cs_big5hkscs },
  { "Shift_JIS",      cs_sjis },
  { "SJIS",           cs_sjis },
  { "932",            cs_sjis },
  { "EUCJP",          cs_eucjp },
  /* Missing support for these at the moment
  { "EUC-JP",         cs_eucjp },
  { "KOI8-R",         cs_koi8r },
  { "koi8-ru",        cs_koi8r },
  { "koi8r",          cs_koi8r },
  { "MacRoman",       cs_macroman },
  */
  { nullptr }
};
 365
 366 ///////////////////////////////////////////////////////////////////////////////
 367
 368 entity_charset determine_charset(const char *charset_hint) {
 369   entity_charset charset = cs_unknown;
 370
 371   if (charset_hint == nullptr) {
 372     // default to utf-8
 373     return cs_utf_8;
 374   }
 375
 376   size_t len = strlen(charset_hint);
 377
 378   /* now walk the charset map and look for the codeset */
 379   for (int i = 0; charset_map[i].codeset; i++) {
 380     if (len == strlen(charset_map[i].codeset) &&
 381       strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
 382       charset = charset_map[i].charset;
 383       break;
 384     }
 385   }
 386
 387   return charset;
 388 }
 389
 390 static int utf32_to_utf8(unsigned char *buf, int k) {
 391   int retval = 0;
 392
 393   if (k < 0x80) {
 394     buf[0] = k;
 395     retval = 1;
 396   } else if (k < 0x800) {
 397     buf[0] = 0xc0 | (k >> 6);
 398     buf[1] = 0x80 | (k & 0x3f);
 399     retval = 2;
 400   } else if (k < 0x10000) {
 401     buf[0] = 0xe0 | (k >> 12);
 402     buf[1] = 0x80 | ((k >> 6) & 0x3f);
 403     buf[2] = 0x80 | (k & 0x3f);
 404     retval = 3;
 405   } else if (k < 0x200000) {
 406     buf[0] = 0xf0 | (k >> 18);
 407     buf[1] = 0x80 | ((k >> 12) & 0x3f);
 408     buf[2] = 0x80 | ((k >> 6) & 0x3f);
 409     buf[3] = 0x80 | (k & 0x3f);
 410     retval = 4;
 411   } else if (k < 0x4000000) {
 412     buf[0] = 0xf8 | (k >> 24);
 413     buf[1] = 0x80 | ((k >> 18) & 0x3f);
 414     buf[2] = 0x80 | ((k >> 12) & 0x3f);
 415     buf[3] = 0x80 | ((k >> 6) & 0x3f);
 416     buf[4] = 0x80 | (k & 0x3f);
 417     retval = 5;
 418   } else {
 419     buf[0] = 0xfc | (k >> 30);
 420     buf[1] = 0x80 | ((k >> 24) & 0x3f);
 421     buf[2] = 0x80 | ((k >> 18) & 0x3f);
 422     buf[3] = 0x80 | ((k >> 12) & 0x3f);
 423     buf[4] = 0x80 | ((k >> 6) & 0x3f);
 424     buf[5] = 0x80 | (k & 0x3f);
 425     retval = 6;
 426   }
 427   buf[retval] = '\0';
 428
 429   return retval;
 430 }
 431
// Lookup table from an entity name (the text between '&' and ';') to the bytes
// that entity decodes to.
using HtmlEntityMap = hphp_const_char_map<std::string>;

// NOTE(review): these two presumably guard one-time construction of the maps
// below; the code that reads and sets them is not in this part of the file.
static volatile bool EntityMapInited = false;
static Mutex EntityMapMutex;
// One lookup table per entity_charset value, filled by init_entity_table().
static HtmlEntityMap EntityMap[cs_end];
// Parallel set of tables that init_entity_table() additionally fills with the
// XHP-specific names ("apos", "cloud", ...).
static HtmlEntityMap XHPEntityMap[cs_end];
 438
 439 static void init_entity_table() {
 440   for (unsigned int i = 0; entity_map[i].charset != cs_terminator; i++) {
 441     const html_entity_map &em = entity_map[i];
 442     const entity_charset charset = entity_map[i].charset;
 443
 444     int index = 0;
 445     for (int ch = em.basechar; ch <= em.endchar; ch++, index++) {
 446       const char *entity = em.table[index];
 447       if (entity == nullptr) {
 448         continue;
 449       }
 450       unsigned char buf[10];
 451       switch (charset) {
 452         case cs_8859_1:
 453         case cs_cp1252:
 454         case cs_8859_15:
 455         case cs_cp1251:
 456         case cs_8859_5:
 457         case cs_cp866:
 458         case cs_koi8r:
 459           buf[0] = ch;
 460           buf[1] = '\0';
 461           break;
 462
 463         case cs_utf_8:
 464           utf32_to_utf8(buf, ch);
 465           break;
 466
 467         default:
 468           continue;
 469       }
 470       EntityMap[charset][entity] = (const char *)buf;
 471       XHPEntityMap[charset][entity] = (const char *)buf;
 472     }
 473
 474     EntityMap[charset]["quot"] = "\"";
 475     EntityMap[charset]["lt"] = "<";
 476     EntityMap[charset]["gt"] = ">";
 477     EntityMap[charset]["amp"] = "&";
 478
 479     XHPEntityMap[charset]["quot"] = "\"";
 480     XHPEntityMap[charset]["lt"] = "<";
 481     XHPEntityMap[charset]["gt"] = ">";
 482     XHPEntityMap[charset]["amp"] = "&";
 483     // XHP-specific entities
 484     XHPEntityMap[charset]["apos"] = "\'";
 485     XHPEntityMap[charset]["cloud"] = u8"\u2601";
 486     XHPEntityMap[charset]["umbrella"] = u8"\u2602";
 487     XHPEntityMap[charset]["snowman"] = u8"\u2603";
 488     XHPEntityMap[charset]["snowflake"] = u8"\u2745";
 489     XHPEntityMap[charset]["comet"] = u8"\u2604";
 490     XHPEntityMap[charset]["thunderstorm"] = u8"\u2608";
 491   }
 492
 493   // the first element is an empty table
 494   EntityMap[cs_terminator]["quot"] = "\"";
 495   EntityMap[cs_terminator]["lt"] = "<";
 496   EntityMap[cs_terminator]["gt"] = ">";
 497   EntityMap[cs_terminator]["amp"] = "&";
 498   // XHP-specific entities
 499   XHPEntityMap[cs_terminator]["apos"] = "\'";
 500   XHPEntityMap[cs_terminator]["cloud"] = u8"\u2601";
 501   XHPEntityMap[cs_terminator]["umbrella"] = u8"\u2602";
 502   XHPEntityMap[cs_terminator]["snowman"] = u8"\u2603";
 503   XHPEntityMap[cs_terminator]["snowflake"] = u8"\u2745";
 504   XHPEntityMap[cs_terminator]["comet"] = u8"\u2604";
 505   XHPEntityMap[cs_terminator]["thunderstorm"] = u8"\u2608";
 506 }
 507
 508 ///////////////////////////////////////////////////////////////////////////////
 509 inline static bool decode_entity(char *entity, int *len,
 510                                  bool decode_double_quote,
 511                                  bool decode_single_quote,
 512                                  entity_charset charset, bool all,
 513                                  bool xhp = false) {
 514   // entity is 16 bytes, allocated statically below
 515   // default in PHP
 516   assert(entity && *entity);
 517   if (entity[0] == '#') {
 518     int code;
 519     if (entity[1] == 'x' || entity[1] == 'X') {
 520       if (!isxdigit(entity[2])) return false;
 521       code = strtol(entity + 2, nullptr, 16);
 522     } else {
 523       if (!isdigit(entity[1])) return false;
 524       code = strtol(entity + 1, nullptr, 10);
 525     }
 526
 527     // since we don't support multibyte chars other than utf-8
 528     int l = 1;
 529
 530     if (code == 39 && decode_single_quote) {
 531       entity[0] = code;
 532       entity[1] = '\0';
 533       *len = l;
 534       return true;
 535     }
 536
 537     if (!all          && (code != '&') &&
 538         (code != '<') && (code != '>') &&
 539         (code != '"') && (code != '\'')) {
 540       // htmlspecialchars_decode() does not parse numeric
 541       // entities other than & < > " '
 542       return false;
 543     }
 544
 545     switch (charset) {
 546       case cs_utf_8:
 547       {
 548         unsigned char buf[10];
 549         int size = utf32_to_utf8(buf, code);
 550         memcpy(entity, buf, size + 1);
 551         l = size;
 552         break;
 553       }
 554
 555       case cs_8859_1:
 556       case cs_8859_5:
 557       case cs_8859_15:
 558         if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
 559           return false;
 560         } else {
 561           if (code == 39) {
 562             return false;
 563           }
 564           entity[0] = code;
 565           entity[1] = '\0';
 566         }
 567         break;
 568
 569       case cs_cp1252:
 570       case cs_cp1251:
 571       case cs_cp866:
 572         if (code > 0xff) {
 573           return false;
 574         }
 575         entity[0] = code;
 576         entity[1] = '\0';
 577         break;
 578
 579       case cs_big5:
 580       case cs_big5hkscs:
 581       case cs_sjis:
 582       case cs_eucjp:
 583         if (code >= 0x80) {
 584           return false;
 585         }
 586         entity[0] = code;
 587         entity[1] = '\0';
 588         break;
 589
 590       case cs_gb2312:
 591         if (code >= 0x81) {
 592           return false;
 593         }
 594         entity[0] = code;
 595         entity[1] = '\0';
 596         break;
 597
 598       default:
 599         return false;
 600         break;
 601     }
 602     *len = l;
 603     return true;
 604   } else {
 605     HtmlEntityMap *entityMap;
 606
 607     if (strncasecmp(entity, "quot", 4) == 0 && !decode_double_quote) {
 608       return false;
 609     }
 610
 611     if (all) {
 612       entityMap = xhp ? &XHPEntityMap[charset] : &EntityMap[charset];
 613     } else {
 614       entityMap = xhp ? &XHPEntityMap[cs_terminator]
 615                       : &EntityMap[cs_terminator];
 616     }
 617     HtmlEntityMap::const_iterator iter = entityMap->find(entity);
 618     if (iter != entityMap->end()) {
 619       memcpy(entity, iter->second.c_str(), iter->second.length() + 1);
 620       *len = iter->second.length();
 621       return true;
 622     }
 623   }
 624
 625   return false;
 626 }
 627
 628 inline static bool encode_entity(char* buf, int* buflen,
 629                                  const char* entity, bool utf8) {
 630   entity_charset charset = cs_utf_8;
 631   if (!utf8){ charset = cs_8859_1; }
 632
 633   HtmlEntityMap *entityMap = &EntityMap[charset];
 634
 635   for(HtmlEntityMap::const_iterator iter = entityMap->begin();
 636       iter != entityMap->end(); iter++) {
 637     if (strcmp(iter->second.c_str(), entity) == 0) {
 638       memcpy(buf, iter->first, strlen(iter->first));
 639       *buflen = strlen(iter->first);
 640       return true;
 641     }
 642   }
 643   return false;
 644 }
 645
 646 char *string_html_encode(const char *input, int &len,
 647                          const int64_t qsBitmask, bool utf8,
 648                          bool dEncode, bool htmlEnt) {
 649   assert(input);
 650   /**
 651    * Though seems to be wasting memory a lot, we have to realize most of the
 652    * time this function is called with small strings, or fragments of HTMLs.
 653    * Allocating/deallocating anything less than 1K is trivial these days, and
 654    * we want avoid string copying as much as possible. Of course, the return
 655    * char * is really sent back at large, occupying unnessary space for
 656    * potentially longer time than we need, we have to realize the two closest
 657    * solutions are not that much better, either:
 658    *
 659    * 1. pre-calculate size by iterating through the string once: too time
 660    *    consuming;
 661    * 2. take a guess and double buffer size when over: still wasting, and
 662    *    it may not save that much.
 663    *
 664    * Note: Amount of  allocation per character to be encoded may have to be
 665    * increased as larger HTML Entities are implemented.
 666    */
 667   char *ret = (char *)malloc(len * 14uL + 1);
 668   if (!ret) {
 669     return nullptr;
 670   }
 671   char *q = ret;
 672   for (const char *p = input, *end = input + len; p < end; p++) {
 673     unsigned char c = *p;
 674     char entity[5];
 675     int codeLength = 0;
 676     switch (c) {
 677     case '"':
 678       if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_DOUBLE)) {
 679         *q++ = '&'; *q++ = 'q'; *q++ = 'u'; *q++ = 'o'; *q++ = 't'; *q++ = ';';
 680       } else {
 681         *q++ = c;
 682       }
 683       break;
 684     case '\'':
 685       if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SINGLE)) {
 686         *q++ = '&';
 687         if ((qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_XML1))) {
 688           *q++ = 'a'; *q++ = 'p'; *q++ = 'o'; *q++ = 's';
 689         } else {
 690           *q++ = '#'; *q++ = '0'; *q++ = '3'; *q++ = '9';
 691         }
 692         *q++ = ';';
 693       } else {
 694         *q++ = c;
 695       }
 696       break;
 697     case '<':
 698       *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
 699       break;
 700     case '>':
 701       *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
 702       break;
 703     case '&':
 704       if (!dEncode) {
 705         p++;
 706
 707         html_get_entity_map();
 708
 709         bool found = false;
 710         for (const char *t = p; *t; t++) {
 711           if (*t == ';') {
 712             int l = t - p;
 713             if (l > 0) {
 714               char sbuf[16] = {0};
 715               char *buf;
 716               if (l > 10) {
 717                 buf = (char* )malloc(l + 1);
 718               } else {
 719                 buf = sbuf;
 720               }
 721               memcpy(buf, p, l);
 722               buf[l] = '\0';
 723               if (decode_entity(buf, &l, true, true,
 724                 cs_utf_8, true)) {
 725                 found = true;
 726                 *q++ = '&';
 727                 for(const char *s = p; s <= t; s++) {
 728                   *q++ = *s;
 729                 }
 730                 p = t;
 731               }
 732               if (buf != sbuf) {
 733                 free(buf);
 734               }
 735             }
 736             break;
 737           }
 738         }
 739         if (!found) {
 740           p--;
 741           *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 742         }
 743       } else {
 744         *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 745       }
 746       break;
 747     case static_cast<unsigned char>('\xc2'):
 748       if (htmlEnt && utf8 && p != end && *(p+1) == '\xa0') {
 749         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 750         p++;
 751         break;
 752       }
 753
 754       // fallthrough
 755     default: {
 756       if (LIKELY(c < 0x80)) {
 757         *q++ = c;
 758         break;
 759       } else if (htmlEnt && !utf8 && (c - 160) < sizeof(ent_iso_8859_1) - 1) {
 760         /**
 761           * https://github.com/facebook/hhvm/issues/2186
 762           * If not UTF8, and we are converting to HTML entities, use known
 763           * entity equivalent of the character, if possible.
 764           * Since we only support ISO-8859-1 or UTF8 right now, and they use
 765           * the same mapping array, use it.
 766           * Start at 0xA0 = 160
 767           */
 768         *q++ = '&';
 769         const char *s = ent_iso_8859_1[c - 160];
 770         int len = strlen(s);
 771         for (int n = 0; n < len; n++) {
 772           *q++ = *s++;
 773         }
 774         *q++ = ';';
 775         break;
 776       }
 777
 778       bool should_skip =
 779         qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
 780       bool should_replace =
 781         qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
 782
 783       if (!utf8 && should_skip) {
 784         *q++ = c;
 785         break;
 786       }
 787
 788       auto avail = end - p;
 789       auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
 790       auto utf8_lead = [](unsigned char c) {
 791         return c < 0x80 || (c >= 0xC2 && c <= 0xF4);
 792       };
 793
 794       // This has to be a macro since it needs to be able to break away from
 795       // the for loop we're in.
 796       // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
 797       // \uFFFD is Unicode Replacement Character (U+FFFD)
 798       #define UTF8_ERROR_IF_LEN(cond, len) \
 799         if (cond) { \
 800           p += (len) - 1; \
 801           if (should_skip) { break; } \
 802           else if (should_replace) { strcpy(q, u8"\uFFFD"); q += 3; break; } \
 803           else { goto exit_error; } \
 804         }
 805
 806       #define UTF8_ERROR_IF(cond) UTF8_ERROR_IF_LEN(cond, 1)
 807
 808       if (utf8) {
 809         if (c < 0xc2) {
 810           UTF8_ERROR_IF(true);
 811         } else if (c < 0xe0) {
 812           UTF8_ERROR_IF(avail < 2);
 813           UTF8_ERROR_IF_LEN(!utf8_trail(*(p + 1)), utf8_lead(*(p + 1)) ? 1 : 2);
 814
 815           uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
 816           UTF8_ERROR_IF_LEN(tc < 0x80, 2); // non-shortest form
 817
 818           codeLength = 2;
 819           entity[0] = *p;
 820           entity[1] = *(p + 1);
 821           entity[2] = '\0';
 822         } else if (c < 0xf0) {
 823           if (avail < 3 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2))) {
 824             UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
 825             UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
 826             UTF8_ERROR_IF_LEN(true, 3);
 827           }
 828
 829           uint32_t tc = ((c & 0x0f) << 12) |
 830                         ((*(p+1) & 0x3f) << 6) |
 831                         (*(p+2) & 0x3f);
 832           UTF8_ERROR_IF_LEN(tc < 0x800, 3); // non-shortest form
 833           UTF8_ERROR_IF_LEN(tc >= 0xd800 && tc <= 0xdfff, 3); // surrogate
 834
 835           codeLength = 3;
 836           entity[0] = *p;
 837           entity[1] = *(p + 1);
 838           entity[2] = *(p + 2);
 839           entity[3] = '\0';
 840         } else if (c < 0xf5) {
 841           if (avail < 4 || !utf8_trail(*(p + 1)) || !utf8_trail(*(p + 2)) ||
 842               !utf8_trail(*(p + 3))) {
 843             UTF8_ERROR_IF_LEN(avail < 2 || utf8_lead(*(p + 1)), 1);
 844             UTF8_ERROR_IF_LEN(avail < 3 || utf8_lead(*(p + 2)), 2);
 845             UTF8_ERROR_IF_LEN(avail < 4 || utf8_lead(*(p + 3)), 3);
 846             UTF8_ERROR_IF_LEN(true, 4);
 847           }
 848
 849           uint32_t tc = ((c & 0x07) << 18) |
 850                         ((*(p+1) & 0x3f) << 12) |
 851                         ((*(p+2) & 0x3f) << 6) |
 852                         (*(p+3) & 0x3f);
 853
 854           // non-shortest form or outside range
 855           UTF8_ERROR_IF_LEN(tc < 0x10000 || tc > 0x10ffff, 4);
 856
 857           codeLength = 4;
 858           entity[0] = *p;
 859           entity[1] = *(p + 1);
 860           entity[2] = *(p + 2);
 861           entity[3] = *(p + 3);
 862           entity[4] = '\0';
 863         } else {
 864           UTF8_ERROR_IF(true);
 865         }
 866       } else {
 867         codeLength = 1;
 868         entity[0] = *p;
 869         entity[1] = '\0';
 870       }
 871
 872       if (htmlEnt) {
 873         html_get_entity_map();
 874
 875         char buf[16] = {0};
 876         buf[0] = c;
 877         int len = 1;
 878
 879         if (encode_entity(buf, &len, const_cast<char*>(entity), utf8)) {
 880           *q++ = '&';
 881           const char *s = buf;
 882           for (int n = 0; n < len; n++) {
 883             *q++ = *s++;
 884           }
 885           *q++ = ';';
 886         } else {
 887           memcpy(q, p, codeLength);
 888           q += codeLength;
 889         }
 890       } else {
 891         memcpy(q, p, codeLength);
 892         q += codeLength;
 893       }
 894       p += codeLength - 1;
 895
 896       break;
 897     }
 898     }
 899
 900   }
 901
 902   #undef UTF8_ERROR_IF
 903   #undef UTF8_ERROR_IF_LEN
 904
 905   if (q - ret > INT_MAX) {
 906     goto exit_error;
 907   }
 908   *q = 0;
 909   len = q - ret;
 910   return ret;
 911
 912 exit_error:
 913   free(ret);
 914   return nullptr;
 915 }
 916
 917 char *string_html_encode_extra(const char *input, int &len,
 918                                StringHtmlEncoding flags,
 919                                const AsciiMap *asciiMap) {
 920   assert(input);
 921   /**
 922    * Though seems to be wasting memory a lot, we have to realize most of the
 923    * time this function is called with small strings, or fragments of HTMLs.
 924    * Allocating/deallocating anything less than 1K is trivial these days, and
 925    * we want avoid string copying as much as possible. Of course, the return
 926    * char * is really sent back at large, occupying unnessary space for
 927    * potentially longer time than we need, we have to realize the two closest
 928    * solutions are not that much better, either:
 929    *
 930    * 1. pre-calculate size by iterating through the string once: too time
 931    *    consuming;
 932    * 2. take a guess and double buffer size when over: still wasting, and
 933    *    it may not save that much.
 934    */
 935   char *ret = (char *)malloc(len * 8uL + 1);
 936   if (!ret) {
 937     return nullptr;
 938   }
 939   char *q = ret;
 940   const char *rep = u8"\ufffd";
 941   int32_t srcPosBytes;
 942   for (srcPosBytes = 0; srcPosBytes < len; /* incremented in-loop */) {
 943     unsigned char c = input[srcPosBytes];
 944     if (c && c < 128) {
 945       srcPosBytes++; // Optimize US-ASCII case
 946       if ((asciiMap->map[c & 64 ? 1 : 0] >> (c & 63)) & 1) {
 947         switch (c) {
 948           case '"':
 949             *q++ = '&'; *q++ = 'q'; *q++ = 'u';
 950             *q++ = 'o'; *q++ = 't'; *q++ = ';';
 951             break;
 952           case '\'':
 953             *q++ = '&'; *q++ = '#'; *q++ = '0';
 954             *q++ = '3'; *q++ = '9'; *q++ = ';';
 955             break;
 956           case '<':
 957             *q++ = '&'; *q++ = 'l'; *q++ = 't'; *q++ = ';';
 958             break;
 959           case '>':
 960             *q++ = '&'; *q++ = 'g'; *q++ = 't'; *q++ = ';';
 961             break;
 962           case '&':
 963             *q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
 964             break;
 965           default:
 966             *q++ = '&'; *q++ = '#';
 967             *q++ = c >= 100 ? '1' : '0';
 968             *q++ = ((c / 10) % 10) + '0';
 969             *q++ = (c % 10) + '0';
 970             *q++ = ';';
 971             break;
 972         }
 973       } else {
 974         *q++ = c;
 975       }
 976     } else if (flags & STRING_HTML_ENCODE_UTF8) {
 977       UChar32 curCodePoint;
 978       U8_NEXT(input, srcPosBytes, len, curCodePoint);
 979       if ((flags & STRING_HTML_ENCODE_NBSP) && curCodePoint == 0xC2A0) {
 980         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
 981       } else if (curCodePoint <= 0) {
 982         if (flags & STRING_HTML_ENCODE_UTF8IZE_REPLACE) {
 983           if (flags & STRING_HTML_ENCODE_HIGH) {
 984             *q++ = '&'; *q++ = '#'; *q++ = 'x';
 985             *q++ = 'f'; *q++ = 'f'; *q++ = 'f'; *q++ = 'd';
 986             *q++ = ';';
 987           } else {
 988             const char *r = rep;
 989             while (*r) *q++ = *r++;
 990           }
 991         }
 992       } else if (flags & STRING_HTML_ENCODE_HIGH) {
 993         q += sprintf(q, "&#x%x;", curCodePoint);
 994       } else {
 995         int32_t pos = 0;
 996         U8_APPEND_UNSAFE(q, pos, curCodePoint);
 997         q += pos;
 998       }
 999     } else {
1000       srcPosBytes++; // Optimize US-ASCII case
1001       if (c == 0xa0) {
1002         *q++ = '&'; *q++ = 'n'; *q++ = 'b'; *q++ = 's'; *q++ = 'p'; *q++ = ';';
1003       } else if (flags & STRING_HTML_ENCODE_HIGH) {
1004         *q++ = '&'; *q++ = '#';
1005         *q++ = c >= 200 ? '2' : '1';
1006         *q++ = ((c / 10) % 10) + '0';
1007         *q++ = (c % 10) + '0';
1008         *q++ = ';';
1009       } else {
1010         *q++ = c;
1011       }
1012     }
1013   }
1014   if (q - ret > INT_MAX) {
1015     free(ret);
1016     return nullptr;
1017   }
1018   *q = 0;
1019   len = q - ret;
1020   return ret;
1021 }
1022
1023 char *string_html_decode(const char *input, int &len,
1024                          bool decode_double_quote, bool decode_single_quote,
1025                          const char *charset_hint, bool all,
1026                          bool xhp /* = false */) {
1027   assert(input);
1028
1029   if (!EntityMapInited) {
1030     Lock lock(EntityMapMutex);
1031     if (!EntityMapInited) {
1032       init_entity_table();
1033       EntityMapInited = true;
1034     }
1035   }
1036
1037   entity_charset charset = determine_charset(charset_hint);
1038   if (charset == cs_unknown) {
1039     return nullptr;
1040   }
1041
1042   char *ret = (char *)malloc(len + 1);
1043   char *q = ret;
1044   for (const char *p = input; *p || UNLIKELY(p - input < len); p++) {
1045     char ch = *p;
1046     if (ch != '&') {
1047       *q++ = ch;
1048       continue;
1049     }
1050     p++;
1051
1052     bool found = false;
1053     for (const char *t = p; *t; t++) {
1054       if (*t == ';') {
1055         int l = t - p;
1056         if (l > 0) {
1057           char sbuf[16] = {0};
1058           char *buf;
1059           if (l > 10) {
1060             buf = (char* )malloc(l + 1);
1061           } else {
1062             buf = sbuf;
1063           }
1064           memcpy(buf, p, l);
1065           buf[l] = '\0';
1066           if (decode_entity(buf, &l, decode_double_quote, decode_single_quote,
1067                             charset, all, xhp)) {
1068             memcpy(q, buf, l);
1069             found = true;
1070             p = t;
1071             q += l;
1072           }
1073           if (buf != sbuf) {
1074             free(buf);
1075           }
1076         }
1077         break;
1078       }
1079     }
1080     if (!found) {
1081       p--;
1082       *q++ = '&'; // not an entity
1083     }
1084   }
1085   *q = '\0';
1086   len = q - ret;
1087   return ret;
1088 }
1089
1090 const html_entity_map* html_get_entity_map() {
1091   if (!EntityMapInited) {
1092     Lock lock(EntityMapMutex);
1093     if (!EntityMapInited) {
1094       init_entity_table();
1095       EntityMapInited = true;
1096     }
1097   }
1098   return entity_map;
1099 }
1100
1101 ///////////////////////////////////////////////////////////////////////////////
1102 }