Add options to enable sharding
[hiphop-php.git] / hphp / zend / zend-html.h
blobf786bae28b3ae94ebc84e8b005bfc4be0206260f
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #ifndef incl_HPHP_ZEND_HTML_H_
19 #define incl_HPHP_ZEND_HTML_H_
21 #include <cstdint>
23 // Avoid dragging in the icu namespace.
24 #ifndef U_USING_ICU_NAMESPACE
25 #define U_USING_ICU_NAMESPACE 0
26 #endif
28 namespace HPHP {
29 ///////////////////////////////////////////////////////////////////////////////
30 /**
31 * Major departures from Zend:
33 * 1. We are only supporting UTF-8 and ISO-8859-1 encoding.
34 * The major reason for this is that the original get_next_char() bothers me,
35 * sacrificing performance for some character sets that people rarely use
36 * or that people shouldn't use. UTF-8 should really be the standard string
37 * format everywhere, and we ought to write code specialized for it to
38 * take full advantage of it: one example would be the new html encoding
39 * function that simply does *p one at a time, iterating through the string to
40 * look for special characters for entity escaping.
42 * 2. The HTML encoding function no longer encodes entities other than the basic
43 * ones. There is no need to encode them, since all browsers support UTF-8
44 * natively, and it is fine to send out UTF-8 encoded characters without
45 * turning them into printable ASCII. Basic entities are encoded for
46 * a different reason! In fact, I personally don't see why the HTML spec has
47 * that extended list of entities, other than as a historical artifact.
49 * 3. The double encoding parameter is not supported. That really sounds like
50 * a workaround for buggy code. I have not found a legitimate use for it yet.
// Two 64-bit words, i.e. 128 bits of storage in total. A pointer to one of
// these is accepted by string_html_encode_extra().
// NOTE(review): presumably a bitmap with one bit per 7-bit ASCII code
// selecting which characters to escape — confirm against the definition.
53 struct AsciiMap {
54 uint64_t map[2];
// Type of the `flags` argument of string_html_encode_extra(). Every
// enumerator is a distinct power of two.
// NOTE(review): the values look intended to be OR-ed together as a bit set;
// the meaning of each bit is defined by the implementation — confirm there.
57 enum StringHtmlEncoding {
58 STRING_HTML_ENCODE_UTF8 = 1,
59 STRING_HTML_ENCODE_NBSP = 2,
60 STRING_HTML_ENCODE_HIGH = 4,
61 STRING_HTML_ENCODE_UTF8IZE_REPLACE = 8
// Quote-handling, invalid-character and document-mode bits. Apart from
// ENT_BM_NOQUOTES (0), every enumerator is a distinct power of two.
// Being an `enum class`, the values need an explicit cast before they can be
// combined with | or compared against an integer.
// NOTE(review): presumably these are the bits carried by the `qsBitmask`
// argument of string_html_encode() — confirm against the callers.
64 enum class EntBitmask {
65 ENT_BM_NOQUOTES = 0, /* leave all quotes alone */
66 ENT_BM_SINGLE = 1, /* escape single quotes only */
67 ENT_BM_DOUBLE = 2, /* escape double quotes only */
68 ENT_BM_IGNORE = 4, /* silently discard invalid chars */
69 ENT_BM_SUBSTITUTE = 8, /* replace invalid chars with U+FFFD */
70 ENT_BM_XML1 = 16, /* XML1 mode */
71 ENT_BM_XHTML = 32, /* XHTML mode */
// Character-set identifiers. The unscoped enum lives in its own namespace, so
// the cs_* enumerators are reached as entity_charset_enum::cs_*; the typedef
// that follows exposes the type itself under the shorter name entity_charset.
// No enumerator has an explicit value, so cs_terminator is 0 and the rest
// count up in declaration order, ending with cs_end.
74 namespace entity_charset_enum {
75 enum entity_charset_impl {
76 cs_terminator, cs_8859_1, cs_cp1252,
77 cs_8859_15, cs_utf_8, cs_big5, cs_gb2312,
78 cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
79 cs_cp1251, cs_8859_5, cs_cp866, cs_macroman,
80 cs_unknown,
81 cs_end
84 typedef entity_charset_enum::entity_charset_impl entity_charset;
// Record describing one basic entity: a 16-bit character code, a C string
// for the entity, an int length and an int flags word.
// NOTE(review): entitylen is presumably the length of `entity`, and flags
// presumably holds EntBitmask-style bits — confirm against the table that
// instantiates this struct.
86 struct HtmlBasicEntity {
87 unsigned short charcode;
88 const char *entity;
89 int entitylen;
90 int flags;
// entity_table_t is a const pointer to const char, so one table is an array
// of immutable C strings.
93 typedef const char *const entity_table_t;
// Describes one entity table: the charset it applies to, the character-code
// range it covers (basechar is the first code, endchar the last code in the
// table), and a pointer to the table itself.
// NOTE(review): presumably table[i] is the entity for code basechar + i —
// confirm against the lookup code.
95 struct html_entity_map {
96 entity_charset charset; /* charset identifier */
97 unsigned short basechar; /* char code at start of table */
98 unsigned short endchar; /* last char code in the table */
99 entity_table_t *table; /* the table of mappings */
// Returns a pointer to read-only html_entity_map data; takes no arguments.
// NOTE(review): presumably the first element of an array that ends with an
// entry whose charset is cs_terminator — confirm against the definition.
102 const html_entity_map* html_get_entity_map();
105 * returns cs_unknown iff not found;
106 * if input null, returns default charset of cs_utf_8
// Takes a single C string and returns an entity_charset value.
// NOTE(review): the parameter is unnamed here; it is presumably a charset
// name or hint, since string_html_decode()'s documentation refers to this
// function for its charset_hint argument — confirm against the definition.
108 entity_charset determine_charset(const char*);
// HTML-encodes `input` and returns a char buffer. `len` is a non-const
// reference, so the callee is able to modify it.
// NOTE(review): presumably len is the input length on entry and the result
// length on return, and qsBitmask carries EntBitmask bits — confirm.
// NOTE(review): ownership of the returned buffer is not visible from this
// declaration — check the definition before freeing.
// NOTE(review): the "Major departures from Zend" notes in this file say the
// double encoding parameter is not supported, yet a `dEncode` flag is
// declared here; verify which of the two is current.
110 char *string_html_encode(const char *input, int &len,
111 const int64_t qsBitmask, bool utf8,
112 bool dEncode, bool htmlEnt);
// Variant of the encoder driven by StringHtmlEncoding `flags` and a pointer
// to a read-only AsciiMap. `len` is a non-const reference, so the callee is
// able to modify it.
// NOTE(review): presumably len is the input length on entry and the result
// length on return, and asciiMap selects which ASCII characters get escaped
// — confirm against the definition. Ownership of the returned buffer is not
// visible from this declaration.
113 char *string_html_encode_extra(const char *input, int &len,
114 StringHtmlEncoding flags,
115 const AsciiMap *asciiMap);
118 * returns decoded string;
119 * note, can return nullptr if the charset could not be detected
120 * using the given charset_hint; can also pass in nullptr
121 * for the charset_hint to use the default one (UTF-8).
122 * (see determine_charset).
// Parameter notes: `len` is a non-const reference, so the callee is able to
// modify it; `xhp` defaults to false, so callers may omit it.
// NOTE(review): presumably len is the input length on entry and the decoded
// length on return; decode_double_quote / decode_single_quote presumably
// control whether the corresponding quote entities are decoded, and `all`
// whether the full entity table is used rather than only the basic ones —
// confirm against the definition. Ownership of the returned buffer is not
// visible from this declaration.
124 char *string_html_decode(const char *input, int &len,
125 bool decode_double_quote, bool decode_single_quote,
126 const char *charset_hint,
127 bool all, bool xhp = false );
129 ///////////////////////////////////////////////////////////////////////////////
132 #endif