Enable HHIRInliningUseReachableCost by default, tweak inlining constants
[hiphop-php.git] / hphp / runtime / base / utf8-decode.cpp
blobbc3a929799c4c5881141cdb7e07a8e3e2b6933b8
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   | Copyright (c) 1997-2014 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Author: Omar Kilani <omar@php.net>                                   |
   +----------------------------------------------------------------------+
*/
20 #include "hphp/runtime/base/utf8-decode.h"
21 #include "hphp/util/assertions.h"
23 namespace HPHP {
/* True when at least `chars_need` bytes remain in the input starting at
 * offset `pos`. Reads m_strlen from the scope where the macro is expanded. */
#define CHECK_LEN(pos, chars_need) ((m_strlen - (pos)) >= (chars_need))
/* valid as single byte character or leading byte:
 * either below 0x80, or in the range 0xC2..0xF4 inclusive */
static bool utf8_lead(unsigned char c) {
  return c < 0x80 || (c >= 0xC2 && c <= 0xF4);
}
/* True when c lies in the trailing-byte range 0x80..0xBF.
 * Whether the sequence is actually valid depends on other stuff;
 * this check cannot detect non-shortest forms, surrogates or
 * code points above 0x10FFFF */
static bool utf8_trail(unsigned char c) {
  return c >= 0x80 && c <= 0xBF;
}
/* Fail the enclosing decode: park m_cursor `advance` bytes past `pos` and
 * return -1 from the function in which the macro is expanded.
 * The do { } while (0) wrapper makes the expansion a single statement, so it
 * is safe as the body of an unbraced if/else. Both arguments are
 * parenthesized so that compound expressions expand as intended. */
#define MB_FAILURE(pos, advance) do { \
  m_cursor = (pos) + (advance);       \
  return -1;                          \
} while (0)
44 // Inspired by ext/standard/html.c:get_next_char()
45 unsigned int UTF8To16Decoder::getNextChar() {
46 int pos = m_cursor;
47 unsigned int this_char = 0;
49 assertx(pos <= m_strlen);
51 if (!CHECK_LEN(pos, 1))
52 MB_FAILURE(pos, 1);
54 /* We'll follow strategy 2. from section 3.6.1 of UTR #36:
55 * "In a reported illegal byte sequence, do not include any
56 * non-initial byte that encodes a valid character or is a leading
57 * byte for a valid sequence." */
58 unsigned char c = m_str[pos];
59 if (c < 0x80) {
60 this_char = c;
61 pos++;
62 } else if (c < 0xc2) {
63 MB_FAILURE(pos, 1);
64 } else if (c < 0xe0) {
65 if (!CHECK_LEN(pos, 2))
66 MB_FAILURE(pos, 1);
68 if (!utf8_trail(m_str[pos + 1])) {
69 MB_FAILURE(pos, utf8_lead(m_str[pos + 1]) ? 1 : 2);
71 this_char = ((c & 0x1f) << 6) | (m_str[pos + 1] & 0x3f);
72 if (this_char < 0x80) { /* non-shortest form */
73 MB_FAILURE(pos, 2);
75 pos += 2;
76 } else if (c < 0xf0) {
77 int avail = m_strlen - pos;
79 if (avail < 3 ||
80 !utf8_trail(m_str[pos + 1]) || !utf8_trail(m_str[pos + 2])) {
81 if (avail < 2 || utf8_lead(m_str[pos + 1]))
82 MB_FAILURE(pos, 1);
83 else if (avail < 3 || utf8_lead(m_str[pos + 2]))
84 MB_FAILURE(pos, 2);
85 else
86 MB_FAILURE(pos, 3);
89 this_char = ((c & 0x0f) << 12) | ((m_str[pos + 1] & 0x3f) << 6) |
90 (m_str[pos + 2] & 0x3f);
91 if (this_char < 0x800) { /* non-shortest form */
92 MB_FAILURE(pos, 3);
93 } else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
94 MB_FAILURE(pos, 3);
96 pos += 3;
97 } else if (c < 0xf5) {
98 int avail = m_strlen - pos;
100 if (avail < 4 ||
101 !utf8_trail(m_str[pos + 1]) || !utf8_trail(m_str[pos + 2]) ||
102 !utf8_trail(m_str[pos + 3])) {
103 if (avail < 2 || utf8_lead(m_str[pos + 1]))
104 MB_FAILURE(pos, 1);
105 else if (avail < 3 || utf8_lead(m_str[pos + 2]))
106 MB_FAILURE(pos, 2);
107 else if (avail < 4 || utf8_lead(m_str[pos + 3]))
108 MB_FAILURE(pos, 3);
109 else
110 MB_FAILURE(pos, 4);
113 this_char = ((c & 0x07) << 18) | ((m_str[pos + 1] & 0x3f) << 12) |
114 ((m_str[pos + 2] & 0x3f) << 6) | (m_str[pos + 3] & 0x3f);
115 if (this_char < 0x10000 || this_char > 0x10FFFF) {
116 /* non-shortest form or outside range */
117 MB_FAILURE(pos, 4);
119 pos += 4;
120 } else {
121 MB_FAILURE(pos, 1);
124 m_cursor = pos;
125 return this_char;
128 int UTF8To16Decoder::decodeTail() {
129 int c = getNext();
130 if (c < 0x10000) {
131 return c;
132 } else {
133 c -= 0x10000;
134 m_low_surrogate = (0xDC00 | (c & 0x3FF));
135 return (0xD800 | (c >> 10));
139 int UTF8To16Decoder::decodeAsUTF8() {
140 if (m_index == m_cursor) {
141 // validate the next char
142 int c = getNext();
143 if (c < 0) {
144 return c;
147 return m_str[m_index++] & 0xFF;
150 int UTF8To16Decoder::getNext() {
151 int c = getNextChar();
152 if (c < 0) {
153 /*** BEGIN Facebook: json_utf8_loose ***/
154 if (m_cursor > m_strlen) {
155 return UTF8_END;
157 if (m_loose) {
158 return '?';
159 } else {
160 return UTF8_ERROR;
162 /*** END Facebook: json_utf8_loose ***/
163 } else {
164 return c;
168 ///////////////////////////////////////////////////////////////////////////////