2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1997-2014 The PHP Group |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 3.01 of the PHP license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.php.net/license/3_01.txt |
12 | If you did not receive a copy of the PHP license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@php.net so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
16 | Author: Omar Kilani <omar@php.net> |
17 +----------------------------------------------------------------------+
20 #include "hphp/runtime/base/utf8-decode.h"
21 #include "hphp/util/assertions.h"
/*
 * True when at least `chars_need` bytes are available starting at offset
 * `pos`, i.e. when (m_strlen - pos) >= chars_need. Relies on a variable or
 * member named `m_strlen` being in scope at the point of use.
 */
#define CHECK_LEN(pos, chars_need) ((m_strlen - (pos)) >= (chars_need))
/*
 * Reports whether `c` is valid as a single-byte character or as the leading
 * byte of a multi-byte sequence: anything below 0x80, or a byte in the
 * range 0xC2..0xF4. Bytes 0x80..0xC1 and 0xF5..0xFF are rejected.
 */
static bool utf8_lead(unsigned char c) {
  return c < 0x80 || (c >= 0xC2 && c <= 0xF4);
}
/*
 * Reports whether `c` has the form of a trailing (continuation) byte, i.e.
 * lies in the range 0x80..0xBF.
 *
 * Whether the byte is actually valid depends on other stuff; this function
 * cannot check for non-shortest forms, surrogates or code points above
 * 0x10FFFF.
 */
static bool utf8_trail(unsigned char c) {
  return c >= 0x80 && c <= 0xBF;
}
39 #define MB_FAILURE(pos, advance) do { \
40 m_cursor = pos + (advance); \
44 // Inspired by ext/standard/html.c:get_next_char()
45 unsigned int UTF8To16Decoder::getNextChar() {
47 unsigned int this_char
= 0;
49 assertx(pos
<= m_strlen
);
51 if (!CHECK_LEN(pos
, 1))
54 /* We'll follow strategy 2. from section 3.6.1 of UTR #36:
55 * "In a reported illegal byte sequence, do not include any
56 * non-initial byte that encodes a valid character or is a leading
57 * byte for a valid sequence." */
58 unsigned char c
= m_str
[pos
];
62 } else if (c
< 0xc2) {
64 } else if (c
< 0xe0) {
65 if (!CHECK_LEN(pos
, 2))
68 if (!utf8_trail(m_str
[pos
+ 1])) {
69 MB_FAILURE(pos
, utf8_lead(m_str
[pos
+ 1]) ? 1 : 2);
71 this_char
= ((c
& 0x1f) << 6) | (m_str
[pos
+ 1] & 0x3f);
72 if (this_char
< 0x80) { /* non-shortest form */
76 } else if (c
< 0xf0) {
77 int avail
= m_strlen
- pos
;
80 !utf8_trail(m_str
[pos
+ 1]) || !utf8_trail(m_str
[pos
+ 2])) {
81 if (avail
< 2 || utf8_lead(m_str
[pos
+ 1]))
83 else if (avail
< 3 || utf8_lead(m_str
[pos
+ 2]))
89 this_char
= ((c
& 0x0f) << 12) | ((m_str
[pos
+ 1] & 0x3f) << 6) |
90 (m_str
[pos
+ 2] & 0x3f);
91 if (this_char
< 0x800) { /* non-shortest form */
93 } else if (this_char
>= 0xd800 && this_char
<= 0xdfff) { /* surrogate */
97 } else if (c
< 0xf5) {
98 int avail
= m_strlen
- pos
;
101 !utf8_trail(m_str
[pos
+ 1]) || !utf8_trail(m_str
[pos
+ 2]) ||
102 !utf8_trail(m_str
[pos
+ 3])) {
103 if (avail
< 2 || utf8_lead(m_str
[pos
+ 1]))
105 else if (avail
< 3 || utf8_lead(m_str
[pos
+ 2]))
107 else if (avail
< 4 || utf8_lead(m_str
[pos
+ 3]))
113 this_char
= ((c
& 0x07) << 18) | ((m_str
[pos
+ 1] & 0x3f) << 12) |
114 ((m_str
[pos
+ 2] & 0x3f) << 6) | (m_str
[pos
+ 3] & 0x3f);
115 if (this_char
< 0x10000 || this_char
> 0x10FFFF) {
116 /* non-shortest form or outside range */
128 int UTF8To16Decoder::decodeTail() {
134 m_low_surrogate
= (0xDC00 | (c
& 0x3FF));
135 return (0xD800 | (c
>> 10));
139 int UTF8To16Decoder::decodeAsUTF8() {
140 if (m_index
== m_cursor
) {
141 // validate the next char
147 return m_str
[m_index
++] & 0xFF;
150 int UTF8To16Decoder::getNext() {
151 int c
= getNextChar();
153 /*** BEGIN Facebook: json_utf8_loose ***/
154 if (m_cursor
> m_strlen
) {
162 /*** END Facebook: json_utf8_loose ***/
168 ///////////////////////////////////////////////////////////////////////////////