1 /**********************************************************************
2 utf_16be.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * EncLen_UTF16: 256-entry lookup table (16 rows of 16 values).
 * Every entry is 2 except indices 0xD8..0xDB, which are 4.
 * Elsewhere in this file it is indexed with the byte found at the start
 * of a character to obtain the number of bytes to advance
 * (see utf16be_is_mbc_ambiguous).
 * NOTE(review): the closing "};" of the initializer is not present in the
 * text as it stands -- confirm against the upstream file.
 */
34 static const int EncLen_UTF16
[] = {
35 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
36 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
37 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
38 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
39 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
40 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
41 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
42 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
43 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
46 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* Row 0xD0..0xDF: entries 0xD8..0xDB are the only 4-byte markers. */
48 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
49 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
50 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
/*
 * utf16be_mbc_enc_len: classify the character that starts at p and report
 * the result through the ONIGENC_CONSTRUCT_MBCLEN_* macros.
 *
 * Outcomes visible in this function:
 *   - `byte` is not a surrogate:        CHARFOUND(2) or NEEDMORE(1)
 *   - `byte` is a first surrogate:      NEEDMORE(3), NEEDMORE(2),
 *                                       NEEDMORE(1) or CHARFOUND(4)
 *   - anything else:                    INVALID()
 *
 * Both `e` and `enc` are tagged ARG_UNUSED in the parameter list.
 *
 * NOTE(review): the return type, the declaration of `byte`, the conditions
 * that choose between CHARFOUND and NEEDMORE, and the statement the
 * `case` labels belong to are not present in the text as it stands --
 * confirm against the upstream file before relying on this description.
 */
55 utf16be_mbc_enc_len(const UChar
* p
, const OnigUChar
* e ARG_UNUSED
,
56 OnigEncoding enc ARG_UNUSED
)
/* Non-surrogate lead byte: the character is a single 2-byte unit. */
59 if (!UTF16_IS_SURROGATE(byte
)) {
61 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
63 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
/* First half of a surrogate pair: a complete character needs 4 bytes. */
65 if (UTF16_IS_SURROGATE_FIRST(byte
)) {
67 case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3);
68 case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2);
/* p[2] is the first byte of the second 16-bit unit; it is checked before
 * either "one more byte needed" or "4-byte character found" is reported. */
70 if (UTF16_IS_SURROGATE_SECOND(p
[2]))
71 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
74 if (UTF16_IS_SURROGATE_SECOND(p
[2]))
75 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
/* Fall-through result when none of the cases above returned. */
79 return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
/*
 * utf16be_is_mbc_newline: test whether the two bytes at p form a line
 * terminator. p[0] is compared as the high byte and p[1] as the low byte.
 *
 * Visible tests:
 *   - 0x00 0x0a (U+000A)
 *   - when USE_UNICODE_ALL_LINE_TERMINATORS is defined: low byte 0x0b,
 *     0x0c, 0x0d or 0x85, and the pairs 0x20 0x28 / 0x20 0x29
 *     (U+2028 / U+2029).
 *
 * NOTE(review): the return type, the last parameter, any use of `end` as a
 * bounds check, the high-byte condition that completes the 0x0b..0x85
 * test, the matching #endif and every return statement are not present in
 * the text as it stands -- confirm against the upstream file.
 */
83 utf16be_is_mbc_newline(const UChar
* p
, const UChar
* end
,
/* LINE FEED: high byte 0x00, low byte 0x0a. */
87 if (*(p
+1) == 0x0a && *p
== 0x00)
89 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
/* Additional terminators, identified here by their low byte only. */
90 if ((*(p
+1) == 0x0b || *(p
+1) == 0x0c || *(p
+1) == 0x0d || *(p
+1) == 0x85)
/* High byte 0x20 with low byte 0x28 or 0x29. */
93 if (*p
== 0x20 && (*(p
+1) == 0x29 || *(p
+1) == 0x28))
/*
 * utf16be_mbc_to_code: decode the character at p into a code point.
 *
 * - If *p satisfies UTF16_IS_SURROGATE_FIRST, the character is read as two
 *   big-endian 16-bit units (p[0..1] and p[2..3]): the low 10 bits of the
 *   first unit become bits 10..19, the low 10 bits of the second unit
 *   become bits 0..9, and 0x10000 is added.
 * - Otherwise the code is the single big-endian unit p[0] * 256 + p[1].
 *
 * `end` is tagged ARG_UNUSED; in the visible lines p[2] and p[3] are read
 * without a length check.
 *
 * NOTE(review): the return type, the last parameter, the declaration of
 * `code`, the else branch header and the return statement are not present
 * in the text as it stands -- confirm against the upstream file.
 */
101 utf16be_mbc_to_code(const UChar
* p
, const UChar
* end ARG_UNUSED
,
106 if (UTF16_IS_SURROGATE_FIRST(*p
)) {
/* Surrogate pair: (hi & 0x3ff) << 10, plus (lo & 0x3ff), plus 0x10000. */
107 code
= ((((p
[0] << 8) + p
[1]) & 0x03ff) << 10)
108 + (((p
[2] << 8) + p
[3]) & 0x03ff) + 0x10000;
/* Single 16-bit unit, high byte first. */
111 code
= p
[0] * 256 + p
[1];
/*
 * utf16be_code_to_mbclen: number of bytes needed to encode `code`:
 * 4 when code is greater than 0xffff, otherwise 2.
 *
 * NOTE(review): the return type, the remaining parameter(s) and the
 * function braces are not present in the text as it stands -- confirm
 * against the upstream file.
 */
117 utf16be_code_to_mbclen(OnigCodePoint code
,
120 return (code
> 0xffff ? 4 : 2);
/*
 * utf16be_code_to_mbc: encode `code` as big-endian bytes written through
 * the cursor `p`.
 *
 * Two paths are visible:
 *   - a surrogate-pair path that derives two 16-bit units `high` and `low`
 *     and stores their high bytes;
 *   - a single-unit path that stores the high byte, then the low byte, of
 *     `code`.
 *
 * NOTE(review): the return type, the last parameter, the declaration of
 * `p` (presumably initialised from `buf` -- confirm), the condition that
 * selects between the two paths, the stores of the low bytes of `high`
 * and `low`, and the return statement are not present in the text as it
 * stands -- confirm against the upstream file.
 */
124 utf16be_code_to_mbc(OnigCodePoint code
, UChar
*buf
,
/* 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset
 * subtraction into the constant while forming the first unit. */
130 unsigned int high
= (code
>> 10) + 0xD7C0;
/* Second unit: low 10 bits of the code on top of 0xDC00. */
131 unsigned int low
= (code
& 0x3FF) + 0xDC00;
132 *p
++ = (high
>> 8) & 0xFF;
134 *p
++ = (low
>> 8) & 0xFF;
/* Single-unit path: high byte first, then low byte. */
139 *p
++ = (UChar
)((code
& 0xff00) >> 8);
140 *p
++ = (UChar
)(code
& 0xff);
/*
 * utf16be_mbc_case_fold: case-fold the character at *pp into `fold`.
 *
 * Visible behaviour:
 *   - A fast path is taken when the high byte (*p) is 0 and the low byte
 *     (*(p+1)) satisfies ONIGENC_IS_ASCII_CODE; inside it a byte of `fold`
 *     is set with ONIGENC_ASCII_CODE_TO_LOWER_CASE.
 *   - When USE_UNICODE_CASE_FOLD_TURKISH_AZERI is defined, the
 *     ONIGENC_CASE_FOLD_TURKISH_AZERI bit of `flag` is tested inside the
 *     fast path.
 *   - Otherwise the work is delegated to onigenc_unicode_mbc_case_fold().
 *
 * NOTE(review): the return type, the last parameter, the body of the
 * Turkish/Azeri branch, how p / *pp / fold are advanced, the fast-path
 * return value and the remaining arguments of the delegated call are not
 * present in the text as it stands -- confirm against the upstream file.
 */
146 utf16be_mbc_case_fold(OnigCaseFoldType flag
,
147 const UChar
** pp
, const UChar
* end
, UChar
* fold
,
150 const UChar
* p
= *pp
;
/* ASCII range in UTF-16BE: high byte zero, low byte an ASCII code. */
152 if (ONIGENC_IS_ASCII_CODE(*(p
+1)) && *p
== 0) {
154 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
155 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
166 *fold
= ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p
);
/* Everything else goes through the shared Unicode folding routine. */
171 return onigenc_unicode_mbc_case_fold(enc
, flag
,
/*
 * utf16be_is_mbc_ambiguous: report (TRUE/FALSE) whether the character at
 * *pp is case-ambiguous, and advance *pp past it.
 *
 * Visible behaviour:
 *   - *pp is advanced by EncLen_UTF16[first byte] before any test.
 *   - SHARP_s combined with INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR in
 *     `flag` is handled as a special case.
 *   - `v` receives the UPPER|LOWER ctype bits of `c` from
 *     ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE.
 *   - The final result is TRUE when v is non-zero, FALSE otherwise.
 *
 * NOTE(review): `(v | BIT_CTYPE_LOWER) != 0` is non-zero whenever
 * BIT_CTYPE_LOWER itself is non-zero, so the test looks always-true; `&`
 * may have been intended -- confirm before changing.
 * NOTE(review): the return type, the declarations of `c` and `v`, the test
 * on the high byte, the values returned inside the branches and the
 * function braces are not present in the text as it stands -- confirm
 * against the upstream file.
 */
177 utf16be_is_mbc_ambiguous(OnigCaseFoldType flag
, const UChar
** pp
, const UChar
* end
)
179 const UChar
* p
= *pp
;
/* Step the caller's cursor over this character (2 or 4 bytes). */
181 (*pp
) += EncLen_UTF16
[*p
];
187 if (*p
== SHARP_s
&& (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
192 v
= ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c
,
193 (BIT_CTYPE_UPPER
| BIT_CTYPE_LOWER
));
195 if ((v
| BIT_CTYPE_LOWER
) != 0) {
196 /* 0xaa, 0xb5 and 0xba are lower-case letters, but cannot be converted. */
/* NOTE(review): the test below covers the whole range 0xaa..0xba, not only
 * the three values named above -- confirm this is intended. */
197 if (c
>= 0xaa && c
<= 0xba)
202 return (v
!= 0 ? TRUE
: FALSE
);
/*
 * utf16be_left_adjust_char_head: given a position `s` inside the buffer
 * that begins at `start`, locate the head of the character containing it.
 *
 * Visible behaviour:
 *   - If s <= start, s is returned unchanged (cast to UChar*).
 *   - An odd byte offset from `start` is detected ((s - start) % 2 == 1),
 *     i.e. s points into the middle of a 16-bit unit.
 *   - If *s satisfies UTF16_IS_SURROGATE_SECOND and s is more than one byte
 *     past `start`, a further adjustment is made.
 *
 * `enc` is tagged ARG_UNUSED.
 *
 * NOTE(review): the return type, the statements executed inside both
 * conditions (presumably stepping s backwards -- confirm) and the final
 * return are not present in the text as it stands -- confirm against the
 * upstream file.
 */
210 utf16be_left_adjust_char_head(const UChar
* start
, const UChar
* s
, const UChar
* end
,
211 OnigEncoding enc ARG_UNUSED
)
/* Nothing lies to the left of the buffer start. */
213 if (s
<= start
) return (UChar
* )s
;
/* Odd offset: s is on the second byte of a 2-byte unit. */
215 if ((s
- start
) % 2 == 1) {
/* Second half of a surrogate pair with room for a preceding unit. */
219 if (UTF16_IS_SURROGATE_SECOND(*s
) && s
> start
+ 1)
/*
 * utf16be_get_case_fold_codes_by_str: UTF-16BE entry point that forwards
 * unchanged to onigenc_unicode_get_case_fold_codes_by_str(), passing
 * enc, flag, p, end and items, and returns that function's result.
 *
 * NOTE(review): the return type, the declaration of the `enc` parameter
 * and the function braces are not present in the text as it stands --
 * confirm against the upstream file.
 */
226 utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag
,
227 const OnigUChar
* p
, const OnigUChar
* end
,
228 OnigCaseFoldCodeItem items
[],
231 return onigenc_unicode_get_case_fold_codes_by_str(enc
,
232 flag
, p
, end
, items
);
/*
 * Encoding descriptor for UTF-16BE, built with OnigEncodingDefine.
 * The initializer supplies the encoding name, the maximum (4) and minimum
 * (2) byte length of a character, and then a list of handlers: the
 * utf16be_* functions defined in this file plus shared onigenc_* routines,
 * followed by ONIGENC_FLAG_UNICODE.
 *
 * NOTE(review): some initializer entries and the closing "};" are not
 * present in the text as it stands (for example no entry for
 * utf16be_mbc_enc_len, utf16be_mbc_to_code or utf16be_code_to_mbc is
 * visible) -- confirm the full field order against the upstream file.
 */
235 OnigEncodingDefine(utf_16be
, UTF_16BE
) = {
237 "UTF-16BE", /* name */
238 4, /* max byte length */
239 2, /* min byte length */
240 utf16be_is_mbc_newline
,
242 utf16be_code_to_mbclen
,
244 utf16be_mbc_case_fold
,
245 onigenc_unicode_apply_all_case_fold
,
246 utf16be_get_case_fold_codes_by_str
,
247 onigenc_unicode_property_name_to_ctype
,
248 onigenc_unicode_is_code_ctype
,
249 onigenc_utf16_32_get_ctype_code_range
,
250 utf16be_left_adjust_char_head
,
251 onigenc_always_false_is_allowed_reverse_match
,
252 onigenc_unicode_case_map
,
254 ONIGENC_FLAG_UNICODE
,
/* Presumably registers "UCS-2BE" as an alternative name for "UTF-16BE" --
 * confirm against the ENC_ALIAS definition. */
256 ENC_ALIAS("UCS-2BE", "UTF-16BE")