1 /**********************************************************************
2 utf_32be.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/* Forward declaration: utf32be_mbc_to_code() is called from
 * utf32be_mbc_enc_len() below, ahead of its own definition. */
33 static OnigCodePoint
utf32be_mbc_to_code(const UChar
* p
, const UChar
* end
, OnigEncoding enc
);
/*
 * utf32be_mbc_enc_len: classify the bytes in [p, e) as one UTF-32BE unit and
 * return an ONIGENC_CONSTRUCT_MBCLEN_* result.
 *
 * Outcomes visible here:
 *   - INVALID
 *   - NEEDMORE(4 - (e - p)): how many bytes are still missing from a 4-byte unit
 *   - otherwise the unit is decoded with utf32be_mbc_to_code(); the result is
 *     INVALID when the code point fails UNICODE_VALID_CODEPOINT_P, else
 *     CHARFOUND(4).
 *
 * NOTE(review): the conditions guarding the first two returns are not visible
 * in this excerpt.
 * NOTE(review): p is tagged ARG_UNUSED although it is used below (in e - p and
 * in the decode call) -- confirm the tag is intentional.
 */
35 utf32be_mbc_enc_len(const UChar
* p ARG_UNUSED
, const OnigUChar
* e
,
39 return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
/* Report the shortfall relative to a full 4-byte unit. */
42 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-(int)(e
-p
));
/* Decode the unit and reject values that are not valid code points. */
45 OnigCodePoint c
= utf32be_mbc_to_code(p
, e
, enc
);
46 if (!UNICODE_VALID_CODEPOINT_P(c
))
47 return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
48 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
/*
 * utf32be_is_mbc_newline: test whether the four bytes at p encode a line
 * terminator in big-endian order (p[0] most significant, p[3] least).
 *
 * Always tested:  00 00 00 0A.
 * Only when USE_UNICODE_ALL_LINE_TERMINATORS is defined:
 *   00 00 00 {0B, 0C, 0D, 85} and 00 00 20 {28, 29}.
 *
 * NOTE(review): the values returned for each test, and any check of p
 * against end, are not visible in this excerpt.
 */
53 utf32be_is_mbc_newline(const UChar
* p
, const UChar
* end
,
54 OnigEncoding enc ARG_UNUSED
)
/* Bytes 00 00 00 0A. */
57 if (*(p
+3) == 0x0a && *(p
+2) == 0 && *(p
+1) == 0 && *p
== 0)
59 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
/* Low byte is 0B, 0C, 0D or 85 and the upper three bytes are zero. */
60 if ((*(p
+3) == 0x0b || *(p
+3) == 0x0c || *(p
+3) == 0x0d || *(p
+3) == 0x85)
61 && *(p
+2) == 0 && *(p
+1) == 0 && *p
== 0x00)
/* Bytes 00 00 20 28 or 00 00 20 29. */
63 if (*(p
+2) == 0x20 && (*(p
+3) == 0x29 || *(p
+3) == 0x28)
64 && *(p
+1) == 0 && *p
== 0)
/*
 * utf32be_mbc_to_code: combine the four bytes at p into one OnigCodePoint,
 * with p[0] as the most significant byte and p[3] as the least significant.
 * Reads exactly p[0]..p[3]; end and enc are tagged ARG_UNUSED, so no bounds
 * check against end is made here.
 */
72 utf32be_mbc_to_code(const UChar
* p
, const UChar
* end ARG_UNUSED
,
73 OnigEncoding enc ARG_UNUSED
)
75 return (OnigCodePoint
)(((p
[0] * 256 + p
[1]) * 256 + p
[2]) * 256 + p
[3]);
/*
 * utf32be_code_to_mbclen: both parameters are tagged ARG_UNUSED, so the
 * result cannot depend on the code point or the encoding.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the fixed unit size of 4 -- confirm.
 */
79 utf32be_code_to_mbclen(OnigCodePoint code ARG_UNUSED
,
80 OnigEncoding enc ARG_UNUSED
)
/*
 * utf32be_code_to_mbc: store code as four bytes, most significant first
 * (bits 31-24, 23-16, 15-8, 7-0), through a cursor p that advances by one
 * byte per store.
 * NOTE(review): the declaration of p and the return statement are not
 * visible in this excerpt; presumably p starts at buf -- confirm.
 */
86 utf32be_code_to_mbc(OnigCodePoint code
, UChar
*buf
,
87 OnigEncoding enc ARG_UNUSED
)
/* Bits 31-24. */
91 *p
++ = (UChar
)((code
& 0xff000000) >>24);
/* Bits 23-16. */
92 *p
++ = (UChar
)((code
& 0xff0000) >>16);
/* Bits 15-8. */
93 *p
++ = (UChar
)((code
& 0xff00) >> 8);
/* Bits 7-0. */
94 *p
++ = (UChar
) (code
& 0xff);
/*
 * utf32be_mbc_case_fold: case-fold the UTF-32BE unit at *pp into fold.
 *
 * Visible behaviour:
 *   - p is loaded from *pp.
 *   - When the unit is 00 00 00 xx and xx satisfies ONIGENC_IS_ASCII_CODE,
 *     the low byte is lowered with ONIGENC_ASCII_CODE_TO_LOWER_CASE and
 *     stored through fold.
 *   - Under USE_UNICODE_CASE_FOLD_TURKISH_AZERI, when the
 *     ONIGENC_CASE_FOLD_TURKISH_AZERI flag is set, a low byte of 0x49 is
 *     singled out for separate handling.
 *   - A call to onigenc_unicode_mbc_case_fold(enc, flag, pp, ...) is returned.
 *
 * NOTE(review): most of the body (the other stores through fold, updates to
 * *pp, return values, the 0x49 handling itself, and how the final call nests
 * relative to the ASCII test) is not visible in this excerpt.
 */
99 utf32be_mbc_case_fold(OnigCaseFoldType flag
,
100 const UChar
** pp
, const UChar
* end
, UChar
* fold
,
103 const UChar
* p
= *pp
;
/* ASCII range: upper three bytes are zero and the low byte is ASCII. */
105 if (ONIGENC_IS_ASCII_CODE(*(p
+3)) && *(p
+2) == 0 && *(p
+1) == 0 && *p
== 0) {
109 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
110 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
/* 0x49 is ASCII 'I'. */
111 if (*(p
+3) == 0x49) {
121 *fold
= ONIGENC_ASCII_CODE_TO_LOWER_CASE(*(p
+3));
/* Hand over to the shared Unicode case-fold routine. */
126 return onigenc_unicode_mbc_case_fold(enc
, flag
, pp
,
/*
 * utf32be_is_mbc_ambiguous: decide whether the UTF-32BE unit at *pp is
 * case-ambiguous.
 *
 * Visible behaviour:
 *   - p is loaded from *pp.
 *   - The detailed checks run only when the upper three bytes are zero.
 *   - SHARP_s is singled out when INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is
 *     set in flag.
 *   - v receives the result of ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE for c
 *     against (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER).
 *   - Values of c in 0xaa..0xba are treated separately.
 *
 * NOTE(review): the declarations and assignments of c and v, any adjustment
 * of p or *pp, and most return statements are not visible in this excerpt.
 */
132 utf32be_is_mbc_ambiguous(OnigCaseFoldType flag
, const UChar
** pp
, const UChar
* end
)
134 const UChar
* p
= *pp
;
138 if (*(p
+2) == 0 && *(p
+1) == 0 && *p
== 0) {
/* NOTE(review): *p was just tested equal to 0 above; presumably p is moved to
 * the low byte on a line not visible here before this comparison -- confirm. */
142 if (*p
== SHARP_s
&& (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
147 v
= ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c
,
148 (BIT_CTYPE_UPPER
| BIT_CTYPE_LOWER
));
/* NOTE(review): OR-ing with BIT_CTYPE_LOWER is non-zero whenever
 * BIT_CTYPE_LOWER itself is non-zero, so this test does not depend on v and
 * the final return below would be unreachable; a mask test (&) may have been
 * intended -- confirm before changing. */
149 if ((v
| BIT_CTYPE_LOWER
) != 0) {
150 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
151 if (c
>= 0xaa && c
<= 0xba)
156 return (v
!= 0 ? TRUE
: FALSE
);
/*
 * utf32be_left_adjust_char_head: move s back to the start of the 4-byte unit
 * that contains it, with unit boundaries counted from start.
 *
 * Returns s unchanged when s <= start; otherwise returns
 * s - ((s - start) % 4). The const qualifier is cast away on return.
 * enc is tagged ARG_UNUSED, and end is not referenced in the visible code.
 */
164 utf32be_left_adjust_char_head(const UChar
* start
, const UChar
* s
, const UChar
* end
,
165 OnigEncoding enc ARG_UNUSED
)
/* Nothing to adjust at or before the start of the buffer. */
169 if (s
<= start
) return (UChar
* )s
;
/* rem = how many bytes s lies past the previous 4-byte boundary. */
171 rem
= (int )((s
- start
) % 4);
172 return (UChar
* )(s
- rem
);
/*
 * utf32be_get_case_fold_codes_by_str: thin wrapper that forwards flag, p, end
 * and items, together with enc, to
 * onigenc_unicode_get_case_fold_codes_by_str() and returns its result.
 */
176 utf32be_get_case_fold_codes_by_str(OnigCaseFoldType flag
,
177 const OnigUChar
* p
, const OnigUChar
* end
,
178 OnigCaseFoldCodeItem items
[],
181 return onigenc_unicode_get_case_fold_codes_by_str(enc
,
182 flag
, p
, end
, items
);
/*
 * Encoding descriptor for UTF-32BE: name, fixed 4-byte maximum and minimum
 * lengths, then the handlers in positional order. It mixes this file's
 * utf32be_* functions with shared onigenc_* routines.
 * NOTE(review): some initializer entries are not visible in this excerpt, so
 * the positions shown here are not the complete table.
 */
185 OnigEncodingDefine(utf_32be
, UTF_32BE
) = {
187 "UTF-32BE", /* name */
188 4, /* max byte length */
189 4, /* min byte length */
190 utf32be_is_mbc_newline
,
192 utf32be_code_to_mbclen
,
194 utf32be_mbc_case_fold
,
195 onigenc_unicode_apply_all_case_fold
,
196 utf32be_get_case_fold_codes_by_str
,
197 onigenc_unicode_property_name_to_ctype
,
198 onigenc_unicode_is_code_ctype
,
199 onigenc_utf16_32_get_ctype_code_range
,
200 utf32be_left_adjust_char_head
,
201 onigenc_always_false_is_allowed_reverse_match
,
202 onigenc_unicode_case_map
,
204 ONIGENC_FLAG_UNICODE
,
/* NOTE(review): presumably declares "UCS-4BE" as an alternative name for
 * "UTF-32BE" -- confirm against the definition of ENC_ALIAS. */
206 ENC_ALIAS("UCS-4BE", "UTF-32BE")