1 /**********************************************************************
2 iso8859_1.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * Character-type test for a single byte value.
 *
 * Looks up EncISO_8859_1_CtypeTable[code], masks the entry with
 * CTYPE_TO_BIT(ctype) and evaluates to 1 when any masked bit is set,
 * 0 otherwise.
 *
 * `code` is used directly as an array index with no range check here, so
 * the caller must keep it inside the table's 256 entries.
 */
33 #define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
34 ((EncISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
/*
 * Character-type bit masks, one unsigned short per byte value (256 entries).
 *
 * The entry at index c is tested against CTYPE_TO_BIT(ctype) by
 * ENC_IS_ISO_8859_1_CTYPE and against BIT_CTYPE_UPPER / BIT_CTYPE_LOWER by
 * the case-handling functions in this file.  The meaning of each bit comes
 * from those macros, which are defined outside this file.
 *
 * Each row below holds eight consecutive byte values.
 * NOTE(review): the closing brace of the initializer is not visible in this
 * view of the file.
 */
36 static const unsigned short EncISO_8859_1_CtypeTable
[256] = {
/* 0x00 - 0x1f */
37 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
38 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
39 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
40 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
/* 0x20 - 0x3f */
41 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
42 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
43 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
44 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
/* 0x40 - 0x5f */
45 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
46 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
47 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
48 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
/* 0x60 - 0x7f */
49 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
50 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
51 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
52 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
/* 0x80 - 0x9f */
53 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
54 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
55 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
56 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
/* 0xa0 - 0xbf */
57 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
58 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0,
59 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
60 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
/* 0xc0 - 0xdf */
61 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
62 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
63 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
64 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
/* 0xe0 - 0xff */
65 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
66 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
67 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
68 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
/*
 * Array of OnigPairCaseFoldCodes entries.  apply_all_case_fold() passes it,
 * together with its element count (numberof(CaseFoldMap)), to
 * onigenc_apply_all_case_fold_with_map().
 * NOTE(review): the initializer entries are not visible in this view of the
 * file, so their contents are not described here.
 */
71 static const OnigPairCaseFoldCodes CaseFoldMap
[] = {
/*
 * Thin wrapper that returns the result of
 * onigenc_apply_all_case_fold_with_map(), called with
 * numberof(CaseFoldMap), CaseFoldMap and the literal 1 as its leading
 * arguments.
 *
 * Parameters: `flag`, `f` and `arg` are the caller's case-fold flags,
 * callback and callback argument; `enc` is marked ARG_UNUSED.
 * NOTE(review): the remaining call arguments are not visible in this view --
 * presumably flag, f and arg are forwarded unchanged; confirm.
 */
106 apply_all_case_fold(OnigCaseFoldType flag
,
107 OnigApplyAllCaseFoldFunc f
, void* arg
,
108 OnigEncoding enc ARG_UNUSED
)
110 return onigenc_apply_all_case_fold_with_map(
111 numberof(CaseFoldMap
), CaseFoldMap
, 1,
/*
 * Fill `items` with the case-fold alternatives for the text starting at `p`.
 *
 * Each filled item records byte_len (how many input bytes the alternative
 * stands for), code_len (how many code points it consists of) and the code
 * points themselves in code[].  The branch taken depends only on the value
 * of *p, except for the "SS"/"ss" look-ahead, which also reads p[1] after
 * checking `end > p + 1`.
 *
 * `enc` is marked ARG_UNUSED.  *p is read without comparing p against `end`,
 * so the caller must pass a non-empty range.
 * NOTE(review): the return statements (presumably the number of items
 * filled) and some guards are not visible in this view of the file.
 */
116 get_case_fold_codes_by_str(OnigCaseFoldType flag
,
117 const OnigUChar
* p
, const OnigUChar
* end
,
118 OnigCaseFoldCodeItem items
[],
119 OnigEncoding enc ARG_UNUSED
)
/* ASCII 'A'..'Z': the alternative is the byte plus 0x20. */
121 if (0x41 <= *p
&& *p
<= 0x5a) {
122 items
[0].byte_len
= 1;
123 items
[0].code_len
= 1;
124 items
[0].code
[0] = (OnigCodePoint
)(*p
+ 0x20);
/* 'S' followed by 'S' or 's': additionally offer SHARP_s for the 2-byte run. */
125 if (*p
== 0x53 && end
> p
+ 1
126 && (*(p
+1) == 0x53 || *(p
+1) == 0x73)) { /* SS */
127 items
[1].byte_len
= 2;
128 items
[1].code_len
= 1;
129 items
[1].code
[0] = (OnigCodePoint
)SHARP_s
;
/* ASCII 'a'..'z': the alternative is the byte minus 0x20. */
135 else if (0x61 <= *p
&& *p
<= 0x7a) {
136 items
[0].byte_len
= 1;
137 items
[0].code_len
= 1;
138 items
[0].code
[0] = (OnigCodePoint
)(*p
- 0x20);
/* 's' followed by 's' or 'S': additionally offer SHARP_s for the 2-byte run. */
139 if (*p
== 0x73 && end
> p
+ 1
140 && (*(p
+1) == 0x73 || *(p
+1) == 0x53)) { /* ss */
141 items
[1].byte_len
= 2;
142 items
[1].code_len
= 1;
143 items
[1].code
[0] = (OnigCodePoint
)SHARP_s
;
/* Bytes 0xc0..0xcf: the alternative is the byte plus 0x20. */
149 else if (0xc0 <= *p
&& *p
<= 0xcf) {
150 items
[0].byte_len
= 1;
151 items
[0].code_len
= 1;
152 items
[0].code
[0] = (OnigCodePoint
)(*p
+ 0x20);
/*
 * Bytes 0xd0..0xdf.  The four items below map one input byte to the
 * two-code-point sequences "ss", "SS", "sS" and "Ss".
 * NOTE(review): the condition guarding these four items is not visible in
 * this view -- presumably `*p == SHARP_s`; confirm.
 */
155 else if (0xd0 <= *p
&& *p
<= 0xdf) {
157 items
[0].byte_len
= 1;
158 items
[0].code_len
= 2;
159 items
[0].code
[0] = (OnigCodePoint
)'s';
160 items
[0].code
[1] = (OnigCodePoint
)'s';
162 items
[1].byte_len
= 1;
163 items
[1].code_len
= 2;
164 items
[1].code
[0] = (OnigCodePoint
)'S';
165 items
[1].code
[1] = (OnigCodePoint
)'S';
167 items
[2].byte_len
= 1;
168 items
[2].code_len
= 2;
169 items
[2].code
[0] = (OnigCodePoint
)'s';
170 items
[2].code
[1] = (OnigCodePoint
)'S';
172 items
[3].byte_len
= 1;
173 items
[3].code_len
= 2;
174 items
[3].code
[0] = (OnigCodePoint
)'S';
175 items
[3].code
[1] = (OnigCodePoint
)'s';
/* Remaining 0xd0..0xdf bytes other than 0xd7: the alternative is the byte plus 0x20. */
179 else if (*p
!= 0xd7) {
180 items
[0].byte_len
= 1;
181 items
[0].code_len
= 1;
182 items
[0].code
[0] = (OnigCodePoint
)(*p
+ 0x20);
/* Bytes 0xe0..0xef: the alternative is the byte minus 0x20. */
186 else if (0xe0 <= *p
&& *p
<= 0xef) {
187 items
[0].byte_len
= 1;
188 items
[0].code_len
= 1;
189 items
[0].code
[0] = (OnigCodePoint
)(*p
- 0x20);
/*
 * Bytes 0xf0..0xfe: the alternative is the byte minus 0x20.
 * NOTE(review): a further guard may sit between this test and the
 * assignments (a line is missing from this view); confirm.
 */
192 else if (0xf0 <= *p
&& *p
<= 0xfe) {
194 items
[0].byte_len
= 1;
195 items
[0].code_len
= 1;
196 items
[0].code
[0] = (OnigCodePoint
)(*p
- 0x20);
/*
 * Case-fold the single byte at *pp into the buffer `lower`.
 *
 * The byte is read through a local copy of *pp.  When it equals SHARP_s and
 * `flag` has INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR set, a dedicated branch
 * is taken; otherwise *lower receives
 * ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p).
 *
 * `end` and `enc` are marked ARG_UNUSED.
 * NOTE(review): the body of the SHARP_s branch, the advance of *pp and the
 * return values are not visible in this view of the file.
 */
205 mbc_case_fold(OnigCaseFoldType flag
, const UChar
** pp
, const UChar
* end ARG_UNUSED
,
206 UChar
* lower
, OnigEncoding enc ARG_UNUSED
)
208 const UChar
* p
= *pp
;
/* Multi-character folding of SHARP_s is handled separately from the table lookup. */
210 if (*p
== SHARP_s
&& (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
217 *lower
= ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p
);
/*
 * Report whether the byte at *pp is case-ambiguous.
 *
 * The byte is read through a local copy of *pp.  SHARP_s is handled in its
 * own branch when `flag` has INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR set.
 * Otherwise `v` receives the byte's BIT_CTYPE_UPPER / BIT_CTYPE_LOWER bits
 * from EncISO_8859_1_CtypeTable: lower-case letters are special-cased for
 * the range 0xaa..0xba, and the final statement yields TRUE exactly when
 * `v` is non-zero (the byte is an upper- or lower-case letter).
 *
 * NOTE(review): the declaration of `v`, the advance of *pp and the return
 * statements inside the branches are not visible in this view of the file.
 */
224 is_mbc_ambiguous(OnigCaseFoldType flag
,
225 const UChar
** pp
, const UChar
* end
)
228 const UChar
* p
= *pp
;
230 if (*p
== SHARP_s
&& (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
236 v
= (EncISO_8859_1_CtypeTable
[*p
] & (BIT_CTYPE_UPPER
| BIT_CTYPE_LOWER
));
/*
 * The lower-case bit must be tested with '&'.  OR-ing a non-zero mask into
 * `v` is never zero, which made this branch run for every byte (including
 * non-letters) and left the final return unreachable in practice.
 */
237 if ((v
& BIT_CTYPE_LOWER
) != 0) {
238 /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
239 if (*p
>= 0xaa && *p
<= 0xba)
245 return (v
!= 0 ? TRUE
: FALSE
);
/*
 * Return whether code point `code` has character type `ctype`, as computed
 * by ENC_IS_ISO_8859_1_CTYPE(code, ctype).  `enc` is marked ARG_UNUSED.
 *
 * NOTE(review): the macro indexes a 256-entry table with `code` directly.
 * A range check on `code` is not visible in this view (lines are missing
 * before the return) -- confirm one exists.
 */
250 is_code_ctype(OnigCodePoint code
, unsigned int ctype
, OnigEncoding enc ARG_UNUSED
)
253 return ENC_IS_ISO_8859_1_CTYPE(code
, ctype
);
/*
 * Case-map the bytes in [*pp, end) into the output buffer [to, to_end),
 * driven by the flag set in *flagP.
 *
 * The loop runs while input remains (*pp < end) and the output pointer is
 * below to_end.  Whenever a character is changed, ONIGENC_CASE_MODIFIED is
 * OR-ed into the working flags.  Returns the number of bytes written,
 * computed as (to - to_start).
 *
 * NOTE(review): the statements that read `code` from the input, adjust it,
 * store it to `to` and write the flags back to *flagP are not visible in
 * this view of the file.
 */
259 case_map(OnigCaseFoldType
* flagP
, const OnigUChar
** pp
,
260 const OnigUChar
* end
, OnigUChar
* to
, OnigUChar
* to_end
,
261 const struct OnigEncodingTypeST
* enc
)
/* Remember the start of the output so the written length can be returned. */
264 OnigUChar
*to_start
= to
;
265 OnigCaseFoldType flags
= *flagP
;
267 while (*pp
< end
&& to
< to_end
) {
/* SHARP_s is handled on its own when upcasing or folding. */
269 if (code
== SHARP_s
) {
270 if (flags
& ONIGENC_CASE_UPCASE
) {
271 flags
|= ONIGENC_CASE_MODIFIED
;
/* Under titlecase the code becomes 's', otherwise 'S'. */
273 code
= (flags
& ONIGENC_CASE_TITLECASE
) ? 's' : 'S';
275 else if (flags
& ONIGENC_CASE_FOLD
) {
276 flags
|= ONIGENC_CASE_MODIFIED
;
/* Upper-case letter while downcasing or folding. */
281 else if ((EncISO_8859_1_CtypeTable
[code
] & BIT_CTYPE_UPPER
)
282 && (flags
& (ONIGENC_CASE_DOWNCASE
| ONIGENC_CASE_FOLD
))) {
283 flags
|= ONIGENC_CASE_MODIFIED
;
/* 0xAA, 0xBA, 0xB5 and 0xFF are tested before the generic lower-case branch. */
286 else if (code
== 0xAA || code
== 0xBA || code
== 0xB5 || code
== 0xFF)
/* Lower-case letter while upcasing. */
288 else if ((EncISO_8859_1_CtypeTable
[code
] & BIT_CTYPE_LOWER
)
289 && (flags
& ONIGENC_CASE_UPCASE
)) {
290 flags
|= ONIGENC_CASE_MODIFIED
;
294 if (flags
& ONIGENC_CASE_TITLECASE
) /* switch from titlecase to lowercase for capitalize */
295 flags
^= (ONIGENC_CASE_UPCASE
| ONIGENC_CASE_DOWNCASE
| ONIGENC_CASE_TITLECASE
);
298 return (int )(to
- to_start
);
/*
 * Encoding descriptor for ISO-8859-1, defined via
 * OnigEncodingDefine(iso_8859_1, ISO_8859_1).
 *
 * The initializer is positional.  The visible slots give the encoding name
 * "ISO-8859-1", a maximum and minimum encoded length of 1, the generic
 * onigenc_* helpers, and this file's get_case_fold_codes_by_str.
 * NOTE(review): several slots and the closing brace are not visible in this
 * view of the file -- presumably they hold this file's other functions
 * (apply_all_case_fold, mbc_case_fold, is_code_ctype, case_map); confirm
 * against the OnigEncodingTypeST field order.
 */
301 OnigEncodingDefine(iso_8859_1
, ISO_8859_1
) = {
302 onigenc_single_byte_mbc_enc_len
,
303 "ISO-8859-1", /* name */
304 1, /* max enc length */
305 1, /* min enc length */
306 onigenc_is_mbc_newline_0x0a
,
307 onigenc_single_byte_mbc_to_code
,
308 onigenc_single_byte_code_to_mbclen
,
309 onigenc_single_byte_code_to_mbc
,
312 get_case_fold_codes_by_str
,
313 onigenc_minimum_property_name_to_ctype
,
315 onigenc_not_support_get_ctype_code_range
,
316 onigenc_single_byte_left_adjust_char_head
,
317 onigenc_always_true_is_allowed_reverse_match
,
/*
 * Invokes ENC_ALIAS with "ISO8859-1" and "ISO-8859-1".
 * NOTE(review): presumably this declares "ISO8859-1" as an alternative name
 * for the "ISO-8859-1" encoding; the macro is defined elsewhere -- confirm.
 */
322 ENC_ALIAS("ISO8859-1", "ISO-8859-1")