1 /**********************************************************************
2 shift_jis.h - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * EncLen_SJIS: per-byte length table, indexed by a byte value 0x00..0xff.
 *
 * An entry of 2 marks a byte that starts a two-byte character
 * (0x81-0x9f and 0xe0-0xfc); every other byte has the value 1.
 * SJIS_ISMB_FIRST() tests an entry for "> 1", and mbc_enc_len() passes
 * EncLen_SJIS[firstbyte]-1 to ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE() when the
 * input ends right after the first byte.
 *
 * Note that 0x80, 0xa0 and 0xfd-0xff are 1 here even though the trans[]
 * table rejects them as first bytes; this table only carries lengths.
 */
static const int EncLen_SJIS[] = {
  /* 0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* e */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
/*
 * SJIS_CAN_BE_TRAIL_TABLE: 1 when the byte value used as index may appear
 * as the second byte of a two-byte character, 0 otherwise.
 *
 * The accepted values are 0x40-0x7e and 0x80-0xfc; 0x00-0x3f, 0x7f and
 * 0xfd-0xff are rejected.  Read through the SJIS_ISMB_TRAIL() macro.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 2 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 3 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
71 static const OnigPairCaseFoldCodes CaseFoldMap
[] = {
72 /* Fullwidth Alphabet */
/*
 * Byte classification helpers.
 *
 * Both macros use their argument directly as an array index, so it must be
 * a value in 0..255 (an unsigned byte, or an int already masked with 0xff);
 * a negative `char` would index out of bounds.
 */

/* True when EncLen_SJIS marks `byte` as the first byte of a multi-byte character. */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Nonzero when `byte` may be the trailing byte of a two-byte character. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
/*
 * States of the byte-sequence validator driven by trans[] below.
 *
 * FAILURE and ACCEPT are negative terminal results; mbc_enc_len() tests
 * "s < 0" to detect them.  S0 (0) and S1 (1) are non-negative so they can
 * double as row indices into trans[].
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;

/* Short aliases used only to keep the table rows readable. */
#define A ACCEPT
#define F FAILURE
/*
 * trans[state][byte]: result of feeding `byte` to the validator in `state`.
 *
 *   S0 (first byte):  0x00-0x7f and 0xa1-0xdf are accepted as complete
 *                     one-byte characters; 0x81-0x9f and 0xe0-0xfc yield 1
 *                     (== S1), i.e. a second byte is required; 0x80, 0xa0
 *                     and 0xfd-0xff are rejected.
 *   S1 (second byte): 0x40-0x7e and 0x80-0xfc are accepted; everything else
 *                     is rejected.
 */
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  }
};
/* Keep the one-letter aliases from leaking into the rest of the file. */
#undef A
#undef F
210 mbc_enc_len(const UChar
* p
, const UChar
* e
, OnigEncoding enc ARG_UNUSED
)
212 int firstbyte
= *p
++;
214 s
= trans
[0][firstbyte
];
215 if (s
< 0) return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
216 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
217 if (p
== e
) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS
[firstbyte
]-1);
219 return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
220 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
224 code_to_mbclen(OnigCodePoint code
, OnigEncoding enc ARG_UNUSED
)
227 if (EncLen_SJIS
[(int )code
] == 1)
230 return ONIGERR_INVALID_CODE_POINT_VALUE
;
232 else if (code
<= 0xffff) {
233 int low
= code
& 0xff;
234 if (! SJIS_ISMB_TRAIL(low
))
235 return ONIGERR_INVALID_CODE_POINT_VALUE
;
239 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
;
243 mbc_to_code(const UChar
* p
, const UChar
* end
, OnigEncoding enc
)
248 len
= mbc_enc_len(p
, end
, enc
);
251 if (len
== 1) return n
;
253 for (i
= 1; i
< len
; i
++) {
262 code_to_mbc(OnigCodePoint code
, UChar
*buf
, OnigEncoding enc
)
266 if ((code
& 0xff00) != 0) *p
++ = (UChar
)(((code
>> 8) & 0xff));
267 *p
++ = (UChar
)(code
& 0xff);
270 if (mbc_enc_len(buf
, p
, enc
) != (p
- buf
))
271 return REGERR_INVALID_CODE_POINT_VALUE
;
273 return (int )(p
- buf
);
277 apply_all_case_fold(OnigCaseFoldType flag
,
278 OnigApplyAllCaseFoldFunc f
, void* arg
, OnigEncoding enc
)
280 return onigenc_apply_all_case_fold_with_map(
281 numberof(CaseFoldMap
), CaseFoldMap
, 0,
286 get_lower_case(OnigCodePoint code
)
288 if (ONIGENC_IS_IN_RANGE(code
, 0x8260, 0x8279)) {
289 /* Fullwidth Alphabet */
290 return (OnigCodePoint
)(code
+ 0x0021);
292 else if (ONIGENC_IS_IN_RANGE(code
, 0x839f, 0x83b6)) {
294 return (OnigCodePoint
)(code
+ 0x0020);
296 else if (ONIGENC_IS_IN_RANGE(code
, 0x8440, 0x8460)) {
298 int d
= (code
>= 0x844f) ? 1 : 0;
299 return (OnigCodePoint
)(code
+ (0x0030 + d
));
305 get_upper_case(OnigCodePoint code
)
307 if (ONIGENC_IS_IN_RANGE(code
, 0x8281, 0x829a)) {
308 /* Fullwidth Alphabet */
309 return (OnigCodePoint
)(code
- 0x0021);
311 else if (ONIGENC_IS_IN_RANGE(code
, 0x83bf, 0x83d6)) {
313 return (OnigCodePoint
)(code
- 0x0020);
315 else if (ONIGENC_IS_IN_RANGE(code
, 0x8470, 0x847e) ||
316 ONIGENC_IS_IN_RANGE(code
, 0x8480, 0x8491)) {
318 int d
= (code
>= 0x8480) ? 1 : 0;
319 return (OnigCodePoint
)(code
- (0x0030 - d
));
325 get_case_fold_codes_by_str(OnigCaseFoldType flag
,
326 const OnigUChar
* p
, const OnigUChar
* end
,
327 OnigCaseFoldCodeItem items
[], OnigEncoding enc
)
330 OnigCodePoint code
, code_lo
, code_up
;
332 code
= mbc_to_code(p
, end
, enc
);
333 if (ONIGENC_IS_ASCII_CODE(code
))
334 return onigenc_ascii_get_case_fold_codes_by_str(flag
, p
, end
, items
, enc
);
336 len
= mbc_enc_len(p
, end
, enc
);
337 code_lo
= get_lower_case(code
);
338 code_up
= get_upper_case(code
);
340 if (code
!= code_lo
) {
341 items
[0].byte_len
= len
;
342 items
[0].code_len
= 1;
343 items
[0].code
[0] = code_lo
;
346 else if (code
!= code_up
) {
347 items
[0].byte_len
= len
;
348 items
[0].code_len
= 1;
349 items
[0].code
[0] = code_up
;
357 mbc_case_fold(OnigCaseFoldType flag
,
358 const UChar
** pp
, const UChar
* end
, UChar
* lower
,
361 const UChar
* p
= *pp
;
363 if (ONIGENC_IS_MBC_ASCII(p
)) {
364 *lower
= ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p
);
372 code
= get_lower_case(mbc_to_code(p
, end
, enc
));
373 len
= code_to_mbc(code
, lower
, enc
);
375 return len
; /* return byte length of converted char to lower */
381 is_mbc_ambiguous(OnigCaseFoldType flag
,
382 const UChar
** pp
, const UChar
* end
)
384 return onigenc_mbn_is_mbc_ambiguous(enc
, flag
, pp
, end
);
391 is_code_ctype(OnigCodePoint code
, unsigned int ctype
)
394 return ONIGENC_IS_ASCII_CODE_CTYPE(code
, ctype
);
396 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype
)) {
397 return (code_to_mbclen(code
) > 1 ? TRUE
: FALSE
);
406 left_adjust_char_head(const UChar
* start
, const UChar
* s
, const UChar
* end
, OnigEncoding enc
)
411 if (s
<= start
) return (UChar
* )s
;
414 if (SJIS_ISMB_TRAIL(*p
)) {
416 if (! SJIS_ISMB_FIRST(*--p
)) {
422 len
= mbc_enc_len(p
, end
, enc
);
423 if (p
+ len
> s
) return (UChar
* )p
;
425 return (UChar
* )(p
+ ((s
- p
) & ~1));
429 is_allowed_reverse_match(const UChar
* s
, const UChar
* end
, OnigEncoding enc ARG_UNUSED
)
432 return (SJIS_ISMB_TRAIL(c
) ? FALSE
: TRUE
);
436 static const OnigCodePoint CR_Hiragana
[] = {
441 static const OnigCodePoint CR_Katakana
[] = {
450 static const OnigCodePoint CR_Han
[] = {
453 0x889f, 0x9872, /* Kanji level 1 */
454 0x989f, 0x9ffc, /* Kanji level 2 */
455 0xe040, 0xeaa4, /* Kanji level 2 */
456 0xed40, 0xeeec, /* NEC-selected IBM extended characters (without symbols) */
457 0xfa5c, 0xfc4b, /* IBM extended characters (without symbols) */
460 static const OnigCodePoint CR_Han
[] = {
463 0x889f, 0x9872, /* Kanji level 1 */
464 0x989f, 0x9ffc, /* Kanji level 2 */
465 0xe040, 0xeaa4, /* Kanji level 2 */
469 static const OnigCodePoint CR_Latin
[] = {
477 static const OnigCodePoint CR_Greek
[] = {
483 static const OnigCodePoint CR_Cyrillic
[] = {
490 #include "enc/jis/props.h"
493 property_name_to_ctype(OnigEncoding enc
, const UChar
* p
, const UChar
* end
)
495 const UChar
*s
= p
, *e
= end
;
496 const struct enc_property
*prop
=
497 onig_jis_property((const char* )s
, (unsigned int )(e
- s
));
500 return onigenc_minimum_property_name_to_ctype(enc
, s
, e
);
503 return (int )prop
->ctype
;
507 is_code_ctype(OnigCodePoint code
, unsigned int ctype
, OnigEncoding enc
)
509 if (ctype
<= ONIGENC_MAX_STD_CTYPE
) {
511 return ONIGENC_IS_ASCII_CODE_CTYPE(code
, ctype
);
513 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype
)) {
519 ctype
-= (ONIGENC_MAX_STD_CTYPE
+ 1);
520 if (ctype
>= (unsigned int )PropertyListNum
)
521 return ONIGERR_TYPE_BUG
;
523 return onig_is_in_code_range((UChar
* )PropertyList
[ctype
], code
);
530 get_ctype_code_range(OnigCtype ctype
, OnigCodePoint
* sb_out
,
531 const OnigCodePoint
* ranges
[], OnigEncoding enc ARG_UNUSED
)
533 if (ctype
<= ONIGENC_MAX_STD_CTYPE
) {
534 return ONIG_NO_SUPPORT_CONFIG
;
539 ctype
-= (ONIGENC_MAX_STD_CTYPE
+ 1);
540 if (ctype
>= (OnigCtype
)PropertyListNum
)
541 return ONIGERR_TYPE_BUG
;
543 *ranges
= PropertyList
[ctype
];