1 /**********************************************************************
2 euc_jp.c - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * eucjp_islead(c): non-zero when byte c lies OUTSIDE the range 0xa1..0xfe.
 *
 * The difference (c - 0xa1) is cast to UChar so that values below 0xa1 wrap
 * around to a large value (assumes UChar is an unsigned 8-bit type -- TODO
 * confirm), which lets one comparison against (0xfe - 0xa1) reject both ends
 * of the range.
 *
 * Despite the name, a true result means "not a byte in 0xa1..0xfe" rather
 * than "a multibyte lead byte": left_adjust_char_head() uses the negated
 * form to step backwards over consecutive 0xa1..0xfe bytes.
 */
33 #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
35 static const int EncLen_EUCJP
[] = {
36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
43 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
45 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
49 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
50 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
/*
 * State values for the byte-sequence validation table `trans`.
 *
 * Negative values are terminal results: mbc_enc_len() treats any value < 0
 * as final, reporting a found character for ACCEPT (-1) and an invalid
 * sequence otherwise (FAILURE, -2).
 * S0, S1 and S2 (0, 1, 2) are the non-terminal states and correspond to the
 * row labels of `trans`.
 */
54 typedef enum { FAILURE
= -2, ACCEPT
= -1, S0
= 0, S1
, S2
} state_t
;
57 static const signed char trans
[][0x100] = {
58 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
59 /* 0 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
60 /* 1 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
61 /* 2 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
62 /* 3 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
63 /* 4 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
64 /* 5 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
65 /* 6 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
66 /* 7 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
67 /* 8 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, 1, 2,
68 /* 9 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
69 /* a */ F
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
70 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
71 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
72 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
73 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
74 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
76 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
77 /* 0 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
78 /* 1 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
79 /* 2 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
80 /* 3 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
81 /* 4 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
82 /* 5 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
83 /* 6 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
84 /* 7 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
85 /* 8 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
86 /* 9 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
87 /* a */ F
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
88 /* b */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
89 /* c */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
90 /* d */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
91 /* e */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
92 /* f */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, F
94 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
95 /* 0 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
96 /* 1 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
97 /* 2 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
98 /* 3 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
99 /* 4 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
100 /* 5 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
101 /* 6 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
102 /* 7 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
103 /* 8 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
104 /* 9 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
105 /* a */ F
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
117 static const OnigPairCaseFoldCodes CaseFoldMap
[] = {
118 /* Fullwidth Alphabet */
209 mbc_enc_len(const UChar
* p
, const UChar
* e
, OnigEncoding enc ARG_UNUSED
)
211 int firstbyte
= *p
++;
213 s
= trans
[0][firstbyte
];
214 if (s
< 0) return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
215 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
216 if (p
== e
) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP
[firstbyte
]-1);
218 if (s
< 0) return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
219 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
220 if (p
== e
) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP
[firstbyte
]-2);
222 return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
223 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
227 mbc_to_code(const UChar
* p
, const UChar
* end
, OnigEncoding enc
)
232 len
= mbc_enc_len(p
, end
, enc
);
233 n
= (OnigCodePoint
)*p
++;
234 if (len
== 1) return n
;
236 for (i
= 1; i
< len
; i
++) {
245 code_to_mbclen(OnigCodePoint code
, OnigEncoding enc ARG_UNUSED
)
247 if (ONIGENC_IS_CODE_ASCII(code
)) return 1;
248 else if (code
> 0x00ffffff)
249 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
;
250 else if ((code
& 0xff808080) == 0x00808080) return 3;
251 else if ((code
& 0xffff8080) == 0x00008080) return 2;
253 return ONIGERR_INVALID_CODE_POINT_VALUE
;
258 code_to_mbc_first(OnigCodePoint code
)
262 if ((code
& 0xff0000) != 0) {
263 first
= (code
>> 16) & 0xff;
265 else if ((code
& 0xff00) != 0) {
266 first
= (code
>> 8) & 0xff;
276 code_to_mbc(OnigCodePoint code
, UChar
*buf
, OnigEncoding enc
)
280 if ((code
& 0xff0000) != 0) *p
++ = (UChar
)(((code
>> 16) & 0xff));
281 if ((code
& 0xff00) != 0) *p
++ = (UChar
)(((code
>> 8) & 0xff));
282 *p
++ = (UChar
)(code
& 0xff);
285 if (mbc_enc_len(buf
, p
, enc
) != (p
- buf
))
286 return ONIGERR_INVALID_CODE_POINT_VALUE
;
288 return (int )(p
- buf
);
292 apply_all_case_fold(OnigCaseFoldType flag
,
293 OnigApplyAllCaseFoldFunc f
, void* arg
, OnigEncoding enc
)
295 return onigenc_apply_all_case_fold_with_map(
296 numberof(CaseFoldMap
), CaseFoldMap
, 0,
301 get_lower_case(OnigCodePoint code
)
303 if (ONIGENC_IS_IN_RANGE(code
, 0xa3c1, 0xa3da)) {
304 /* Fullwidth Alphabet */
305 return (OnigCodePoint
)(code
+ 0x0020);
307 else if (ONIGENC_IS_IN_RANGE(code
, 0xa6a1, 0xa6b8)) {
309 return (OnigCodePoint
)(code
+ 0x0020);
311 else if (ONIGENC_IS_IN_RANGE(code
, 0xa7a1, 0xa7c1)) {
313 return (OnigCodePoint
)(code
+ 0x0030);
319 get_upper_case(OnigCodePoint code
)
321 if (ONIGENC_IS_IN_RANGE(code
, 0xa3e1, 0xa3fa)) {
322 /* Fullwidth Alphabet */
323 return (OnigCodePoint
)(code
- 0x0020);
325 else if (ONIGENC_IS_IN_RANGE(code
, 0xa6c1, 0xa6d8)) {
327 return (OnigCodePoint
)(code
- 0x0020);
329 else if (ONIGENC_IS_IN_RANGE(code
, 0xa7d1, 0xa7f1)) {
331 return (OnigCodePoint
)(code
- 0x0030);
337 get_case_fold_codes_by_str(OnigCaseFoldType flag
,
338 const OnigUChar
* p
, const OnigUChar
* end
,
339 OnigCaseFoldCodeItem items
[], OnigEncoding enc
)
342 OnigCodePoint code
, code_lo
, code_up
;
344 code
= mbc_to_code(p
, end
, enc
);
345 if (ONIGENC_IS_ASCII_CODE(code
))
346 return onigenc_ascii_get_case_fold_codes_by_str(flag
, p
, end
, items
, enc
);
348 len
= mbc_enc_len(p
, end
, enc
);
349 code_lo
= get_lower_case(code
);
350 code_up
= get_upper_case(code
);
352 if (code
!= code_lo
) {
353 items
[0].byte_len
= len
;
354 items
[0].code_len
= 1;
355 items
[0].code
[0] = code_lo
;
358 else if (code
!= code_up
) {
359 items
[0].byte_len
= len
;
360 items
[0].code_len
= 1;
361 items
[0].code
[0] = code_up
;
369 mbc_case_fold(OnigCaseFoldType flag
,
370 const UChar
** pp
, const UChar
* end
, UChar
* lower
,
373 const UChar
* p
= *pp
;
375 if (ONIGENC_IS_MBC_ASCII(p
)) {
376 *lower
= ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p
);
384 len
= mbc_enc_len(p
, end
, enc
);
385 code
= get_lower_case(mbc_to_code(p
, end
, enc
));
386 len
= code_to_mbc(code
, lower
, enc
);
387 if (len
== ONIGERR_INVALID_CODE_POINT_VALUE
) len
= 1;
389 return len
; /* return byte length of converted char to lower */
394 left_adjust_char_head(const UChar
* start
, const UChar
* s
, const UChar
* end
, OnigEncoding enc
)
397 mb-trail bytes don't mix with single bytes.
402 if (s
<= start
) return (UChar
* )s
;
405 while (!eucjp_islead(*p
) && p
> start
) p
--;
406 len
= mbc_enc_len(p
, end
, enc
);
407 if (p
+ len
> s
) return (UChar
* )p
;
409 return (UChar
* )(p
+ ((s
- p
) & ~1));
413 is_allowed_reverse_match(const UChar
* s
, const UChar
* end
, OnigEncoding enc ARG_UNUSED
)
416 if (c
<= 0x7e || c
== 0x8e || c
== 0x8f)
423 static const OnigCodePoint CR_Hiragana
[] = {
425 #ifdef ENC_EUC_JIS_2004
432 #ifdef ENC_EUC_JIS_2004
433 static const OnigCodePoint CR_Katakana
[] = {
435 0x8ea6, 0x8eaf, /* JIS X 0201 Katakana */
436 0x8eb1, 0x8edd, /* JIS X 0201 Katakana */
442 static const OnigCodePoint CR_Katakana
[] = {
444 0x8ea6, 0x8eaf, /* JIS X 0201 Katakana */
445 0x8eb1, 0x8edd, /* JIS X 0201 Katakana */
450 #ifdef ENC_EUC_JIS_2004
451 static const OnigCodePoint CR_Han
[] = {
452 /* EUC-JIS-2004 (JIS X 0213:2004) */
456 0xaea1, 0xfefe, /* Kanji level 1, 2 and 3 */
458 0x8fa1a1, 0x8fa1fe, /* row 1 */
459 0x8fa3a1, 0x8fa5fe, /* row 3 .. 5 */
460 0x8fa8a1, 0x8fa8fe, /* row 8 */
461 0x8faca1, 0x8faffe, /* row 12 .. 15 */
462 0x8feea1, 0x8ffef6, /* row 78 .. 94 */
465 static const OnigCodePoint CR_Han
[] = {
466 /* EUC-JP (JIS X 0208 based) */
469 0xb0a1, 0xcfd3, /* Kanji level 1 */
470 0xd0a1, 0xf4a6, /* Kanji level 2 */
471 0x8fb0a1, 0x8fedf3 /* JIS X 0212 Supplemental Kanji (row 16 .. 77) */
475 static const OnigCodePoint CR_Latin
[] = {
481 /* TODO: add row 8 .. 11 to support EUC-JIS-2004 */
482 /* TODO: add JIS X 0212 row 9 .. 11 */
485 static const OnigCodePoint CR_Greek
[] = {
488 #ifdef ENC_EUC_JIS_2004
492 /* TODO: add JIS X 0212 row 6 */
496 static const OnigCodePoint CR_Cyrillic
[] = {
500 /* TODO: add JIS X 0212 row 7 */
503 #include "enc/jis/props.h"
506 property_name_to_ctype(OnigEncoding enc
, const UChar
* p
, const UChar
* end
)
508 const UChar
*s
= p
, *e
= end
;
509 const struct enc_property
*prop
=
510 onig_jis_property((const char* )s
, (unsigned int )(e
- s
));
513 return onigenc_minimum_property_name_to_ctype(enc
, s
, e
);
516 return (int )prop
->ctype
;
520 is_code_ctype(OnigCodePoint code
, unsigned int ctype
, OnigEncoding enc ARG_UNUSED
)
522 if (ctype
<= ONIGENC_MAX_STD_CTYPE
) {
524 return ONIGENC_IS_ASCII_CODE_CTYPE(code
, ctype
);
526 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype
)) {
527 return (code_to_mbclen(code
, enc
) > 1 ? TRUE
: FALSE
);
532 ctype
-= (ONIGENC_MAX_STD_CTYPE
+ 1);
533 if (ctype
>= (unsigned int )PropertyListNum
)
534 return ONIGERR_TYPE_BUG
;
536 return onig_is_in_code_range((UChar
* )PropertyList
[ctype
], code
);
543 get_ctype_code_range(OnigCtype ctype
, OnigCodePoint
* sb_out
,
544 const OnigCodePoint
* ranges
[], OnigEncoding enc ARG_UNUSED
)
546 if (ctype
<= ONIGENC_MAX_STD_CTYPE
) {
547 return ONIG_NO_SUPPORT_CONFIG
;
552 ctype
-= (ONIGENC_MAX_STD_CTYPE
+ 1);
553 if (ctype
>= (OnigCtype
)PropertyListNum
)
554 return ONIGERR_TYPE_BUG
;
556 *ranges
= PropertyList
[ctype
];
562 OnigEncodingDefine(euc_jp
, EUC_JP
) = {
565 3, /* max enc length */
566 1, /* min enc length */
567 onigenc_is_mbc_newline_0x0a
,
573 get_case_fold_codes_by_str
,
574 property_name_to_ctype
,
576 get_ctype_code_range
,
577 left_adjust_char_head
,
578 is_allowed_reverse_match
,
579 onigenc_ascii_only_case_map
,
586 * Link: http://www.iana.org/assignments/character-sets
587 * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html
588 * Link: http://home.m05.itscom.net/numa/uocjleE.pdf
590 ENC_ALIAS("eucJP", "EUC-JP") /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
594 * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html
595 * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html
596 * Link: https://ja.wikipedia.org/wiki/EUC-JP
598 ENC_REPLICATE("eucJP-ms", "EUC-JP") /* TOG/JVC CDE/Motif Technical WG */
599 ENC_ALIAS("euc-jp-ms", "eucJP-ms")
604 * Link: http://www.iana.org/assignments/charset-reg/CP51932
605 * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm
606 * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
607 * Link: http://msyk.at.webry.info/200511/article_2.html
609 ENC_REPLICATE("CP51932", "EUC-JP")
613 * Link: https://ja.wikipedia.org/wiki/EUC-JIS-2004
615 ENC_REPLICATE("EUC-JIS-2004", "EUC-JP") /* defined at JIS X 0213:2004 */
616 ENC_ALIAS("EUC-JISX0213", "EUC-JIS-2004") /* defined at JIS X 0213:2000, and obsolete at JIS X 0213:2004 */