[rubygems/rubygems] Use a constant empty tar header to avoid extra allocations
[ruby.git] / enc / shift_jis.h
blobd55240159573a4c018e7b762ace81e982a879ebb
1 /**********************************************************************
2 shift_jis.h - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
 */
31 #include "regenc.h"
/*
 * EncLen_SJIS: table indexed by a byte value (0x00-0xff), 16 entries per row.
 * An entry of 2 marks a byte that starts a two-byte character (0x81-0x9f and
 * 0xe0-0xfc in the rows below); every other byte has an entry of 1.
 * SJIS_ISMB_FIRST() tests "entry > 1", and mbc_enc_len() reports "entry - 1"
 * as the number of bytes still needed when input stops after a lead byte.
 *
 * NOTE(review): the closing brace/semicolon of this initializer is not present
 * in this listing (the embedded numbering jumps from 49 to 52); presumably it
 * was lost in extraction. Confirm against the upstream file.
 */
33 static const int EncLen_SJIS[] = {
34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
43 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
44 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
45 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
47 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
49 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
/*
 * SJIS_CAN_BE_TRAIL_TABLE: table indexed by a byte value, 16 entries per row.
 * An entry of 1 marks a byte that may appear as the second byte of a two-byte
 * character: 0x40-0x7e and 0x80-0xfc in the rows below. All other bytes
 * (0x00-0x3f, 0x7f, 0xfd-0xff) are 0. Read through SJIS_ISMB_TRAIL().
 *
 * NOTE(review): the closing brace/semicolon of this initializer is not present
 * in this listing (the embedded numbering jumps from 68 to 71); presumably it
 * was lost in extraction. Confirm against the upstream file.
 */
52 static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
65 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
67 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
68 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
/*
 * CaseFoldMap: pairs of two-byte codes handed to
 * onigenc_apply_all_case_fold_with_map() by apply_all_case_fold().
 * In every pair the first member lies in one of the ranges that
 * get_lower_case() converts, and the second member is the value
 * get_lower_case() returns for it (offset 0x21 for the fullwidth alphabet,
 * 0x20 for Greek, 0x30 or 0x31 for Cyrillic).
 *
 * The Cyrillic second members skip 0x847f: they run 0x8470-0x847e and then
 * 0x8480-0x8491, which is why the offset changes from 0x30 to 0x31 at 0x844f.
 *
 * NOTE(review): the closing brace/semicolon of this initializer is not present
 * in this listing (the embedded numbering jumps from 159 to 162); presumably
 * it was lost in extraction. Confirm against the upstream file.
 */
71 static const OnigPairCaseFoldCodes CaseFoldMap[] = {
72 /* Fullwidth Alphabet */
73 { 0x8260, 0x8281 },
74 { 0x8261, 0x8282 },
75 { 0x8262, 0x8283 },
76 { 0x8263, 0x8284 },
77 { 0x8264, 0x8285 },
78 { 0x8265, 0x8286 },
79 { 0x8266, 0x8287 },
80 { 0x8267, 0x8288 },
81 { 0x8268, 0x8289 },
82 { 0x8269, 0x828a },
83 { 0x826a, 0x828b },
84 { 0x826b, 0x828c },
85 { 0x826c, 0x828d },
86 { 0x826d, 0x828e },
87 { 0x826e, 0x828f },
88 { 0x826f, 0x8290 },
89 { 0x8270, 0x8291 },
90 { 0x8271, 0x8292 },
91 { 0x8272, 0x8293 },
92 { 0x8273, 0x8294 },
93 { 0x8274, 0x8295 },
94 { 0x8275, 0x8296 },
95 { 0x8276, 0x8297 },
96 { 0x8277, 0x8298 },
97 { 0x8278, 0x8299 },
98 { 0x8279, 0x829a },
100 /* Greek */
101 { 0x839f, 0x83bf },
102 { 0x83a0, 0x83c0 },
103 { 0x83a1, 0x83c1 },
104 { 0x83a2, 0x83c2 },
105 { 0x83a3, 0x83c3 },
106 { 0x83a4, 0x83c4 },
107 { 0x83a5, 0x83c5 },
108 { 0x83a6, 0x83c6 },
109 { 0x83a7, 0x83c7 },
110 { 0x83a8, 0x83c8 },
111 { 0x83a9, 0x83c9 },
112 { 0x83aa, 0x83ca },
113 { 0x83ab, 0x83cb },
114 { 0x83ac, 0x83cc },
115 { 0x83ad, 0x83cd },
116 { 0x83ae, 0x83ce },
117 { 0x83af, 0x83cf },
118 { 0x83b0, 0x83d0 },
119 { 0x83b1, 0x83d1 },
120 { 0x83b2, 0x83d2 },
121 { 0x83b3, 0x83d3 },
122 { 0x83b4, 0x83d4 },
123 { 0x83b5, 0x83d5 },
124 { 0x83b6, 0x83d6 },
126 /* Cyrillic */
127 { 0x8440, 0x8470 },
128 { 0x8441, 0x8471 },
129 { 0x8442, 0x8472 },
130 { 0x8443, 0x8473 },
131 { 0x8444, 0x8474 },
132 { 0x8445, 0x8475 },
133 { 0x8446, 0x8476 },
134 { 0x8447, 0x8477 },
135 { 0x8448, 0x8478 },
136 { 0x8449, 0x8479 },
137 { 0x844a, 0x847a },
138 { 0x844b, 0x847b },
139 { 0x844c, 0x847c },
140 { 0x844d, 0x847d },
141 { 0x844e, 0x847e },
142 { 0x844f, 0x8480 },
143 { 0x8450, 0x8481 },
144 { 0x8451, 0x8482 },
145 { 0x8452, 0x8483 },
146 { 0x8453, 0x8484 },
147 { 0x8454, 0x8485 },
148 { 0x8455, 0x8486 },
149 { 0x8456, 0x8487 },
150 { 0x8457, 0x8488 },
151 { 0x8458, 0x8489 },
152 { 0x8459, 0x848a },
153 { 0x845a, 0x848b },
154 { 0x845b, 0x848c },
155 { 0x845c, 0x848d },
156 { 0x845d, 0x848e },
157 { 0x845e, 0x848f },
158 { 0x845f, 0x8490 },
159 { 0x8460, 0x8491 },
/* True when `byte` starts a two-byte character (its EncLen_SJIS entry exceeds 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Nonzero when `byte` is flagged in SJIS_CAN_BE_TRAIL_TABLE. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
/*
 * Byte-level state machine used by mbc_enc_len().
 *
 * state_t holds either a terminal verdict (FAILURE = -2, ACCEPT = -1) or a
 * row index into trans[] (S0 = 0, S1 = 1). trans[state][byte] gives the next
 * state. A and F are short aliases for ACCEPT and FAILURE, defined only for
 * the duration of the table and removed again by the #undef lines.
 *
 * Row S0 (first byte): ACCEPT for 0x00-0x7f and 0xa1-0xdf, move to row 1 for
 * 0x81-0x9f and 0xe0-0xfc, FAILURE for 0x80, 0xa0 and 0xfd-0xff.
 * Row S1 (second byte): ACCEPT for 0x40-0x7e and 0x80-0xfc, FAILURE otherwise.
 *
 * NOTE(review): the delimiters that close the S0 row, the S1 row and the
 * whole initializer are not present in this listing (the embedded numbering
 * skips 186 and 204-205); presumably they were lost in extraction. Confirm
 * against the upstream file.
 */
165 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
166 #define A ACCEPT
167 #define F FAILURE
168 static const signed char trans[][0x100] = {
169 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
170 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
171 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
172 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
173 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
174 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
175 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
176 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
177 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
178 /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
179 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
180 /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
181 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
182 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
183 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
184 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
185 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
187 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
188 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
189 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
190 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
191 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
192 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
193 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
194 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
195 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
196 /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
197 /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
198 /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
199 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
200 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
201 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
202 /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
203 /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
206 #undef A
207 #undef F
209 static int
210 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
212 int firstbyte = *p++;
213 state_t s;
214 s = trans[0][firstbyte];
215 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
216 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
217 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
218 s = trans[s][*p++];
219 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
220 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
223 static int
224 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
226 if (code < 256) {
227 if (EncLen_SJIS[(int )code] == 1)
228 return 1;
229 else
230 return ONIGERR_INVALID_CODE_POINT_VALUE;
232 else if (code <= 0xffff) {
233 int low = code & 0xff;
234 if (! SJIS_ISMB_TRAIL(low))
235 return ONIGERR_INVALID_CODE_POINT_VALUE;
236 return 2;
238 else
239 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
242 static OnigCodePoint
243 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
245 int c, i, len;
246 OnigCodePoint n;
248 len = mbc_enc_len(p, end, enc);
249 c = *p++;
250 n = c;
251 if (len == 1) return n;
253 for (i = 1; i < len; i++) {
254 if (p >= end) break;
255 c = *p++;
256 n <<= 8; n += c;
258 return n;
261 static int
262 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
264 UChar *p = buf;
266 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
267 *p++ = (UChar )(code & 0xff);
269 #if 0
270 if (mbc_enc_len(buf, p, enc) != (p - buf))
271 return REGERR_INVALID_CODE_POINT_VALUE;
272 #endif
273 return (int )(p - buf);
276 static int
277 apply_all_case_fold(OnigCaseFoldType flag,
278 OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc)
280 return onigenc_apply_all_case_fold_with_map(
281 numberof(CaseFoldMap), CaseFoldMap, 0,
282 flag, f, arg);
285 static OnigCodePoint
286 get_lower_case(OnigCodePoint code)
288 if (ONIGENC_IS_IN_RANGE(code, 0x8260, 0x8279)) {
289 /* Fullwidth Alphabet */
290 return (OnigCodePoint )(code + 0x0021);
292 else if (ONIGENC_IS_IN_RANGE(code, 0x839f, 0x83b6)) {
293 /* Greek */
294 return (OnigCodePoint )(code + 0x0020);
296 else if (ONIGENC_IS_IN_RANGE(code, 0x8440, 0x8460)) {
297 /* Cyrillic */
298 int d = (code >= 0x844f) ? 1 : 0;
299 return (OnigCodePoint )(code + (0x0030 + d));
301 return code;
304 static OnigCodePoint
305 get_upper_case(OnigCodePoint code)
307 if (ONIGENC_IS_IN_RANGE(code, 0x8281, 0x829a)) {
308 /* Fullwidth Alphabet */
309 return (OnigCodePoint )(code - 0x0021);
311 else if (ONIGENC_IS_IN_RANGE(code, 0x83bf, 0x83d6)) {
312 /* Greek */
313 return (OnigCodePoint )(code - 0x0020);
315 else if (ONIGENC_IS_IN_RANGE(code, 0x8470, 0x847e) ||
316 ONIGENC_IS_IN_RANGE(code, 0x8480, 0x8491)) {
317 /* Cyrillic */
318 int d = (code >= 0x8480) ? 1 : 0;
319 return (OnigCodePoint )(code - (0x0030 - d));
321 return code;
324 static int
325 get_case_fold_codes_by_str(OnigCaseFoldType flag,
326 const OnigUChar* p, const OnigUChar* end,
327 OnigCaseFoldCodeItem items[], OnigEncoding enc)
329 int len;
330 OnigCodePoint code, code_lo, code_up;
332 code = mbc_to_code(p, end, enc);
333 if (ONIGENC_IS_ASCII_CODE(code))
334 return onigenc_ascii_get_case_fold_codes_by_str(flag, p, end, items, enc);
336 len = mbc_enc_len(p, end, enc);
337 code_lo = get_lower_case(code);
338 code_up = get_upper_case(code);
340 if (code != code_lo) {
341 items[0].byte_len = len;
342 items[0].code_len = 1;
343 items[0].code[0] = code_lo;
344 return 1;
346 else if (code != code_up) {
347 items[0].byte_len = len;
348 items[0].code_len = 1;
349 items[0].code[0] = code_up;
350 return 1;
353 return 0;
356 static int
357 mbc_case_fold(OnigCaseFoldType flag,
358 const UChar** pp, const UChar* end, UChar* lower,
359 OnigEncoding enc)
361 const UChar* p = *pp;
363 if (ONIGENC_IS_MBC_ASCII(p)) {
364 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
365 (*pp)++;
366 return 1;
368 else {
369 OnigCodePoint code;
370 int len;
372 code = get_lower_case(mbc_to_code(p, end, enc));
373 len = code_to_mbc(code, lower, enc);
374 (*pp) += len;
375 return len; /* return byte length of converted char to lower */
/*
 * Disabled code: everything between #if 0 and #endif is excluded from
 * compilation. The function would forward to onigenc_mbn_is_mbc_ambiguous().
 * NOTE(review): the body refers to `enc`, which is not among the parameters
 * shown, so this would not compile as written if re-enabled.
 */
379 #if 0
380 static int
381 is_mbc_ambiguous(OnigCaseFoldType flag,
382 const UChar** pp, const UChar* end)
384 return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);
387 #endif
/*
 * Disabled code: a two-parameter variant of is_code_ctype(), excluded from
 * compilation by #if 0. For codes below 128 it defers to
 * ONIGENC_IS_ASCII_CODE_CTYPE; for other codes it answers TRUE for
 * word/graph/print ctypes only when code_to_mbclen() reports more than one
 * byte, and FALSE otherwise.
 * NOTE(review): code_to_mbclen() is called here with one argument, while the
 * definition in this file takes two; this would need updating if re-enabled.
 */
389 #if 0
390 static int
391 is_code_ctype(OnigCodePoint code, unsigned int ctype)
393 if (code < 128)
394 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
395 else {
396 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
397 return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
401 return FALSE;
403 #endif
/*
 * left_adjust_char_head: given a position s inside the text beginning at
 * start, return a pointer at or before s chosen by the steps below.
 *
 *  start - beginning of the text; the backward scan never goes below it
 *  s     - position to adjust
 *  end   - passed through to mbc_enc_len() as the input limit
 *  enc   - passed through to mbc_enc_len()
 *
 * The const qualifier is cast away on every return path.
 */
405 static UChar*
406 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
408 const UChar *p;
409 int len;
/* Nothing precedes s, so it is returned unchanged. */
411 if (s <= start) return (UChar* )s;
412 p = s;
/* Only a byte that can be a trail byte may belong to an earlier character.
 * Step backwards over the run of lead-capable bytes before s; the scan stops
 * just after the first byte that is not lead-capable, or at start.
 * The pre-decrement happens inside the macro argument, so it relies on
 * SJIS_ISMB_FIRST evaluating its argument exactly once. */
414 if (SJIS_ISMB_TRAIL(*p)) {
415 while (p > start) {
416 if (! SJIS_ISMB_FIRST(*--p)) {
417 p++;
418 break;
/* Measure the character at the scan position. If it extends past s, its
 * first byte is the answer. */
422 len = mbc_enc_len(p, end, enc);
423 if (p + len > s) return (UChar* )p;
/* Otherwise skip that character and move forward by the largest even number
 * of bytes not exceeding the remaining distance to s (the mask clears the
 * lowest bit). Presumably the skipped bytes form two-byte characters; TODO
 * confirm.
 * NOTE(review): mbc_enc_len() can also return the INVALID / NEEDMORE
 * encodings; their numeric values are not visible here, so how this
 * arithmetic behaves for them should be checked against regenc.h. */
424 p += len;
425 return (UChar* )(p + ((s - p) & ~1));
428 static int
429 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
431 const UChar c = *s;
432 return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
/*
 * Code range data for Hiragana: the values below form one (first, last)
 * pair of two-byte codes.
 * NOTE(review): the embedded numbering skips one line (437) right after the
 * opening brace, so a leading element may be missing from this listing;
 * confirm against the upstream file before relying on the layout.
 */
436 static const OnigCodePoint CR_Hiragana[] = {
438 0x829f, 0x82f1
439 }; /* CR_Hiragana */
/*
 * Code range data for Katakana: (first, last) pairs, two single-byte ranges
 * (0xa6-0xaf, 0xb1-0xdd) followed by two two-byte ranges.
 * NOTE(review): the embedded numbering skips one line (442) right after the
 * opening brace, so a leading element may be missing from this listing;
 * confirm against the upstream file before relying on the layout.
 */
441 static const OnigCodePoint CR_Katakana[] = {
443 0x00a6, 0x00af,
444 0x00b1, 0x00dd,
445 0x8340, 0x837e,
446 0x8380, 0x8396,
447 }; /* CR_Katakana */
/*
 * Code range data for Han: (first, last) pairs of two-byte codes.
 * Two variants are provided: when ENC_CP932 is defined, the table
 * additionally contains the NEC-selected and IBM extended character ranges;
 * otherwise only the first four ranges are present.
 * NOTE(review): in both variants the embedded numbering skips one line
 * (451 and 461) right after the opening brace, so a leading element may be
 * missing from this listing; confirm against the upstream file.
 */
449 #ifdef ENC_CP932
450 static const OnigCodePoint CR_Han[] = {
452 0x8157, 0x8157,
453 0x889f, 0x9872, /* Kanji level 1 */
454 0x989f, 0x9ffc, /* Kanji level 2 */
455 0xe040, 0xeaa4, /* Kanji level 2 */
456 0xed40, 0xeeec, /* NEC-selected IBM extended characters (without symbols) */
457 0xfa5c, 0xfc4b, /* IBM extended characters (without symbols) */
458 }; /* CR_Han */
459 #else
460 static const OnigCodePoint CR_Han[] = {
462 0x8157, 0x8157,
463 0x889f, 0x9872, /* Kanji level 1 */
464 0x989f, 0x9ffc, /* Kanji level 2 */
465 0xe040, 0xeaa4, /* Kanji level 2 */
466 }; /* CR_Han */
467 #endif
/*
 * Code range data for Latin: (first, last) pairs covering ASCII A-Z and a-z
 * plus the two fullwidth alphabet ranges also handled by get_lower_case()
 * and get_upper_case().
 * NOTE(review): the embedded numbering skips one line (470) right after the
 * opening brace, so a leading element may be missing from this listing;
 * confirm against the upstream file before relying on the layout.
 */
469 static const OnigCodePoint CR_Latin[] = {
471 0x0041, 0x005a,
472 0x0061, 0x007a,
473 0x8260, 0x8279,
474 0x8281, 0x829a,
475 }; /* CR_Latin */
/*
 * Code range data for Greek: two (first, last) pairs, matching the Greek
 * ranges handled by get_lower_case() and get_upper_case().
 * NOTE(review): the embedded numbering skips one line (478) right after the
 * opening brace, so a leading element may be missing from this listing;
 * confirm against the upstream file before relying on the layout.
 */
477 static const OnigCodePoint CR_Greek[] = {
479 0x839f, 0x83b6,
480 0x83bf, 0x83d6,
481 }; /* CR_Greek */
/*
 * Code range data for Cyrillic: three (first, last) pairs.
 * The second pair ends at 0x847f, whereas get_upper_case() treats the
 * lower-case block as 0x8470-0x847e and 0x8480-0x8491; 0x7f is not flagged
 * as a trail byte in SJIS_CAN_BE_TRAIL_TABLE.
 * NOTE(review): the embedded numbering skips one line (484) right after the
 * opening brace, so a leading element may be missing from this listing;
 * confirm against the upstream file before relying on the layout.
 */
483 static const OnigCodePoint CR_Cyrillic[] = {
485 0x8440, 0x8460,
486 0x8470, 0x847f,
487 0x8480, 0x8491,
488 }; /* CR_Cyrillic */
490 #include "enc/jis/props.h"
492 static int
493 property_name_to_ctype(OnigEncoding enc, const UChar* p, const UChar* end)
495 const UChar *s = p, *e = end;
496 const struct enc_property *prop =
497 onig_jis_property((const char* )s, (unsigned int )(e - s));
499 if (!prop) {
500 return onigenc_minimum_property_name_to_ctype(enc, s, e);
503 return (int )prop->ctype;
506 static int
507 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
509 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
510 if (code < 128)
511 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
512 else {
513 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
514 return TRUE;
518 else {
519 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
520 if (ctype >= (unsigned int )PropertyListNum)
521 return ONIGERR_TYPE_BUG;
523 return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
526 return FALSE;
/*
 * get_ctype_code_range: hand out the code range table for an extended ctype.
 *
 *  ctype  - character type; values up to ONIGENC_MAX_STD_CTYPE are refused
 *           with ONIG_NO_SUPPORT_CONFIG
 *  sb_out - receives 0x80 for every extended ctype; it is written before the
 *           index is range-checked, so it is set even when ONIGERR_TYPE_BUG
 *           is returned
 *  ranges - receives the matching PropertyList entry on success
 *
 * Returns 0 on success, ONIG_NO_SUPPORT_CONFIG for standard ctypes, and
 * ONIGERR_TYPE_BUG when the rebased index is at or beyond PropertyListNum.
 * PropertyList and PropertyListNum are not defined in the visible part of
 * this file; presumably they come from "enc/jis/props.h" (TODO confirm).
 *
 * NOTE(review): the listing ends at the final return statement; the closing
 * lines of the function are not visible here.
 */
529 static int
530 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
531 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
533 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
534 return ONIG_NO_SUPPORT_CONFIG;
536 else {
537 *sb_out = 0x80;
/* Rebase so that the first extended ctype maps to PropertyList[0]. */
539 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
540 if (ctype >= (OnigCtype )PropertyListNum)
541 return ONIGERR_TYPE_BUG;
543 *ranges = PropertyList[ctype];
544 return 0;