* 2022-01-18 [ci skip]
[ruby-80x24.org.git] / regenc.c
blob16d62fdf4098c444a0a32fb7da65e0c61c77da40
/**********************************************************************
  regenc.c -  Onigmo (Oniguruma-mod) (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * Copyright (c) 2011-2016  K.Takata  <kentkt AT csc DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
31 #include "regint.h"
33 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
/*
 * Initialise the encoding layer.  Nothing needs to be set up here, so the
 * call always succeeds.
 *
 * Returns 0.
 */
extern int
onigenc_init(void)
{
  return 0;
}
41 extern OnigEncoding
42 onigenc_get_default_encoding(void)
44 return OnigEncDefaultCharEncoding;
47 extern int
48 onigenc_set_default_encoding(OnigEncoding enc)
50 OnigEncDefaultCharEncoding = enc;
51 return 0;
54 extern int
55 onigenc_mbclen_approximate(const OnigUChar* p,const OnigUChar* e, OnigEncoding enc)
57 int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e);
58 if (ONIGENC_MBCLEN_CHARFOUND_P(ret))
59 return ONIGENC_MBCLEN_CHARFOUND_LEN(ret);
60 else if (ONIGENC_MBCLEN_NEEDMORE_P(ret))
61 return (int )(e - p) + ONIGENC_MBCLEN_NEEDMORE_LEN(ret);
62 return 1;
65 extern UChar*
66 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
68 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
69 if (p < s) {
70 p += enclen(enc, p, end);
72 return p;
75 extern UChar*
76 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
77 const UChar* start, const UChar* s, const UChar* end, const UChar** prev)
79 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
81 if (p < s) {
82 if (prev) *prev = (const UChar* )p;
83 p += enclen(enc, p, end);
85 else {
86 if (prev) *prev = (const UChar* )NULL; /* Sorry */
88 return p;
91 extern UChar*
92 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
94 if (s <= start)
95 return (UChar* )NULL;
97 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1, end);
100 extern UChar*
101 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end, int n)
103 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
104 if (s <= start)
105 return (UChar* )NULL;
107 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1, end);
109 return (UChar* )s;
112 extern UChar*
113 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
115 UChar* q = (UChar* )p;
116 while (n-- > 0) {
117 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
119 return (q <= end ? q : NULL);
122 extern int
123 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
125 int n = 0;
126 UChar* q = (UChar* )p;
128 while (q < end) {
129 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
130 n++;
132 return n;
135 extern int
136 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
138 int n = 0;
139 UChar* p = (UChar* )s;
140 UChar* e;
142 while (1) {
143 if (*p == '\0') {
144 UChar* q;
145 int len = ONIGENC_MBC_MINLEN(enc);
147 if (len == 1) return n;
148 q = p + 1;
149 while (len > 1) {
150 if (*q != '\0') break;
151 q++;
152 len--;
154 if (len == 1) return n;
156 e = p + ONIGENC_MBC_MAXLEN(enc);
157 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
158 n++;
162 extern int
163 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
165 UChar* start = (UChar* )s;
166 UChar* p = (UChar* )s;
167 UChar* e;
169 while (1) {
170 if (*p == '\0') {
171 UChar* q;
172 int len = ONIGENC_MBC_MINLEN(enc);
174 if (len == 1) return (int )(p - start);
175 q = p + 1;
176 while (len > 1) {
177 if (*q != '\0') break;
178 q++;
179 len--;
181 if (len == 1) return (int )(p - start);
183 e = p + ONIGENC_MBC_MAXLEN(enc);
184 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
188 const UChar OnigEncAsciiToLowerCaseTable[] = {
189 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
190 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
191 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
192 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
193 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
194 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
195 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
196 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
197 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
198 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
199 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
200 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
201 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
202 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
203 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
204 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
205 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
206 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
207 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
208 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
209 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
210 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
211 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
212 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
213 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
214 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
215 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
216 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
217 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
218 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
219 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
220 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte-indexed upper-casing table for ASCII: entries 'a'..'z'
 * (0x61..0x7a) hold 'A'..'Z'; every other entry maps to itself.
 * Only compiled when USE_UPPER_CASE_TABLE is defined.
 */
const UChar OnigEncAsciiToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#endif
/*
 * Character-class bit masks for byte values 0..255.  Each entry is a bit
 * set with one bit per ctype; entries 0x80..0xff are 0, i.e. non-ASCII
 * bytes belong to no class.
 * NOTE(review): the bit assignment is defined outside this file,
 * presumably by the ONIGENC_CTYPE_* constants; confirm before editing
 * values.
 */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
};
295 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
296 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
297 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
298 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
299 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
300 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
301 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
302 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
303 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
304 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
305 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
306 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
307 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
308 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
309 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
310 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
311 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
312 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
313 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
314 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
315 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
316 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
317 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
318 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
319 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
320 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
321 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
322 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
323 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
324 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
325 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
326 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
327 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
#ifdef USE_UPPER_CASE_TABLE
/*
 * Byte-indexed upper-casing table for ISO-8859-1.  In addition to the
 * ASCII range, 0xe0..0xfe map to 0xc0..0xde, except 0xf7, which maps to
 * itself; 0xff also maps to itself.
 * Only compiled when USE_UPPER_CASE_TABLE is defined.
 */
const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
  '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
  '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
  '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
  '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
};
#endif
#if 0
/* Obsolete no-op kept for reference; compiled out. */
extern void
onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
{
  /* nothing */
  /* obsoleted. */
}
#endif
376 extern UChar*
377 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
379 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
382 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
383 { 0x41, 0x61 },
384 { 0x42, 0x62 },
385 { 0x43, 0x63 },
386 { 0x44, 0x64 },
387 { 0x45, 0x65 },
388 { 0x46, 0x66 },
389 { 0x47, 0x67 },
390 { 0x48, 0x68 },
391 { 0x49, 0x69 },
392 { 0x4a, 0x6a },
393 { 0x4b, 0x6b },
394 { 0x4c, 0x6c },
395 { 0x4d, 0x6d },
396 { 0x4e, 0x6e },
397 { 0x4f, 0x6f },
398 { 0x50, 0x70 },
399 { 0x51, 0x71 },
400 { 0x52, 0x72 },
401 { 0x53, 0x73 },
402 { 0x54, 0x74 },
403 { 0x55, 0x75 },
404 { 0x56, 0x76 },
405 { 0x57, 0x77 },
406 { 0x58, 0x78 },
407 { 0x59, 0x79 },
408 { 0x5a, 0x7a }
411 extern int
412 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
413 OnigApplyAllCaseFoldFunc f, void* arg,
414 OnigEncoding enc ARG_UNUSED)
416 OnigCodePoint code;
417 int i, r;
419 for (i = 0; i < numberof(OnigAsciiLowerMap); i++) {
420 code = OnigAsciiLowerMap[i].to;
421 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
422 if (r != 0) return r;
424 code = OnigAsciiLowerMap[i].from;
425 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
426 if (r != 0) return r;
429 return 0;
432 extern int
433 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
434 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
435 OnigCaseFoldCodeItem items[], OnigEncoding enc ARG_UNUSED)
437 if (0x41 <= *p && *p <= 0x5a) {
438 items[0].byte_len = 1;
439 items[0].code_len = 1;
440 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
441 return 1;
443 else if (0x61 <= *p && *p <= 0x7a) {
444 items[0].byte_len = 1;
445 items[0].code_len = 1;
446 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
447 return 1;
449 else
450 return 0;
453 static int
454 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
455 OnigApplyAllCaseFoldFunc f, void* arg)
457 OnigCodePoint ss[] = { 0x73, 0x73 };
459 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
462 extern int
463 onigenc_apply_all_case_fold_with_map(int map_size,
464 const OnigPairCaseFoldCodes map[],
465 int ess_tsett_flag, OnigCaseFoldType flag,
466 OnigApplyAllCaseFoldFunc f, void* arg)
468 OnigCodePoint code;
469 int i, r;
471 r = onigenc_ascii_apply_all_case_fold(flag, f, arg, 0);
472 if (r != 0) return r;
474 for (i = 0; i < map_size; i++) {
475 code = map[i].to;
476 r = (*f)(map[i].from, &code, 1, arg);
477 if (r != 0) return r;
479 code = map[i].from;
480 r = (*f)(map[i].to, &code, 1, arg);
481 if (r != 0) return r;
484 if (ess_tsett_flag != 0)
485 return ss_apply_all_case_fold(flag, f, arg);
487 return 0;
490 extern int
491 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
492 const OnigPairCaseFoldCodes map[],
493 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
494 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
496 if (0x41 <= *p && *p <= 0x5a) {
497 items[0].byte_len = 1;
498 items[0].code_len = 1;
499 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
500 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
501 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
502 /* SS */
503 items[1].byte_len = 2;
504 items[1].code_len = 1;
505 items[1].code[0] = (OnigCodePoint )0xdf;
506 return 2;
508 else
509 return 1;
511 else if (0x61 <= *p && *p <= 0x7a) {
512 items[0].byte_len = 1;
513 items[0].code_len = 1;
514 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
515 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
516 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
517 /* ss */
518 items[1].byte_len = 2;
519 items[1].code_len = 1;
520 items[1].code[0] = (OnigCodePoint )0xdf;
521 return 2;
523 else
524 return 1;
526 else if (*p == 0xdf && ess_tsett_flag != 0) {
527 items[0].byte_len = 1;
528 items[0].code_len = 2;
529 items[0].code[0] = (OnigCodePoint )'s';
530 items[0].code[1] = (OnigCodePoint )'s';
532 items[1].byte_len = 1;
533 items[1].code_len = 2;
534 items[1].code[0] = (OnigCodePoint )'S';
535 items[1].code[1] = (OnigCodePoint )'S';
537 items[2].byte_len = 1;
538 items[2].code_len = 2;
539 items[2].code[0] = (OnigCodePoint )'s';
540 items[2].code[1] = (OnigCodePoint )'S';
542 items[3].byte_len = 1;
543 items[3].code_len = 2;
544 items[3].code[0] = (OnigCodePoint )'S';
545 items[3].code[1] = (OnigCodePoint )'s';
547 return 4;
549 else {
550 int i;
552 for (i = 0; i < map_size; i++) {
553 if (*p == map[i].from) {
554 items[0].byte_len = 1;
555 items[0].code_len = 1;
556 items[0].code[0] = map[i].to;
557 return 1;
559 else if (*p == map[i].to) {
560 items[0].byte_len = 1;
561 items[0].code_len = 1;
562 items[0].code[0] = map[i].from;
563 return 1;
568 return 0;
572 extern int
573 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
574 OnigCodePoint* sb_out ARG_UNUSED,
575 const OnigCodePoint* ranges[] ARG_UNUSED,
576 OnigEncoding enc)
578 return ONIG_NO_SUPPORT_CONFIG;
581 extern int
582 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED)
584 if (p < end) {
585 if (*p == 0x0a) return 1;
587 return 0;
590 /* for single byte encodings */
591 extern int
592 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
593 const UChar* end, UChar* lower, OnigEncoding enc ARG_UNUSED)
595 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
597 (*p)++;
598 return 1; /* return byte length of converted char to lower */
#if 0
/* Compiled out.  Advances *pp by one byte and reports whether the byte it
 * pointed at is case-ambiguous in ASCII. */
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag ARG_UNUSED,
                               const UChar** pp, const UChar* end ARG_UNUSED)
{
  const UChar* p = *pp;

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
613 extern int
614 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED, const UChar* e ARG_UNUSED,
615 OnigEncoding enc ARG_UNUSED)
617 return 1;
620 extern OnigCodePoint
621 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
622 OnigEncoding enc ARG_UNUSED)
624 return (OnigCodePoint )(*p);
627 extern int
628 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
630 return 1;
633 extern int
634 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
636 #ifdef RUBY
637 if (code > 0xff)
638 rb_raise(rb_eRangeError, "%u out of char range", code);
639 #endif
640 *buf = (UChar )(code & 0xff);
641 return 1;
644 extern UChar*
645 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
646 const UChar* s,
647 const UChar* end ARG_UNUSED,
648 OnigEncoding enc ARG_UNUSED)
650 return (UChar* )s;
653 extern int
654 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
655 const UChar* end ARG_UNUSED,
656 OnigEncoding enc ARG_UNUSED)
658 return TRUE;
661 extern int
662 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
663 const UChar* end ARG_UNUSED,
664 OnigEncoding enc ARG_UNUSED)
666 return FALSE;
669 extern int
670 onigenc_ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype,
671 OnigEncoding enc ARG_UNUSED)
673 if (code < 128)
674 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
675 else
676 return FALSE;
679 extern OnigCodePoint
680 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
682 int c, i, len;
683 OnigCodePoint n;
685 len = enclen(enc, p, end);
686 n = (OnigCodePoint )(*p++);
687 if (len == 1) return n;
689 for (i = 1; i < len; i++) {
690 if (p >= end) break;
691 c = *p++;
692 n <<= 8; n += c;
694 return n;
697 extern int
698 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
699 const UChar** pp, const UChar* end ARG_UNUSED,
700 UChar* lower)
702 int len;
703 const UChar *p = *pp;
705 if (ONIGENC_IS_MBC_ASCII(p)) {
706 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
707 (*pp)++;
708 return 1;
710 else {
711 int i;
713 len = enclen(enc, p, end);
714 for (i = 0; i < len; i++) {
715 *lower++ = *p++;
717 (*pp) += len;
718 return len; /* return byte length of converted to lower char */
#if 0
/* Compiled out.  Advances *pp past one character; reports case ambiguity
 * for ASCII characters and FALSE for everything else. */
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
                             const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  if (ONIGENC_IS_MBC_ASCII(p)) {
    (*pp)++;
    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
  }

  /* enclen() takes the end pointer everywhere else in this file. */
  (*pp) += enclen(enc, p, end);
  return FALSE;
}
#endif
739 extern int
740 onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
742 if (code <= 0xff) return 1;
743 if (code <= 0xffff) return 2;
744 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
747 extern int
748 onigenc_mb4_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
750 if ((code & 0xff000000) != 0) return 4;
751 else if ((code & 0xff0000) != 0) return 3;
752 else if ((code & 0xff00) != 0) return 2;
753 else return 1;
756 extern int
757 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
759 UChar *p = buf;
761 if ((code & 0xff00) != 0) {
762 *p++ = (UChar )((code >> 8) & 0xff);
764 *p++ = (UChar )(code & 0xff);
766 #if 1
767 if (enclen(enc, buf, p) != (p - buf))
768 return ONIGERR_INVALID_CODE_POINT_VALUE;
769 #endif
770 return (int )(p - buf);
773 extern int
774 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
776 UChar *p = buf;
778 if ((code & 0xff000000) != 0) {
779 *p++ = (UChar )((code >> 24) & 0xff);
781 if ((code & 0xff0000) != 0 || p != buf) {
782 *p++ = (UChar )((code >> 16) & 0xff);
784 if ((code & 0xff00) != 0 || p != buf) {
785 *p++ = (UChar )((code >> 8) & 0xff);
787 *p++ = (UChar )(code & 0xff);
789 #if 1
790 if (enclen(enc, buf, p) != (p - buf))
791 return ONIGERR_INVALID_CODE_POINT_VALUE;
792 #endif
793 return (int )(p - buf);
796 extern int
797 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, const UChar* p, const UChar* end)
799 static const PosixBracketEntryType PBS[] = {
800 POSIX_BRACKET_ENTRY_INIT("Alnum", ONIGENC_CTYPE_ALNUM),
801 POSIX_BRACKET_ENTRY_INIT("Alpha", ONIGENC_CTYPE_ALPHA),
802 POSIX_BRACKET_ENTRY_INIT("Blank", ONIGENC_CTYPE_BLANK),
803 POSIX_BRACKET_ENTRY_INIT("Cntrl", ONIGENC_CTYPE_CNTRL),
804 POSIX_BRACKET_ENTRY_INIT("Digit", ONIGENC_CTYPE_DIGIT),
805 POSIX_BRACKET_ENTRY_INIT("Graph", ONIGENC_CTYPE_GRAPH),
806 POSIX_BRACKET_ENTRY_INIT("Lower", ONIGENC_CTYPE_LOWER),
807 POSIX_BRACKET_ENTRY_INIT("Print", ONIGENC_CTYPE_PRINT),
808 POSIX_BRACKET_ENTRY_INIT("Punct", ONIGENC_CTYPE_PUNCT),
809 POSIX_BRACKET_ENTRY_INIT("Space", ONIGENC_CTYPE_SPACE),
810 POSIX_BRACKET_ENTRY_INIT("Upper", ONIGENC_CTYPE_UPPER),
811 POSIX_BRACKET_ENTRY_INIT("XDigit", ONIGENC_CTYPE_XDIGIT),
812 POSIX_BRACKET_ENTRY_INIT("ASCII", ONIGENC_CTYPE_ASCII),
813 POSIX_BRACKET_ENTRY_INIT("Word", ONIGENC_CTYPE_WORD),
816 const PosixBracketEntryType *pb;
817 int len;
819 len = onigenc_strlen(enc, p, end);
820 for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
821 if (len == pb->len &&
822 onigenc_with_ascii_strnicmp(enc, p, end, pb->name, pb->len) == 0)
823 return pb->ctype;
826 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
829 extern int
830 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
831 unsigned int ctype)
833 if (code < 128)
834 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
835 else {
836 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
837 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
841 return FALSE;
844 extern int
845 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
846 unsigned int ctype)
848 if (code < 128)
849 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
850 else {
851 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
852 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
856 return FALSE;
859 extern int
860 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
861 const UChar* sascii /* ascii */, int n)
863 int x, c;
865 while (n-- > 0) {
866 if (p >= end) return (int )(*sascii);
868 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
869 x = *sascii - c;
870 if (x) return x;
872 sascii++;
873 p += enclen(enc, p, end);
875 return 0;
878 extern int
879 onigenc_with_ascii_strnicmp(OnigEncoding enc, const UChar* p, const UChar* end,
880 const UChar* sascii /* ascii */, int n)
882 int x, c;
884 while (n-- > 0) {
885 if (p >= end) return (int )(*sascii);
887 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
888 if (ONIGENC_IS_ASCII_CODE(c))
889 c = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
890 x = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*sascii) - c;
891 if (x) return x;
893 sascii++;
894 p += enclen(enc, p, end);
896 return 0;
#if 0
/* Property management */

/* Compiled out.  Grow (or first allocate) *plist to hold new_size
 * entries; on success stores the new list and size, on allocation failure
 * returns ONIGERR_MEMORY and leaves *plist and *psize untouched. */
static int
resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
{
  size_t size;
  const OnigCodePoint **list = *plist;

  size = sizeof(OnigCodePoint*) * new_size;
  if (IS_NULL(list)) {
    list = (const OnigCodePoint** )xmalloc(size);
    if (IS_NULL(list)) return ONIGERR_MEMORY;
  }
  else {
    const OnigCodePoint **tmp;
    tmp = (const OnigCodePoint** )xrealloc((void* )list, size);
    if (IS_NULL(tmp)) return ONIGERR_MEMORY;
    list = tmp;
  }

  *plist = list;
  *psize = new_size;

  return 0;
}

/* Compiled out.  Append prop to the list (doubling its capacity when
 * full, starting at PROP_INIT_SIZE) and register name in *table, creating
 * the table on first use.  The value stored for name is the new count
 * plus ONIGENC_MAX_STD_CTYPE.  Returns 0 or ONIGERR_MEMORY. */
extern int
onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
     hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
     int *psize)
{
#define PROP_INIT_SIZE     16

  int r;

  if (*psize <= *pnum) {
    int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
    r = resize_property_list(new_size, plist, psize);
    if (r != 0) return r;
  }

  (*plist)[*pnum] = prop;

  if (ONIG_IS_NULL(*table)) {
    *table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
    if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
  }

  *pnum = *pnum + 1;
  onig_st_insert_strend(*table, name, name + strlen((char* )name),
                        (hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
  return 0;
}
#endif
954 extern int
955 onigenc_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end,
956 OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc)
958 OnigCodePoint code;
959 OnigUChar *to_start = to;
960 OnigCaseFoldType flags = *flagP;
961 int codepoint_length;
963 while (*pp < end && to < to_end) {
964 codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
965 if (codepoint_length < 0)
966 return codepoint_length; /* encoding invalid */
967 code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
968 *pp += codepoint_length;
970 if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) {
971 flags |= ONIGENC_CASE_MODIFIED;
972 code += 'A' - 'a';
973 } else if (code >= 'A' && code <= 'Z' &&
974 (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
975 flags |= ONIGENC_CASE_MODIFIED;
976 code += 'a' - 'A';
978 to += ONIGENC_CODE_TO_MBC(enc, code, to);
979 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
980 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
982 *flagP = flags;
983 return (int )(to - to_start);
986 extern int
987 onigenc_single_byte_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp,
988 const OnigUChar* end, OnigUChar* to, OnigUChar* to_end,
989 const struct OnigEncodingTypeST* enc)
991 OnigCodePoint code;
992 OnigUChar *to_start = to;
993 OnigCaseFoldType flags = *flagP;
995 while (*pp < end && to < to_end) {
996 code = *(*pp)++;
998 if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) {
999 flags |= ONIGENC_CASE_MODIFIED;
1000 code += 'A' - 'a';
1001 } else if (code >= 'A' && code <= 'Z' &&
1002 (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
1003 flags |= ONIGENC_CASE_MODIFIED;
1004 code += 'a' - 'A';
1006 *to++ = code;
1007 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
1008 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
1010 *flagP = flags;
1011 return (int )(to - to_start);