Tempfile document updated.
[ruby.git] / regenc.c
blob eb523e1ae530a8f811abe7e06a60dfccddc74a55
1 /**********************************************************************
2 regenc.c - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
31 #include "regint.h"
/* Default character encoding, initialised to ONIG_ENCODING_INIT_DEFAULT.
 * Returned by onigenc_get_default_encoding() and overwritten by
 * onigenc_set_default_encoding(). */
33 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
/*
 * Initialise the encoding layer.  No work is required here, so the
 * function unconditionally reports success by returning 0.
 */
extern int
onigenc_init(void)
{
  const int status = 0;

  return status;
}
41 extern OnigEncoding
42 onigenc_get_default_encoding(void)
44 return OnigEncDefaultCharEncoding;
47 extern int
48 onigenc_set_default_encoding(OnigEncoding enc)
50 OnigEncDefaultCharEncoding = enc;
51 return 0;
54 extern int
55 onigenc_mbclen(const OnigUChar* p,const OnigUChar* e, OnigEncoding enc)
57 int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e);
58 if (ONIGENC_MBCLEN_CHARFOUND_P(ret)) {
59 ret = ONIGENC_MBCLEN_CHARFOUND_LEN(ret);
60 if (p + ret > e) ret = (int)(e - p); // just for case
61 return ret;
63 else if (ONIGENC_MBCLEN_NEEDMORE_P(ret)) {
64 return (int)(e - p);
66 return p < e ? 1 : 0;
69 extern int
70 onigenc_mbclen_approximate(const OnigUChar* p,const OnigUChar* e, OnigEncoding enc)
72 int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e);
73 if (ONIGENC_MBCLEN_CHARFOUND_P(ret))
74 return ONIGENC_MBCLEN_CHARFOUND_LEN(ret);
75 else if (ONIGENC_MBCLEN_NEEDMORE_P(ret))
76 return (int )(e - p) + ONIGENC_MBCLEN_NEEDMORE_LEN(ret);
77 return 1;
80 extern UChar*
81 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
83 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
84 if (p < s) {
85 p += enclen(enc, p, end);
87 return p;
90 extern UChar*
91 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
92 const UChar* start, const UChar* s, const UChar* end, const UChar** prev)
94 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
96 if (p < s) {
97 if (prev) *prev = (const UChar* )p;
98 p += enclen(enc, p, end);
100 else {
101 if (prev) *prev = (const UChar* )NULL; /* Sorry */
103 return p;
106 extern UChar*
107 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
109 if (s <= start)
110 return (UChar* )NULL;
112 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1, end);
115 extern UChar*
116 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end, int n)
118 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
119 if (s <= start)
120 return (UChar* )NULL;
122 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1, end);
124 return (UChar* )s;
127 extern UChar*
128 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
130 UChar* q = (UChar* )p;
131 while (n-- > 0) {
132 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
134 return (q <= end ? q : NULL);
137 extern int
138 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
140 int n = 0;
141 UChar* q = (UChar* )p;
143 while (q < end) {
144 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
145 n++;
147 return n;
150 extern int
151 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
153 int n = 0;
154 UChar* p = (UChar* )s;
155 UChar* e;
157 while (1) {
158 if (*p == '\0') {
159 UChar* q;
160 int len = ONIGENC_MBC_MINLEN(enc);
162 if (len == 1) return n;
163 q = p + 1;
164 while (len > 1) {
165 if (*q != '\0') break;
166 q++;
167 len--;
169 if (len == 1) return n;
171 e = p + ONIGENC_MBC_MAXLEN(enc);
172 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
173 n++;
177 extern int
178 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
180 UChar* start = (UChar* )s;
181 UChar* p = (UChar* )s;
182 UChar* e;
184 while (1) {
185 if (*p == '\0') {
186 UChar* q;
187 int len = ONIGENC_MBC_MINLEN(enc);
189 if (len == 1) return (int )(p - start);
190 q = p + 1;
191 while (len > 1) {
192 if (*q != '\0') break;
193 q++;
194 len--;
196 if (len == 1) return (int )(p - start);
198 e = p + ONIGENC_MBC_MAXLEN(enc);
199 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
/* Byte-indexed lower-casing table for ASCII: entries '\101'-'\132'
 * ('A'-'Z') hold '\141'-'\172' ('a'-'z'); every other entry holds its
 * own index.  Eight entries per row, 32 rows. */
203 const UChar OnigEncAsciiToLowerCaseTable[] = {
204 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
205 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
206 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
207 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
208 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
209 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
210 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
211 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
212 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
213 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
214 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
215 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
216 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
217 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
218 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
219 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
220 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
221 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
222 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
223 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
224 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
225 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
226 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
227 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
228 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
229 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
230 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
231 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
232 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
233 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
234 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
235 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
238 #ifdef USE_UPPER_CASE_TABLE
/* Byte-indexed upper-casing table for ASCII, compiled only when
 * USE_UPPER_CASE_TABLE is defined: entries '\141'-'\172' ('a'-'z') hold
 * '\101'-'\132' ('A'-'Z'); every other entry holds its own index. */
239 const UChar OnigEncAsciiToUpperCaseTable[256] = {
240 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
241 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
242 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
243 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
244 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
245 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
246 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
247 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
248 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
249 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
250 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
251 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
252 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
253 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
254 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
255 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
256 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
257 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
258 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
259 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
260 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
261 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
262 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
263 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
264 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
265 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
266 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
267 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
268 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
269 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
270 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
271 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
273 #endif
/* Byte-indexed table of 16-bit values, one per byte 0x00-0xff.
 * The first 128 entries (ASCII) carry non-zero masks; entries for
 * 0x80-0xff are all 0x0000.
 * NOTE(review): presumably each bit corresponds to an ONIGENC_CTYPE_*
 * class tested by ONIGENC_IS_ASCII_CODE_CTYPE -- confirm against the
 * macro definition, which is not visible in this file. */
275 const unsigned short OnigEncAsciiCtypeTable[256] = {
276 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
277 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
278 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
279 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
280 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
281 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
282 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
283 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
284 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
285 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
286 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
287 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
288 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
289 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
290 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
291 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
292 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
293 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
294 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
295 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
296 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
297 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
298 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
299 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
300 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
301 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
302 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
303 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
304 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
305 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
306 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
307 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
/* Byte-indexed lower-casing table for ISO-8859-1.  In addition to the
 * ASCII range ('\101'-'\132' -> '\141'-'\172'), entries '\300'-'\336'
 * hold '\340'-'\376', except '\327', which holds itself.  '\337' and
 * every remaining entry also hold their own index. */
310 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
311 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
312 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
313 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
314 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
315 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
316 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
317 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
318 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
319 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
320 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
321 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
322 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
323 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
324 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
325 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
326 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
327 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
328 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
329 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
330 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
331 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
332 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
333 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
334 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
335 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
336 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
337 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
338 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
339 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
340 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
341 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
342 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
345 #ifdef USE_UPPER_CASE_TABLE
/* Byte-indexed upper-casing table for ISO-8859-1, compiled only when
 * USE_UPPER_CASE_TABLE is defined.  In addition to the ASCII range
 * ('\141'-'\172' -> '\101'-'\132'), entries '\340'-'\376' hold
 * '\300'-'\336', except '\367', which holds itself.  '\337', '\377'
 * and every remaining entry also hold their own index. */
346 const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
347 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
348 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
349 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
350 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
351 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
352 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
353 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
354 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
355 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
356 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
357 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
358 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
359 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
360 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
361 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
362 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
363 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
364 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
365 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
366 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
367 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
368 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
369 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
370 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
371 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
372 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
373 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
374 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
375 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
376 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
377 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
378 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
380 #endif
382 #if 0
/* Disabled code (enclosed in #if 0): an empty stub whose body contains
 * only the two comments below; the `table` argument is ignored. */
383 extern void
384 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
386 /* nothing */
387 /* obsoleted. */
389 #endif
391 extern UChar*
392 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
394 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
397 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
398 { 0x41, 0x61 },
399 { 0x42, 0x62 },
400 { 0x43, 0x63 },
401 { 0x44, 0x64 },
402 { 0x45, 0x65 },
403 { 0x46, 0x66 },
404 { 0x47, 0x67 },
405 { 0x48, 0x68 },
406 { 0x49, 0x69 },
407 { 0x4a, 0x6a },
408 { 0x4b, 0x6b },
409 { 0x4c, 0x6c },
410 { 0x4d, 0x6d },
411 { 0x4e, 0x6e },
412 { 0x4f, 0x6f },
413 { 0x50, 0x70 },
414 { 0x51, 0x71 },
415 { 0x52, 0x72 },
416 { 0x53, 0x73 },
417 { 0x54, 0x74 },
418 { 0x55, 0x75 },
419 { 0x56, 0x76 },
420 { 0x57, 0x77 },
421 { 0x58, 0x78 },
422 { 0x59, 0x79 },
423 { 0x5a, 0x7a }
426 extern int
427 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
428 OnigApplyAllCaseFoldFunc f, void* arg,
429 OnigEncoding enc ARG_UNUSED)
431 OnigCodePoint code;
432 int i, r;
434 for (i = 0; i < numberof(OnigAsciiLowerMap); i++) {
435 code = OnigAsciiLowerMap[i].to;
436 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
437 if (r != 0) return r;
439 code = OnigAsciiLowerMap[i].from;
440 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
441 if (r != 0) return r;
444 return 0;
447 extern int
448 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
449 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
450 OnigCaseFoldCodeItem items[], OnigEncoding enc ARG_UNUSED)
452 if (0x41 <= *p && *p <= 0x5a) {
453 items[0].byte_len = 1;
454 items[0].code_len = 1;
455 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
456 return 1;
458 else if (0x61 <= *p && *p <= 0x7a) {
459 items[0].byte_len = 1;
460 items[0].code_len = 1;
461 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
462 return 1;
464 else
465 return 0;
468 static int
469 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
470 OnigApplyAllCaseFoldFunc f, void* arg)
472 OnigCodePoint ss[] = { 0x73, 0x73 };
474 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
477 extern int
478 onigenc_apply_all_case_fold_with_map(int map_size,
479 const OnigPairCaseFoldCodes map[],
480 int ess_tsett_flag, OnigCaseFoldType flag,
481 OnigApplyAllCaseFoldFunc f, void* arg)
483 OnigCodePoint code;
484 int i, r;
486 r = onigenc_ascii_apply_all_case_fold(flag, f, arg, 0);
487 if (r != 0) return r;
489 for (i = 0; i < map_size; i++) {
490 code = map[i].to;
491 r = (*f)(map[i].from, &code, 1, arg);
492 if (r != 0) return r;
494 code = map[i].from;
495 r = (*f)(map[i].to, &code, 1, arg);
496 if (r != 0) return r;
499 if (ess_tsett_flag != 0)
500 return ss_apply_all_case_fold(flag, f, arg);
502 return 0;
505 extern int
506 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
507 const OnigPairCaseFoldCodes map[],
508 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
509 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
511 if (0x41 <= *p && *p <= 0x5a) {
512 items[0].byte_len = 1;
513 items[0].code_len = 1;
514 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
515 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
516 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
517 /* SS */
518 items[1].byte_len = 2;
519 items[1].code_len = 1;
520 items[1].code[0] = (OnigCodePoint )0xdf;
521 return 2;
523 else
524 return 1;
526 else if (0x61 <= *p && *p <= 0x7a) {
527 items[0].byte_len = 1;
528 items[0].code_len = 1;
529 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
530 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
531 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
532 /* ss */
533 items[1].byte_len = 2;
534 items[1].code_len = 1;
535 items[1].code[0] = (OnigCodePoint )0xdf;
536 return 2;
538 else
539 return 1;
541 else if (*p == 0xdf && ess_tsett_flag != 0) {
542 items[0].byte_len = 1;
543 items[0].code_len = 2;
544 items[0].code[0] = (OnigCodePoint )'s';
545 items[0].code[1] = (OnigCodePoint )'s';
547 items[1].byte_len = 1;
548 items[1].code_len = 2;
549 items[1].code[0] = (OnigCodePoint )'S';
550 items[1].code[1] = (OnigCodePoint )'S';
552 items[2].byte_len = 1;
553 items[2].code_len = 2;
554 items[2].code[0] = (OnigCodePoint )'s';
555 items[2].code[1] = (OnigCodePoint )'S';
557 items[3].byte_len = 1;
558 items[3].code_len = 2;
559 items[3].code[0] = (OnigCodePoint )'S';
560 items[3].code[1] = (OnigCodePoint )'s';
562 return 4;
564 else {
565 int i;
567 for (i = 0; i < map_size; i++) {
568 if (*p == map[i].from) {
569 items[0].byte_len = 1;
570 items[0].code_len = 1;
571 items[0].code[0] = map[i].to;
572 return 1;
574 else if (*p == map[i].to) {
575 items[0].byte_len = 1;
576 items[0].code_len = 1;
577 items[0].code[0] = map[i].from;
578 return 1;
583 return 0;
587 extern int
588 onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
589 OnigCodePoint* sb_out ARG_UNUSED,
590 const OnigCodePoint* ranges[] ARG_UNUSED,
591 OnigEncoding enc)
593 return ONIG_NO_SUPPORT_CONFIG;
596 extern int
597 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED)
599 if (p < end) {
600 if (*p == 0x0a) return 1;
602 return 0;
605 /* for single byte encodings */
606 extern int
607 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
608 const UChar* end, UChar* lower, OnigEncoding enc ARG_UNUSED)
610 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
612 (*p)++;
613 return 1; /* return byte length of converted char to lower */
616 #if 0
/* Disabled code (enclosed in #if 0).  Advances *pp by one byte and
 * returns ONIGENC_IS_ASCII_CODE_CASE_AMBIG for the byte that *pp pointed
 * at on entry. */
617 extern int
618 onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag ARG_UNUSED,
619 const UChar** pp, const UChar* end ARG_UNUSED)
621 const UChar* p = *pp;
623 (*pp)++;
624 return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
626 #endif
628 extern int
629 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED, const UChar* e ARG_UNUSED,
630 OnigEncoding enc ARG_UNUSED)
632 return 1;
635 extern OnigCodePoint
636 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
637 OnigEncoding enc ARG_UNUSED)
639 return (OnigCodePoint )(*p);
642 extern int
643 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
645 return 1;
648 extern int
649 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
651 #ifdef RUBY
652 if (code > 0xff)
653 rb_raise(rb_eRangeError, "%u out of char range", code);
654 #endif
655 *buf = (UChar )(code & 0xff);
656 return 1;
659 extern UChar*
660 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
661 const UChar* s,
662 const UChar* end ARG_UNUSED,
663 OnigEncoding enc ARG_UNUSED)
665 return (UChar* )s;
668 extern int
669 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
670 const UChar* end ARG_UNUSED,
671 OnigEncoding enc ARG_UNUSED)
673 return TRUE;
676 extern int
677 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
678 const UChar* end ARG_UNUSED,
679 OnigEncoding enc ARG_UNUSED)
681 return FALSE;
684 extern int
685 onigenc_ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype,
686 OnigEncoding enc ARG_UNUSED)
688 if (code < 128)
689 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
690 else
691 return FALSE;
694 extern OnigCodePoint
695 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
697 int c, i, len;
698 OnigCodePoint n;
700 len = enclen(enc, p, end);
701 n = (OnigCodePoint )(*p++);
702 if (len == 1) return n;
704 for (i = 1; i < len; i++) {
705 if (p >= end) break;
706 c = *p++;
707 n <<= 8; n += c;
709 return n;
712 extern int
713 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
714 const UChar** pp, const UChar* end ARG_UNUSED,
715 UChar* lower)
717 int len;
718 const UChar *p = *pp;
720 if (ONIGENC_IS_MBC_ASCII(p)) {
721 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
722 (*pp)++;
723 return 1;
725 else {
726 int i;
728 len = enclen(enc, p, end);
729 for (i = 0; i < len; i++) {
730 *lower++ = *p++;
732 (*pp) += len;
733 return len; /* return byte length of converted to lower char */
737 #if 0
/* Disabled code (enclosed in #if 0).  For an ASCII lead byte it advances
 * *pp by one and returns ONIGENC_IS_ASCII_CODE_CASE_AMBIG of that byte;
 * otherwise it advances *pp by enclen and returns FALSE.
 * NOTE(review): enclen is called with two arguments here, whereas the
 * enabled code in this file passes three (enc, p, end). */
738 extern int
739 onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
740 const UChar** pp, const UChar* end ARG_UNUSED)
742 const UChar* p = *pp;
744 if (ONIGENC_IS_MBC_ASCII(p)) {
745 (*pp)++;
746 return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
749 (*pp) += enclen(enc, p);
750 return FALSE;
752 #endif
754 extern int
755 onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
757 if (code <= 0xff) return 1;
758 if (code <= 0xffff) return 2;
759 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
762 extern int
763 onigenc_mb4_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
765 if ((code & 0xff000000) != 0) return 4;
766 else if ((code & 0xff0000) != 0) return 3;
767 else if ((code & 0xff00) != 0) return 2;
768 else return 1;
771 extern int
772 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
774 UChar *p = buf;
776 if ((code & 0xff00) != 0) {
777 *p++ = (UChar )((code >> 8) & 0xff);
779 *p++ = (UChar )(code & 0xff);
781 #if 1
782 if (enclen(enc, buf, p) != (p - buf))
783 return ONIGERR_INVALID_CODE_POINT_VALUE;
784 #endif
785 return (int )(p - buf);
788 extern int
789 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
791 UChar *p = buf;
793 if ((code & 0xff000000) != 0) {
794 *p++ = (UChar )((code >> 24) & 0xff);
796 if ((code & 0xff0000) != 0 || p != buf) {
797 *p++ = (UChar )((code >> 16) & 0xff);
799 if ((code & 0xff00) != 0 || p != buf) {
800 *p++ = (UChar )((code >> 8) & 0xff);
802 *p++ = (UChar )(code & 0xff);
804 #if 1
805 if (enclen(enc, buf, p) != (p - buf))
806 return ONIGERR_INVALID_CODE_POINT_VALUE;
807 #endif
808 return (int )(p - buf);
811 extern int
812 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, const UChar* p, const UChar* end)
814 static const PosixBracketEntryType PBS[] = {
815 POSIX_BRACKET_ENTRY_INIT("Alnum", ONIGENC_CTYPE_ALNUM),
816 POSIX_BRACKET_ENTRY_INIT("Alpha", ONIGENC_CTYPE_ALPHA),
817 POSIX_BRACKET_ENTRY_INIT("Blank", ONIGENC_CTYPE_BLANK),
818 POSIX_BRACKET_ENTRY_INIT("Cntrl", ONIGENC_CTYPE_CNTRL),
819 POSIX_BRACKET_ENTRY_INIT("Digit", ONIGENC_CTYPE_DIGIT),
820 POSIX_BRACKET_ENTRY_INIT("Graph", ONIGENC_CTYPE_GRAPH),
821 POSIX_BRACKET_ENTRY_INIT("Lower", ONIGENC_CTYPE_LOWER),
822 POSIX_BRACKET_ENTRY_INIT("Print", ONIGENC_CTYPE_PRINT),
823 POSIX_BRACKET_ENTRY_INIT("Punct", ONIGENC_CTYPE_PUNCT),
824 POSIX_BRACKET_ENTRY_INIT("Space", ONIGENC_CTYPE_SPACE),
825 POSIX_BRACKET_ENTRY_INIT("Upper", ONIGENC_CTYPE_UPPER),
826 POSIX_BRACKET_ENTRY_INIT("XDigit", ONIGENC_CTYPE_XDIGIT),
827 POSIX_BRACKET_ENTRY_INIT("ASCII", ONIGENC_CTYPE_ASCII),
828 POSIX_BRACKET_ENTRY_INIT("Word", ONIGENC_CTYPE_WORD),
831 const PosixBracketEntryType *pb;
832 int len;
834 len = onigenc_strlen(enc, p, end);
835 for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
836 if (len == pb->len &&
837 onigenc_with_ascii_strnicmp(enc, p, end, pb->name, pb->len) == 0)
838 return pb->ctype;
841 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
844 extern int
845 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
846 unsigned int ctype)
848 if (code < 128)
849 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
850 else {
851 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
852 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
856 return FALSE;
859 extern int
860 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
861 unsigned int ctype)
863 if (code < 128)
864 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
865 else {
866 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
867 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
871 return FALSE;
874 extern int
875 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
876 const UChar* sascii /* ascii */, int n)
878 int x, c;
880 while (n-- > 0) {
881 if (p >= end) return (int )(*sascii);
883 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
884 x = *sascii - c;
885 if (x) return x;
887 sascii++;
888 p += enclen(enc, p, end);
890 return 0;
893 extern int
894 onigenc_with_ascii_strnicmp(OnigEncoding enc, const UChar* p, const UChar* end,
895 const UChar* sascii /* ascii */, int n)
897 int x, c;
899 while (n-- > 0) {
900 if (p >= end) return (int )(*sascii);
902 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
903 if (ONIGENC_IS_ASCII_CODE(c))
904 c = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
905 x = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*sascii) - c;
906 if (x) return x;
908 sascii++;
909 p += enclen(enc, p, end);
911 return 0;
914 #if 0
915 /* Property management */
/* Disabled code (enclosed in #if 0).  Allocates (when *plist is NULL) or
 * reallocates the pointer array *plist to hold new_size entries.  On
 * success stores the array in *plist and new_size in *psize and returns
 * 0; on allocation failure returns ONIGERR_MEMORY and leaves *plist and
 * *psize untouched. */
916 static int
917 resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
919 size_t size;
920 const OnigCodePoint **list = *plist;
922 size = sizeof(OnigCodePoint*) * new_size;
923 if (IS_NULL(list)) {
924 list = (const OnigCodePoint** )xmalloc(size);
925 if (IS_NULL(list)) return ONIGERR_MEMORY;
927 else {
928 const OnigCodePoint **tmp;
/* keep the old pointer until the reallocation is known to have succeeded */
929 tmp = (const OnigCodePoint** )xrealloc((void* )list, size);
930 if (IS_NULL(tmp)) return ONIGERR_MEMORY;
931 list = tmp;
934 *plist = list;
935 *psize = new_size;
937 return 0;
/* Disabled code (enclosed in #if 0).  Appends `prop` to the list *plist,
 * growing it when full (initial capacity PROP_INIT_SIZE, doubled
 * afterwards), creates *table on first use, increments *pnum and inserts
 * `name` into the table with the value *pnum + ONIGENC_MAX_STD_CTYPE.
 * Returns 0 on success, or the error from the resize / ONIGERR_MEMORY
 * when the table cannot be created. */
940 extern int
941 onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
942 hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
943 int *psize)
945 #define PROP_INIT_SIZE 16
947 int r;
949 if (*psize <= *pnum) {
950 int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
951 r = resize_property_list(new_size, plist, psize);
952 if (r != 0) return r;
955 (*plist)[*pnum] = prop;
957 if (ONIG_IS_NULL(*table)) {
958 *table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
959 if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
/* *pnum is incremented before the insert, so the stored value uses the new count */
962 *pnum = *pnum + 1;
963 onig_st_insert_strend(*table, name, name + strlen((char* )name),
964 (hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
965 return 0;
967 #endif
969 extern int
970 onigenc_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end,
971 OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc)
973 OnigCodePoint code;
974 OnigUChar *to_start = to;
975 OnigCaseFoldType flags = *flagP;
976 int codepoint_length;
978 while (*pp < end && to < to_end) {
979 codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
980 if (codepoint_length < 0)
981 return codepoint_length; /* encoding invalid */
982 code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
983 *pp += codepoint_length;
985 if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) {
986 flags |= ONIGENC_CASE_MODIFIED;
987 code += 'A' - 'a';
988 } else if (code >= 'A' && code <= 'Z' &&
989 (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
990 flags |= ONIGENC_CASE_MODIFIED;
991 code += 'a' - 'A';
993 to += ONIGENC_CODE_TO_MBC(enc, code, to);
994 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
995 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
997 *flagP = flags;
998 return (int )(to - to_start);
1001 extern int
1002 onigenc_single_byte_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp,
1003 const OnigUChar* end, OnigUChar* to, OnigUChar* to_end,
1004 const struct OnigEncodingTypeST* enc)
1006 OnigCodePoint code;
1007 OnigUChar *to_start = to;
1008 OnigCaseFoldType flags = *flagP;
1010 while (*pp < end && to < to_end) {
1011 code = *(*pp)++;
1013 if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) {
1014 flags |= ONIGENC_CASE_MODIFIED;
1015 code += 'A' - 'a';
1016 } else if (code >= 'A' && code <= 'Z' &&
1017 (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
1018 flags |= ONIGENC_CASE_MODIFIED;
1019 code += 'a' - 'A';
1021 *to++ = code;
1022 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
1023 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
1025 *flagP = flags;
1026 return (int )(to - to_start);