[ruby/etc] bump up to 1.3.1
[ruby-80x24.org.git] / regcomp.c
blobd51163103ea6cbc8fcce1a3bc0aa812a2e6b5488
1 /**********************************************************************
2 regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
31 #include "regparse.h"
33 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
35 extern OnigCaseFoldType
36 onig_get_default_case_fold_flag(void)
38 return OnigDefaultCaseFoldFlag;
41 extern int
42 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
44 OnigDefaultCaseFoldFlag = case_fold_flag;
45 return 0;
49 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
50 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
51 #endif
53 #if 0
54 static UChar*
55 str_dup(UChar* s, UChar* end)
57 ptrdiff_t len = end - s;
59 if (len > 0) {
60 UChar* r = (UChar* )xmalloc(len + 1);
61 CHECK_NULL_RETURN(r);
62 xmemcpy(r, s, len);
63 r[len] = (UChar )0;
64 return r;
66 else return NULL;
68 #endif
70 static void
71 swap_node(Node* a, Node* b)
73 Node c;
74 c = *a; *a = *b; *b = c;
76 if (NTYPE(a) == NT_STR) {
77 StrNode* sn = NSTR(a);
78 if (sn->capa == 0) {
79 size_t len = sn->end - sn->s;
80 sn->s = sn->buf;
81 sn->end = sn->s + len;
85 if (NTYPE(b) == NT_STR) {
86 StrNode* sn = NSTR(b);
87 if (sn->capa == 0) {
88 size_t len = sn->end - sn->s;
89 sn->s = sn->buf;
90 sn->end = sn->s + len;
95 static OnigDistance
96 distance_add(OnigDistance d1, OnigDistance d2)
98 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
99 return ONIG_INFINITE_DISTANCE;
100 else {
101 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
102 else return ONIG_INFINITE_DISTANCE;
106 static OnigDistance
107 distance_multiply(OnigDistance d, int m)
109 if (m == 0) return 0;
111 if (d < ONIG_INFINITE_DISTANCE / m)
112 return d * m;
113 else
114 return ONIG_INFINITE_DISTANCE;
117 static int
118 bitset_is_empty(BitSetRef bs)
120 int i;
121 for (i = 0; i < BITSET_SIZE; i++) {
122 if (bs[i] != 0) return 0;
124 return 1;
127 #ifdef ONIG_DEBUG
128 static int
129 bitset_on_num(BitSetRef bs)
131 int i, n;
133 n = 0;
134 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
135 if (BITSET_AT(bs, i)) n++;
137 return n;
139 #endif
141 // Attempt to right size allocated buffers for a regex post compile
142 static void
143 onig_reg_resize(regex_t *reg)
145 resize:
146 if (reg->alloc > reg->used) {
147 unsigned char *new_ptr = xrealloc(reg->p, reg->used);
148 // Skip the right size optimization if memory allocation fails
149 if (new_ptr) {
150 reg->alloc = reg->used;
151 reg->p = new_ptr;
154 if (reg->chain) {
155 reg = reg->chain;
156 goto resize;
160 extern int
161 onig_bbuf_init(BBuf* buf, OnigDistance size)
163 if (size <= 0) {
164 size = 0;
165 buf->p = NULL;
167 else {
168 buf->p = (UChar* )xmalloc(size);
169 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
172 buf->alloc = (unsigned int )size;
173 buf->used = 0;
174 return 0;
178 #ifdef USE_SUBEXP_CALL
180 static int
181 unset_addr_list_init(UnsetAddrList* uslist, int size)
183 UnsetAddr* p;
185 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
186 CHECK_NULL_RETURN_MEMERR(p);
187 uslist->num = 0;
188 uslist->alloc = size;
189 uslist->us = p;
190 return 0;
193 static void
194 unset_addr_list_end(UnsetAddrList* uslist)
196 if (IS_NOT_NULL(uslist->us))
197 xfree(uslist->us);
200 static int
201 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
203 UnsetAddr* p;
204 int size;
206 if (uslist->num >= uslist->alloc) {
207 size = uslist->alloc * 2;
208 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
209 CHECK_NULL_RETURN_MEMERR(p);
210 uslist->alloc = size;
211 uslist->us = p;
214 uslist->us[uslist->num].offset = offset;
215 uslist->us[uslist->num].target = node;
216 uslist->num++;
217 return 0;
219 #endif /* USE_SUBEXP_CALL */
222 static int
223 add_opcode(regex_t* reg, int opcode)
225 BBUF_ADD1(reg, opcode);
226 return 0;
229 #ifdef USE_COMBINATION_EXPLOSION_CHECK
230 static int
231 add_state_check_num(regex_t* reg, int num)
233 StateCheckNumType n = (StateCheckNumType )num;
235 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
236 return 0;
238 #endif
240 static int
241 add_rel_addr(regex_t* reg, int addr)
243 RelAddrType ra = (RelAddrType )addr;
245 BBUF_ADD(reg, &ra, SIZE_RELADDR);
246 return 0;
249 static int
250 add_abs_addr(regex_t* reg, int addr)
252 AbsAddrType ra = (AbsAddrType )addr;
254 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
255 return 0;
258 static int
259 add_length(regex_t* reg, OnigDistance len)
261 LengthType l = (LengthType )len;
263 BBUF_ADD(reg, &l, SIZE_LENGTH);
264 return 0;
267 static int
268 add_mem_num(regex_t* reg, int num)
270 MemNumType n = (MemNumType )num;
272 BBUF_ADD(reg, &n, SIZE_MEMNUM);
273 return 0;
276 #if 0
277 static int
278 add_pointer(regex_t* reg, void* addr)
280 PointerType ptr = (PointerType )addr;
282 BBUF_ADD(reg, &ptr, SIZE_POINTER);
283 return 0;
285 #endif
287 static int
288 add_option(regex_t* reg, OnigOptionType option)
290 BBUF_ADD(reg, &option, SIZE_OPTION);
291 return 0;
294 static int
295 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
297 int r;
299 r = add_opcode(reg, opcode);
300 if (r) return r;
301 r = add_rel_addr(reg, addr);
302 return r;
305 static int
306 add_bytes(regex_t* reg, UChar* bytes, OnigDistance len)
308 BBUF_ADD(reg, bytes, len);
309 return 0;
312 static int
313 add_bitset(regex_t* reg, BitSetRef bs)
315 BBUF_ADD(reg, bs, SIZE_BITSET);
316 return 0;
319 static int
320 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
322 int r;
324 r = add_opcode(reg, opcode);
325 if (r) return r;
326 r = add_option(reg, option);
327 return r;
330 static int compile_length_tree(Node* node, regex_t* reg);
331 static int compile_tree(Node* node, regex_t* reg);
334 #define IS_NEED_STR_LEN_OP_EXACT(op) \
335 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
336 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
338 static int
339 select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case)
341 int op;
342 OnigDistance str_len = (byte_len + mb_len - 1) / mb_len;
344 if (ignore_case) {
345 switch (str_len) {
346 case 1: op = OP_EXACT1_IC; break;
347 default: op = OP_EXACTN_IC; break;
350 else {
351 switch (mb_len) {
352 case 1:
353 switch (str_len) {
354 case 1: op = OP_EXACT1; break;
355 case 2: op = OP_EXACT2; break;
356 case 3: op = OP_EXACT3; break;
357 case 4: op = OP_EXACT4; break;
358 case 5: op = OP_EXACT5; break;
359 default: op = OP_EXACTN; break;
361 break;
363 case 2:
364 switch (str_len) {
365 case 1: op = OP_EXACTMB2N1; break;
366 case 2: op = OP_EXACTMB2N2; break;
367 case 3: op = OP_EXACTMB2N3; break;
368 default: op = OP_EXACTMB2N; break;
370 break;
372 case 3:
373 op = OP_EXACTMB3N;
374 break;
376 default:
377 op = OP_EXACTMBN;
378 break;
381 return op;
384 static int
385 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
387 int r;
388 int saved_num_null_check = reg->num_null_check;
390 if (empty_info != 0) {
391 r = add_opcode(reg, OP_NULL_CHECK_START);
392 if (r) return r;
393 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
394 if (r) return r;
395 reg->num_null_check++;
398 r = compile_tree(node, reg);
399 if (r) return r;
401 if (empty_info != 0) {
402 if (empty_info == NQ_TARGET_IS_EMPTY)
403 r = add_opcode(reg, OP_NULL_CHECK_END);
404 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
405 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
406 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
407 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
409 if (r) return r;
410 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
412 return r;
415 #ifdef USE_SUBEXP_CALL
416 static int
417 compile_call(CallNode* node, regex_t* reg)
419 int r;
421 r = add_opcode(reg, OP_CALL);
422 if (r) return r;
423 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
424 node->target);
425 if (r) return r;
426 r = add_abs_addr(reg, 0 /*dummy addr.*/);
427 return r;
429 #endif
431 static int
432 compile_tree_n_times(Node* node, int n, regex_t* reg)
434 int i, r;
436 for (i = 0; i < n; i++) {
437 r = compile_tree(node, reg);
438 if (r) return r;
440 return 0;
443 static int
444 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len,
445 regex_t* reg ARG_UNUSED, int ignore_case)
447 int len;
448 int op = select_str_opcode(mb_len, byte_len, ignore_case);
450 len = SIZE_OPCODE;
452 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
453 if (IS_NEED_STR_LEN_OP_EXACT(op))
454 len += SIZE_LENGTH;
456 len += (int )byte_len;
457 return len;
460 static int
461 add_compile_string(UChar* s, int mb_len, OnigDistance byte_len,
462 regex_t* reg, int ignore_case)
464 int op = select_str_opcode(mb_len, byte_len, ignore_case);
465 add_opcode(reg, op);
467 if (op == OP_EXACTMBN)
468 add_length(reg, mb_len);
470 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
471 if (op == OP_EXACTN_IC)
472 add_length(reg, byte_len);
473 else
474 add_length(reg, byte_len / mb_len);
477 add_bytes(reg, s, byte_len);
478 return 0;
482 static int
483 compile_length_string_node(Node* node, regex_t* reg)
485 int rlen, r, len, prev_len, blen, ambig;
486 OnigEncoding enc = reg->enc;
487 UChar *p, *prev;
488 StrNode* sn;
490 sn = NSTR(node);
491 if (sn->end <= sn->s)
492 return 0;
494 ambig = NSTRING_IS_AMBIG(node);
496 p = prev = sn->s;
497 prev_len = enclen(enc, p, sn->end);
498 p += prev_len;
499 blen = prev_len;
500 rlen = 0;
502 for (; p < sn->end; ) {
503 len = enclen(enc, p, sn->end);
504 if (len == prev_len || ambig) {
505 blen += len;
507 else {
508 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
509 rlen += r;
510 prev = p;
511 blen = len;
512 prev_len = len;
514 p += len;
516 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
517 rlen += r;
518 return rlen;
521 static int
522 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
524 if (sn->end <= sn->s)
525 return 0;
527 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
530 static int
531 compile_string_node(Node* node, regex_t* reg)
533 int r, len, prev_len, blen, ambig;
534 OnigEncoding enc = reg->enc;
535 UChar *p, *prev, *end;
536 StrNode* sn;
538 sn = NSTR(node);
539 if (sn->end <= sn->s)
540 return 0;
542 end = sn->end;
543 ambig = NSTRING_IS_AMBIG(node);
545 p = prev = sn->s;
546 prev_len = enclen(enc, p, end);
547 p += prev_len;
548 blen = prev_len;
550 for (; p < end; ) {
551 len = enclen(enc, p, end);
552 if (len == prev_len || ambig) {
553 blen += len;
555 else {
556 r = add_compile_string(prev, prev_len, blen, reg, ambig);
557 if (r) return r;
559 prev = p;
560 blen = len;
561 prev_len = len;
564 p += len;
566 return add_compile_string(prev, prev_len, blen, reg, ambig);
569 static int
570 compile_string_raw_node(StrNode* sn, regex_t* reg)
572 if (sn->end <= sn->s)
573 return 0;
575 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
578 static int
579 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
581 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
582 add_length(reg, mbuf->used);
583 return add_bytes(reg, mbuf->p, mbuf->used);
584 #else
585 int r, pad_size;
586 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
588 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
589 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
590 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
592 r = add_bytes(reg, mbuf->p, mbuf->used);
594 /* padding for return value from compile_length_cclass_node() to be fix. */
595 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
596 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
597 return r;
598 #endif
601 static int
602 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
604 int len;
606 if (IS_NULL(cc->mbuf)) {
607 len = SIZE_OPCODE + SIZE_BITSET;
609 else {
610 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
611 len = SIZE_OPCODE;
613 else {
614 len = SIZE_OPCODE + SIZE_BITSET;
616 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
617 len += SIZE_LENGTH + cc->mbuf->used;
618 #else
619 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
620 #endif
623 return len;
626 static int
627 compile_cclass_node(CClassNode* cc, regex_t* reg)
629 int r;
631 if (IS_NULL(cc->mbuf)) {
632 if (IS_NCCLASS_NOT(cc))
633 add_opcode(reg, OP_CCLASS_NOT);
634 else
635 add_opcode(reg, OP_CCLASS);
637 r = add_bitset(reg, cc->bs);
639 else {
640 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
641 if (IS_NCCLASS_NOT(cc))
642 add_opcode(reg, OP_CCLASS_MB_NOT);
643 else
644 add_opcode(reg, OP_CCLASS_MB);
646 r = add_multi_byte_cclass(cc->mbuf, reg);
648 else {
649 if (IS_NCCLASS_NOT(cc))
650 add_opcode(reg, OP_CCLASS_MIX_NOT);
651 else
652 add_opcode(reg, OP_CCLASS_MIX);
654 r = add_bitset(reg, cc->bs);
655 if (r) return r;
656 r = add_multi_byte_cclass(cc->mbuf, reg);
660 return r;
663 static int
664 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
666 #define REPEAT_RANGE_ALLOC 4
668 OnigRepeatRange* p;
670 if (reg->repeat_range_alloc == 0) {
671 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
672 CHECK_NULL_RETURN_MEMERR(p);
673 reg->repeat_range = p;
674 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
676 else if (reg->repeat_range_alloc <= id) {
677 int n;
678 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
679 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
680 sizeof(OnigRepeatRange) * n);
681 CHECK_NULL_RETURN_MEMERR(p);
682 reg->repeat_range = p;
683 reg->repeat_range_alloc = n;
685 else {
686 p = reg->repeat_range;
689 p[id].lower = lower;
690 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
691 return 0;
694 static int
695 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
696 regex_t* reg)
698 int r;
699 int num_repeat = reg->num_repeat;
701 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
702 if (r) return r;
703 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
704 reg->num_repeat++;
705 if (r) return r;
706 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
707 if (r) return r;
709 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
710 if (r) return r;
712 r = compile_tree_empty_check(qn->target, reg, empty_info);
713 if (r) return r;
715 if (
716 #ifdef USE_SUBEXP_CALL
717 reg->num_call > 0 ||
718 #endif
719 IS_QUANTIFIER_IN_REPEAT(qn)) {
720 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
722 else {
723 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
725 if (r) return r;
726 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
727 return r;
730 static int
731 is_anychar_star_quantifier(QtfrNode* qn)
733 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
734 NTYPE(qn->target) == NT_CANY)
735 return 1;
736 else
737 return 0;
740 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
741 #define CKN_ON (ckn > 0)
743 #ifdef USE_COMBINATION_EXPLOSION_CHECK
745 static int
746 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
748 int len, mod_tlen, cklen;
749 int ckn;
750 int infinite = IS_REPEAT_INFINITE(qn->upper);
751 int empty_info = qn->target_empty_info;
752 int tlen = compile_length_tree(qn->target, reg);
754 if (tlen < 0) return tlen;
756 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
758 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
760 /* anychar repeat */
761 if (NTYPE(qn->target) == NT_CANY) {
762 if (qn->greedy && infinite) {
763 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
764 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
765 else
766 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
770 if (empty_info != 0)
771 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
772 else
773 mod_tlen = tlen;
775 if (infinite && qn->lower <= 1) {
776 if (qn->greedy) {
777 if (qn->lower == 1)
778 len = SIZE_OP_JUMP;
779 else
780 len = 0;
782 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
784 else {
785 if (qn->lower == 0)
786 len = SIZE_OP_JUMP;
787 else
788 len = 0;
790 len += mod_tlen + SIZE_OP_PUSH + cklen;
793 else if (qn->upper == 0) {
794 if (qn->is_referred != 0) /* /(?<n>..){0}/ */
795 len = SIZE_OP_JUMP + tlen;
796 else
797 len = 0;
799 else if (qn->upper == 1 && qn->greedy) {
800 if (qn->lower == 0) {
801 if (CKN_ON) {
802 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
804 else {
805 len = SIZE_OP_PUSH + tlen;
808 else {
809 len = tlen;
812 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
813 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
815 else {
816 len = SIZE_OP_REPEAT_INC
817 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
818 if (CKN_ON)
819 len += SIZE_OP_STATE_CHECK;
822 return len;
825 static int
826 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
828 int r, mod_tlen;
829 int ckn;
830 int infinite = IS_REPEAT_INFINITE(qn->upper);
831 int empty_info = qn->target_empty_info;
832 int tlen = compile_length_tree(qn->target, reg);
834 if (tlen < 0) return tlen;
836 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
838 if (is_anychar_star_quantifier(qn)) {
839 r = compile_tree_n_times(qn->target, qn->lower, reg);
840 if (r) return r;
841 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
842 if (IS_MULTILINE(reg->options))
843 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
844 else
845 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
846 if (r) return r;
847 if (CKN_ON) {
848 r = add_state_check_num(reg, ckn);
849 if (r) return r;
852 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
854 else {
855 if (IS_MULTILINE(reg->options)) {
856 r = add_opcode(reg, (CKN_ON ?
857 OP_STATE_CHECK_ANYCHAR_ML_STAR
858 : OP_ANYCHAR_ML_STAR));
860 else {
861 r = add_opcode(reg, (CKN_ON ?
862 OP_STATE_CHECK_ANYCHAR_STAR
863 : OP_ANYCHAR_STAR));
865 if (r) return r;
866 if (CKN_ON)
867 r = add_state_check_num(reg, ckn);
869 return r;
873 if (empty_info != 0)
874 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
875 else
876 mod_tlen = tlen;
878 if (infinite && qn->lower <= 1) {
879 if (qn->greedy) {
880 if (qn->lower == 1) {
881 r = add_opcode_rel_addr(reg, OP_JUMP,
882 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
883 if (r) return r;
886 if (CKN_ON) {
887 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
888 if (r) return r;
889 r = add_state_check_num(reg, ckn);
890 if (r) return r;
891 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
893 else {
894 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
896 if (r) return r;
897 r = compile_tree_empty_check(qn->target, reg, empty_info);
898 if (r) return r;
899 r = add_opcode_rel_addr(reg, OP_JUMP,
900 -(mod_tlen + (int )SIZE_OP_JUMP
901 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
903 else {
904 if (qn->lower == 0) {
905 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
906 if (r) return r;
908 r = compile_tree_empty_check(qn->target, reg, empty_info);
909 if (r) return r;
910 if (CKN_ON) {
911 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
912 if (r) return r;
913 r = add_state_check_num(reg, ckn);
914 if (r) return r;
915 r = add_rel_addr(reg,
916 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
918 else
919 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
922 else if (qn->upper == 0) {
923 if (qn->is_referred != 0) { /* /(?<n>..){0}/ */
924 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
925 if (r) return r;
926 r = compile_tree(qn->target, reg);
928 else
929 r = 0;
931 else if (qn->upper == 1 && qn->greedy) {
932 if (qn->lower == 0) {
933 if (CKN_ON) {
934 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
935 if (r) return r;
936 r = add_state_check_num(reg, ckn);
937 if (r) return r;
938 r = add_rel_addr(reg, tlen);
940 else {
941 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
943 if (r) return r;
946 r = compile_tree(qn->target, reg);
948 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
949 if (CKN_ON) {
950 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
951 if (r) return r;
952 r = add_state_check_num(reg, ckn);
953 if (r) return r;
954 r = add_rel_addr(reg, SIZE_OP_JUMP);
956 else {
957 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
960 if (r) return r;
961 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
962 if (r) return r;
963 r = compile_tree(qn->target, reg);
965 else {
966 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
967 if (CKN_ON) {
968 if (r) return r;
969 r = add_opcode(reg, OP_STATE_CHECK);
970 if (r) return r;
971 r = add_state_check_num(reg, ckn);
974 return r;
977 #else /* USE_COMBINATION_EXPLOSION_CHECK */
979 static int
980 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
982 int len, mod_tlen;
983 int infinite = IS_REPEAT_INFINITE(qn->upper);
984 int empty_info = qn->target_empty_info;
985 int tlen = compile_length_tree(qn->target, reg);
987 if (tlen < 0) return tlen;
989 /* anychar repeat */
990 if (NTYPE(qn->target) == NT_CANY) {
991 if (qn->greedy && infinite) {
992 if (IS_NOT_NULL(qn->next_head_exact))
993 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
994 else
995 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
999 if (empty_info != 0)
1000 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1001 else
1002 mod_tlen = tlen;
1004 if (infinite &&
1005 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1006 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1007 len = SIZE_OP_JUMP;
1009 else {
1010 len = tlen * qn->lower;
1013 if (qn->greedy) {
1014 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1015 if (IS_NOT_NULL(qn->head_exact))
1016 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1017 else
1018 #endif
1019 if (IS_NOT_NULL(qn->next_head_exact))
1020 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1021 else
1022 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1024 else
1025 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1027 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1028 len = SIZE_OP_JUMP + tlen;
1030 else if (!infinite && qn->greedy &&
1031 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1032 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1033 len = tlen * qn->lower;
1034 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1036 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1037 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1039 else {
1040 len = SIZE_OP_REPEAT_INC
1041 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1044 return len;
1047 static int
1048 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1050 int i, r, mod_tlen;
1051 int infinite = IS_REPEAT_INFINITE(qn->upper);
1052 int empty_info = qn->target_empty_info;
1053 int tlen = compile_length_tree(qn->target, reg);
1055 if (tlen < 0) return tlen;
1057 if (is_anychar_star_quantifier(qn)) {
1058 r = compile_tree_n_times(qn->target, qn->lower, reg);
1059 if (r) return r;
1060 if (IS_NOT_NULL(qn->next_head_exact)) {
1061 if (IS_MULTILINE(reg->options))
1062 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1063 else
1064 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1065 if (r) return r;
1066 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1068 else {
1069 if (IS_MULTILINE(reg->options))
1070 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1071 else
1072 return add_opcode(reg, OP_ANYCHAR_STAR);
1076 if (empty_info != 0)
1077 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1078 else
1079 mod_tlen = tlen;
1081 if (infinite &&
1082 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1083 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1084 if (qn->greedy) {
1085 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1086 if (IS_NOT_NULL(qn->head_exact))
1087 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1088 else
1089 #endif
1090 if (IS_NOT_NULL(qn->next_head_exact))
1091 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1092 else
1093 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1095 else {
1096 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1098 if (r) return r;
1100 else {
1101 r = compile_tree_n_times(qn->target, qn->lower, reg);
1102 if (r) return r;
1105 if (qn->greedy) {
1106 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1107 if (IS_NOT_NULL(qn->head_exact)) {
1108 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1109 mod_tlen + SIZE_OP_JUMP);
1110 if (r) return r;
1111 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1112 r = compile_tree_empty_check(qn->target, reg, empty_info);
1113 if (r) return r;
1114 r = add_opcode_rel_addr(reg, OP_JUMP,
1115 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1117 else
1118 #endif
1119 if (IS_NOT_NULL(qn->next_head_exact)) {
1120 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1121 mod_tlen + SIZE_OP_JUMP);
1122 if (r) return r;
1123 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1124 r = compile_tree_empty_check(qn->target, reg, empty_info);
1125 if (r) return r;
1126 r = add_opcode_rel_addr(reg, OP_JUMP,
1127 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1129 else {
1130 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1131 if (r) return r;
1132 r = compile_tree_empty_check(qn->target, reg, empty_info);
1133 if (r) return r;
1134 r = add_opcode_rel_addr(reg, OP_JUMP,
1135 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1138 else {
1139 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1140 if (r) return r;
1141 r = compile_tree_empty_check(qn->target, reg, empty_info);
1142 if (r) return r;
1143 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1146 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1147 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1148 if (r) return r;
1149 r = compile_tree(qn->target, reg);
1151 else if (!infinite && qn->greedy &&
1152 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1153 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1154 int n = qn->upper - qn->lower;
1156 r = compile_tree_n_times(qn->target, qn->lower, reg);
1157 if (r) return r;
1159 for (i = 0; i < n; i++) {
1160 r = add_opcode_rel_addr(reg, OP_PUSH,
1161 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1162 if (r) return r;
1163 r = compile_tree(qn->target, reg);
1164 if (r) return r;
1167 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1168 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1169 if (r) return r;
1170 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1171 if (r) return r;
1172 r = compile_tree(qn->target, reg);
1174 else {
1175 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1177 return r;
1179 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1181 static int
1182 compile_length_option_node(EncloseNode* node, regex_t* reg)
1184 int tlen;
1185 OnigOptionType prev = reg->options;
1187 reg->options = node->option;
1188 tlen = compile_length_tree(node->target, reg);
1189 reg->options = prev;
1191 if (tlen < 0) return tlen;
1193 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1194 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1195 + tlen + SIZE_OP_SET_OPTION;
1197 else
1198 return tlen;
1201 static int
1202 compile_option_node(EncloseNode* node, regex_t* reg)
1204 int r;
1205 OnigOptionType prev = reg->options;
1207 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1208 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1209 if (r) return r;
1210 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1211 if (r) return r;
1212 r = add_opcode(reg, OP_FAIL);
1213 if (r) return r;
1216 reg->options = node->option;
1217 r = compile_tree(node->target, reg);
1218 reg->options = prev;
1220 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1221 if (r) return r;
1222 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1224 return r;
1227 static int
1228 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1230 int len;
1231 int tlen;
1233 if (node->type == ENCLOSE_OPTION)
1234 return compile_length_option_node(node, reg);
1236 if (node->target) {
1237 tlen = compile_length_tree(node->target, reg);
1238 if (tlen < 0) return tlen;
1240 else
1241 tlen = 0;
1243 switch (node->type) {
1244 case ENCLOSE_MEMORY:
1245 #ifdef USE_SUBEXP_CALL
1246 if (IS_ENCLOSE_CALLED(node)) {
1247 len = SIZE_OP_MEMORY_START_PUSH + tlen
1248 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1249 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1250 len += (IS_ENCLOSE_RECURSION(node)
1251 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1252 else
1253 len += (IS_ENCLOSE_RECURSION(node)
1254 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1256 else if (IS_ENCLOSE_RECURSION(node)) {
1257 len = SIZE_OP_MEMORY_START_PUSH;
1258 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1259 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1261 else
1262 #endif
1264 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1265 len = SIZE_OP_MEMORY_START_PUSH;
1266 else
1267 len = SIZE_OP_MEMORY_START;
1269 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1270 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1272 break;
1274 case ENCLOSE_STOP_BACKTRACK:
1275 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1276 QtfrNode* qn = NQTFR(node->target);
1277 tlen = compile_length_tree(qn->target, reg);
1278 if (tlen < 0) return tlen;
1280 len = tlen * qn->lower
1281 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1283 else {
1284 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1286 break;
1288 case ENCLOSE_CONDITION:
1289 len = SIZE_OP_CONDITION;
1290 if (NTYPE(node->target) == NT_ALT) {
1291 Node* x = node->target;
1293 tlen = compile_length_tree(NCAR(x), reg); /* yes-node */
1294 if (tlen < 0) return tlen;
1295 len += tlen + SIZE_OP_JUMP;
1296 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1297 x = NCDR(x);
1298 tlen = compile_length_tree(NCAR(x), reg); /* no-node */
1299 if (tlen < 0) return tlen;
1300 len += tlen;
1301 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1303 else {
1304 return ONIGERR_PARSER_BUG;
1306 break;
1308 case ENCLOSE_ABSENT:
1309 len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END;
1310 break;
1312 default:
1313 return ONIGERR_TYPE_BUG;
1314 break;
1317 return len;
1320 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1322 static int
1323 compile_enclose_node(EncloseNode* node, regex_t* reg)
1325 int r, len;
1327 if (node->type == ENCLOSE_OPTION)
1328 return compile_option_node(node, reg);
1330 switch (node->type) {
1331 case ENCLOSE_MEMORY:
1332 #ifdef USE_SUBEXP_CALL
1333 if (IS_ENCLOSE_CALLED(node)) {
1334 r = add_opcode(reg, OP_CALL);
1335 if (r) return r;
1336 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1337 node->state |= NST_ADDR_FIXED;
1338 r = add_abs_addr(reg, (int )node->call_addr);
1339 if (r) return r;
1340 len = compile_length_tree(node->target, reg);
1341 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1342 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1343 len += (IS_ENCLOSE_RECURSION(node)
1344 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1345 else
1346 len += (IS_ENCLOSE_RECURSION(node)
1347 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1349 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1350 if (r) return r;
1352 #endif
1353 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1354 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1355 else
1356 r = add_opcode(reg, OP_MEMORY_START);
1357 if (r) return r;
1358 r = add_mem_num(reg, node->regnum);
1359 if (r) return r;
1360 r = compile_tree(node->target, reg);
1361 if (r) return r;
1362 #ifdef USE_SUBEXP_CALL
1363 if (IS_ENCLOSE_CALLED(node)) {
1364 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1365 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1366 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1367 else
1368 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1369 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1371 if (r) return r;
1372 r = add_mem_num(reg, node->regnum);
1373 if (r) return r;
1374 r = add_opcode(reg, OP_RETURN);
1376 else if (IS_ENCLOSE_RECURSION(node)) {
1377 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1378 r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
1379 else
1380 r = add_opcode(reg, OP_MEMORY_END_REC);
1381 if (r) return r;
1382 r = add_mem_num(reg, node->regnum);
1384 else
1385 #endif
1387 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1388 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1389 else
1390 r = add_opcode(reg, OP_MEMORY_END);
1391 if (r) return r;
1392 r = add_mem_num(reg, node->regnum);
1394 break;
1396 case ENCLOSE_STOP_BACKTRACK:
1397 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1398 QtfrNode* qn = NQTFR(node->target);
1399 r = compile_tree_n_times(qn->target, qn->lower, reg);
1400 if (r) return r;
1402 len = compile_length_tree(qn->target, reg);
1403 if (len < 0) return len;
1405 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1406 if (r) return r;
1407 r = compile_tree(qn->target, reg);
1408 if (r) return r;
1409 r = add_opcode(reg, OP_POP);
1410 if (r) return r;
1411 r = add_opcode_rel_addr(reg, OP_JUMP,
1412 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1414 else {
1415 r = add_opcode(reg, OP_PUSH_STOP_BT);
1416 if (r) return r;
1417 r = compile_tree(node->target, reg);
1418 if (r) return r;
1419 r = add_opcode(reg, OP_POP_STOP_BT);
1421 break;
1423 case ENCLOSE_CONDITION:
1424 r = add_opcode(reg, OP_CONDITION);
1425 if (r) return r;
1426 r = add_mem_num(reg, node->regnum);
1427 if (r) return r;
1429 if (NTYPE(node->target) == NT_ALT) {
1430 Node* x = node->target;
1431 int len2;
1433 len = compile_length_tree(NCAR(x), reg); /* yes-node */
1434 if (len < 0) return len;
1435 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1436 x = NCDR(x);
1437 len2 = compile_length_tree(NCAR(x), reg); /* no-node */
1438 if (len2 < 0) return len2;
1439 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1441 x = node->target;
1442 r = add_rel_addr(reg, len + SIZE_OP_JUMP);
1443 if (r) return r;
1444 r = compile_tree(NCAR(x), reg); /* yes-node */
1445 if (r) return r;
1446 r = add_opcode_rel_addr(reg, OP_JUMP, len2);
1447 if (r) return r;
1448 x = NCDR(x);
1449 r = compile_tree(NCAR(x), reg); /* no-node */
1451 else {
1452 return ONIGERR_PARSER_BUG;
1454 break;
1456 case ENCLOSE_ABSENT:
1457 len = compile_length_tree(node->target, reg);
1458 if (len < 0) return len;
1460 r = add_opcode(reg, OP_PUSH_ABSENT_POS);
1461 if (r) return r;
1462 r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END);
1463 if (r) return r;
1464 r = compile_tree(node->target, reg);
1465 if (r) return r;
1466 r = add_opcode(reg, OP_ABSENT_END);
1467 break;
1469 default:
1470 return ONIGERR_TYPE_BUG;
1471 break;
1474 return r;
1477 static int
1478 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1480 int len;
1481 int tlen = 0;
1483 if (node->target) {
1484 tlen = compile_length_tree(node->target, reg);
1485 if (tlen < 0) return tlen;
1488 switch (node->type) {
1489 case ANCHOR_PREC_READ:
1490 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1491 break;
1492 case ANCHOR_PREC_READ_NOT:
1493 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1494 break;
1495 case ANCHOR_LOOK_BEHIND:
1496 len = SIZE_OP_LOOK_BEHIND + tlen;
1497 break;
1498 case ANCHOR_LOOK_BEHIND_NOT:
1499 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1500 break;
1502 default:
1503 len = SIZE_OPCODE;
1504 break;
1507 return len;
1510 static int
1511 compile_anchor_node(AnchorNode* node, regex_t* reg)
1513 int r, len;
1515 switch (node->type) {
1516 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1517 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1518 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1519 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1520 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1521 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1523 case ANCHOR_WORD_BOUND:
1524 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND);
1525 else r = add_opcode(reg, OP_WORD_BOUND);
1526 break;
1527 case ANCHOR_NOT_WORD_BOUND:
1528 if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND);
1529 else r = add_opcode(reg, OP_NOT_WORD_BOUND);
1530 break;
1531 #ifdef USE_WORD_BEGIN_END
1532 case ANCHOR_WORD_BEGIN:
1533 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN);
1534 else r = add_opcode(reg, OP_WORD_BEGIN);
1535 break;
1536 case ANCHOR_WORD_END:
1537 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END);
1538 else r = add_opcode(reg, OP_WORD_END);
1539 break;
1540 #endif
1541 case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break;
1543 case ANCHOR_PREC_READ:
1544 r = add_opcode(reg, OP_PUSH_POS);
1545 if (r) return r;
1546 r = compile_tree(node->target, reg);
1547 if (r) return r;
1548 r = add_opcode(reg, OP_POP_POS);
1549 break;
1551 case ANCHOR_PREC_READ_NOT:
1552 len = compile_length_tree(node->target, reg);
1553 if (len < 0) return len;
1554 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1555 if (r) return r;
1556 r = compile_tree(node->target, reg);
1557 if (r) return r;
1558 r = add_opcode(reg, OP_FAIL_POS);
1559 break;
1561 case ANCHOR_LOOK_BEHIND:
1563 int n;
1564 r = add_opcode(reg, OP_LOOK_BEHIND);
1565 if (r) return r;
1566 if (node->char_len < 0) {
1567 r = get_char_length_tree(node->target, reg, &n);
1568 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1570 else
1571 n = node->char_len;
1572 r = add_length(reg, n);
1573 if (r) return r;
1574 r = compile_tree(node->target, reg);
1576 break;
1578 case ANCHOR_LOOK_BEHIND_NOT:
1580 int n;
1581 len = compile_length_tree(node->target, reg);
1582 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1583 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1584 if (r) return r;
1585 if (node->char_len < 0) {
1586 r = get_char_length_tree(node->target, reg, &n);
1587 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1589 else
1590 n = node->char_len;
1591 r = add_length(reg, n);
1592 if (r) return r;
1593 r = compile_tree(node->target, reg);
1594 if (r) return r;
1595 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1597 break;
1599 default:
1600 return ONIGERR_TYPE_BUG;
1601 break;
1604 return r;
1607 static int
1608 compile_length_tree(Node* node, regex_t* reg)
1610 int len, type, r;
1612 type = NTYPE(node);
1613 switch (type) {
1614 case NT_LIST:
1615 len = 0;
1616 do {
1617 r = compile_length_tree(NCAR(node), reg);
1618 if (r < 0) return r;
1619 len += r;
1620 } while (IS_NOT_NULL(node = NCDR(node)));
1621 r = len;
1622 break;
1624 case NT_ALT:
1626 int n = 0;
1627 len = 0;
1628 do {
1629 r = compile_length_tree(NCAR(node), reg);
1630 if (r < 0) return r;
1631 len += r;
1632 n++;
1633 } while (IS_NOT_NULL(node = NCDR(node)));
1634 r = len;
1635 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1637 break;
1639 case NT_STR:
1640 if (NSTRING_IS_RAW(node))
1641 r = compile_length_string_raw_node(NSTR(node), reg);
1642 else
1643 r = compile_length_string_node(node, reg);
1644 break;
1646 case NT_CCLASS:
1647 r = compile_length_cclass_node(NCCLASS(node), reg);
1648 break;
1650 case NT_CTYPE:
1651 case NT_CANY:
1652 r = SIZE_OPCODE;
1653 break;
1655 case NT_BREF:
1657 BRefNode* br = NBREF(node);
1659 #ifdef USE_BACKREF_WITH_LEVEL
1660 if (IS_BACKREF_NEST_LEVEL(br)) {
1661 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1662 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1664 else
1665 #endif
1666 if (br->back_num == 1) {
1667 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1668 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1670 else {
1671 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1674 break;
1676 #ifdef USE_SUBEXP_CALL
1677 case NT_CALL:
1678 r = SIZE_OP_CALL;
1679 break;
1680 #endif
1682 case NT_QTFR:
1683 r = compile_length_quantifier_node(NQTFR(node), reg);
1684 break;
1686 case NT_ENCLOSE:
1687 r = compile_length_enclose_node(NENCLOSE(node), reg);
1688 break;
1690 case NT_ANCHOR:
1691 r = compile_length_anchor_node(NANCHOR(node), reg);
1692 break;
1694 default:
1695 return ONIGERR_TYPE_BUG;
1696 break;
1699 return r;
1702 static int
1703 compile_tree(Node* node, regex_t* reg)
1705 int n, type, len, pos, r = 0;
1707 type = NTYPE(node);
1708 switch (type) {
1709 case NT_LIST:
1710 do {
1711 r = compile_tree(NCAR(node), reg);
1712 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1713 break;
1715 case NT_ALT:
1717 Node* x = node;
1718 len = 0;
1719 do {
1720 len += compile_length_tree(NCAR(x), reg);
1721 if (NCDR(x) != NULL) {
1722 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1724 } while (IS_NOT_NULL(x = NCDR(x)));
1725 pos = reg->used + len; /* goal position */
1727 do {
1728 len = compile_length_tree(NCAR(node), reg);
1729 if (IS_NOT_NULL(NCDR(node))) {
1730 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1731 if (r) break;
1733 r = compile_tree(NCAR(node), reg);
1734 if (r) break;
1735 if (IS_NOT_NULL(NCDR(node))) {
1736 len = pos - (reg->used + SIZE_OP_JUMP);
1737 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1738 if (r) break;
1740 } while (IS_NOT_NULL(node = NCDR(node)));
1742 break;
1744 case NT_STR:
1745 if (NSTRING_IS_RAW(node))
1746 r = compile_string_raw_node(NSTR(node), reg);
1747 else
1748 r = compile_string_node(node, reg);
1749 break;
1751 case NT_CCLASS:
1752 r = compile_cclass_node(NCCLASS(node), reg);
1753 break;
1755 case NT_CTYPE:
1757 int op;
1759 switch (NCTYPE(node)->ctype) {
1760 case ONIGENC_CTYPE_WORD:
1761 if (NCTYPE(node)->ascii_range != 0) {
1762 if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD;
1763 else op = OP_ASCII_WORD;
1765 else {
1766 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1767 else op = OP_WORD;
1769 break;
1770 default:
1771 return ONIGERR_TYPE_BUG;
1772 break;
1774 r = add_opcode(reg, op);
1776 break;
1778 case NT_CANY:
1779 if (IS_MULTILINE(reg->options))
1780 r = add_opcode(reg, OP_ANYCHAR_ML);
1781 else
1782 r = add_opcode(reg, OP_ANYCHAR);
1783 break;
1785 case NT_BREF:
1787 BRefNode* br = NBREF(node);
1789 #ifdef USE_BACKREF_WITH_LEVEL
1790 if (IS_BACKREF_NEST_LEVEL(br)) {
1791 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1792 if (r) return r;
1793 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1794 if (r) return r;
1795 r = add_length(reg, br->nest_level);
1796 if (r) return r;
1798 goto add_bacref_mems;
1800 else
1801 #endif
1802 if (br->back_num == 1) {
1803 n = br->back_static[0];
1804 if (IS_IGNORECASE(reg->options)) {
1805 r = add_opcode(reg, OP_BACKREFN_IC);
1806 if (r) return r;
1807 r = add_mem_num(reg, n);
1809 else {
1810 switch (n) {
1811 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1812 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1813 default:
1814 r = add_opcode(reg, OP_BACKREFN);
1815 if (r) return r;
1816 r = add_mem_num(reg, n);
1817 break;
1821 else {
1822 int i;
1823 int* p;
1825 if (IS_IGNORECASE(reg->options)) {
1826 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1828 else {
1829 r = add_opcode(reg, OP_BACKREF_MULTI);
1831 if (r) return r;
1833 #ifdef USE_BACKREF_WITH_LEVEL
1834 add_bacref_mems:
1835 #endif
1836 r = add_length(reg, br->back_num);
1837 if (r) return r;
1838 p = BACKREFS_P(br);
1839 for (i = br->back_num - 1; i >= 0; i--) {
1840 r = add_mem_num(reg, p[i]);
1841 if (r) return r;
1845 break;
1847 #ifdef USE_SUBEXP_CALL
1848 case NT_CALL:
1849 r = compile_call(NCALL(node), reg);
1850 break;
1851 #endif
1853 case NT_QTFR:
1854 r = compile_quantifier_node(NQTFR(node), reg);
1855 break;
1857 case NT_ENCLOSE:
1858 r = compile_enclose_node(NENCLOSE(node), reg);
1859 break;
1861 case NT_ANCHOR:
1862 r = compile_anchor_node(NANCHOR(node), reg);
1863 break;
1865 default:
1866 #ifdef ONIG_DEBUG
1867 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1868 #endif
1869 break;
1872 return r;
1875 #ifdef USE_NAMED_GROUP
1877 static int
1878 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1880 int r = 0;
1881 Node* node = *plink;
1883 switch (NTYPE(node)) {
1884 case NT_LIST:
1885 case NT_ALT:
1886 do {
1887 r = noname_disable_map(&(NCAR(node)), map, counter);
1888 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1889 break;
1891 case NT_QTFR:
1893 Node** ptarget = &(NQTFR(node)->target);
1894 Node* old = *ptarget;
1895 r = noname_disable_map(ptarget, map, counter);
1896 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1897 onig_reduce_nested_quantifier(node, *ptarget);
1900 break;
1902 case NT_ENCLOSE:
1904 EncloseNode* en = NENCLOSE(node);
1905 if (en->type == ENCLOSE_MEMORY) {
1906 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1907 (*counter)++;
1908 map[en->regnum].new_val = *counter;
1909 en->regnum = *counter;
1911 else if (en->regnum != 0) {
1912 *plink = en->target;
1913 en->target = NULL_NODE;
1914 onig_node_free(node);
1915 r = noname_disable_map(plink, map, counter);
1916 break;
1919 r = noname_disable_map(&(en->target), map, counter);
1921 break;
1923 case NT_ANCHOR:
1924 if (NANCHOR(node)->target)
1925 r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
1926 break;
1928 default:
1929 break;
1932 return r;
1935 static int
1936 renumber_node_backref(Node* node, GroupNumRemap* map, const int num_mem)
1938 int i, pos, n, old_num;
1939 int *backs;
1940 BRefNode* bn = NBREF(node);
1942 if (! IS_BACKREF_NAME_REF(bn))
1943 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1945 old_num = bn->back_num;
1946 if (IS_NULL(bn->back_dynamic))
1947 backs = bn->back_static;
1948 else
1949 backs = bn->back_dynamic;
1951 for (i = 0, pos = 0; i < old_num; i++) {
1952 if (backs[i] > num_mem) return ONIGERR_INVALID_BACKREF;
1953 n = map[backs[i]].new_val;
1954 if (n > 0) {
1955 backs[pos] = n;
1956 pos++;
1960 bn->back_num = pos;
1961 return 0;
1964 static int
1965 renumber_by_map(Node* node, GroupNumRemap* map, const int num_mem)
1967 int r = 0;
1969 switch (NTYPE(node)) {
1970 case NT_LIST:
1971 case NT_ALT:
1972 do {
1973 r = renumber_by_map(NCAR(node), map, num_mem);
1974 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1975 break;
1976 case NT_QTFR:
1977 r = renumber_by_map(NQTFR(node)->target, map, num_mem);
1978 break;
1979 case NT_ENCLOSE:
1981 EncloseNode* en = NENCLOSE(node);
1982 if (en->type == ENCLOSE_CONDITION) {
1983 if (en->regnum > num_mem) return ONIGERR_INVALID_BACKREF;
1984 en->regnum = map[en->regnum].new_val;
1986 r = renumber_by_map(en->target, map, num_mem);
1988 break;
1990 case NT_BREF:
1991 r = renumber_node_backref(node, map, num_mem);
1992 break;
1994 case NT_ANCHOR:
1995 if (NANCHOR(node)->target)
1996 r = renumber_by_map(NANCHOR(node)->target, map, num_mem);
1997 break;
1999 default:
2000 break;
2003 return r;
2006 static int
2007 numbered_ref_check(Node* node)
2009 int r = 0;
2011 switch (NTYPE(node)) {
2012 case NT_LIST:
2013 case NT_ALT:
2014 do {
2015 r = numbered_ref_check(NCAR(node));
2016 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2017 break;
2018 case NT_QTFR:
2019 r = numbered_ref_check(NQTFR(node)->target);
2020 break;
2021 case NT_ENCLOSE:
2022 r = numbered_ref_check(NENCLOSE(node)->target);
2023 break;
2025 case NT_BREF:
2026 if (! IS_BACKREF_NAME_REF(NBREF(node)))
2027 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2028 break;
2030 case NT_ANCHOR:
2031 if (NANCHOR(node)->target)
2032 r = numbered_ref_check(NANCHOR(node)->target);
2033 break;
2035 default:
2036 break;
2039 return r;
2042 static int
2043 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2045 int r, i, pos, counter;
2046 BitStatusType loc;
2047 GroupNumRemap* map;
2049 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
2050 CHECK_NULL_RETURN_MEMERR(map);
2051 for (i = 1; i <= env->num_mem; i++) {
2052 map[i].new_val = 0;
2054 counter = 0;
2055 r = noname_disable_map(root, map, &counter);
2056 if (r != 0) return r;
2058 r = renumber_by_map(*root, map, env->num_mem);
2059 if (r != 0) return r;
2061 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2062 if (map[i].new_val > 0) {
2063 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
2064 pos++;
2068 loc = env->capture_history;
2069 BIT_STATUS_CLEAR(env->capture_history);
2070 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2071 if (BIT_STATUS_AT(loc, i)) {
2072 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
2076 env->num_mem = env->num_named;
2077 reg->num_mem = env->num_named;
2079 return onig_renumber_name_table(reg, map);
2081 #endif /* USE_NAMED_GROUP */
2083 #ifdef USE_SUBEXP_CALL
2084 static int
2085 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
2087 int i, offset;
2088 EncloseNode* en;
2089 AbsAddrType addr;
2091 for (i = 0; i < uslist->num; i++) {
2092 en = NENCLOSE(uslist->us[i].target);
2093 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
2094 addr = en->call_addr;
2095 offset = uslist->us[i].offset;
2097 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
2099 return 0;
2101 #endif
2103 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
2104 static int
2105 quantifiers_memory_node_info(Node* node)
2107 int r = 0;
2109 switch (NTYPE(node)) {
2110 case NT_LIST:
2111 case NT_ALT:
2113 int v;
2114 do {
2115 v = quantifiers_memory_node_info(NCAR(node));
2116 if (v > r) r = v;
2117 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
2119 break;
2121 # ifdef USE_SUBEXP_CALL
2122 case NT_CALL:
2123 if (IS_CALL_RECURSION(NCALL(node))) {
2124 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2126 else
2127 r = quantifiers_memory_node_info(NCALL(node)->target);
2128 break;
2129 # endif
2131 case NT_QTFR:
2133 QtfrNode* qn = NQTFR(node);
2134 if (qn->upper != 0) {
2135 r = quantifiers_memory_node_info(qn->target);
2138 break;
2140 case NT_ENCLOSE:
2142 EncloseNode* en = NENCLOSE(node);
2143 switch (en->type) {
2144 case ENCLOSE_MEMORY:
2145 return NQ_TARGET_IS_EMPTY_MEM;
2146 break;
2148 case ENCLOSE_OPTION:
2149 case ENCLOSE_STOP_BACKTRACK:
2150 case ENCLOSE_CONDITION:
2151 case ENCLOSE_ABSENT:
2152 r = quantifiers_memory_node_info(en->target);
2153 break;
2154 default:
2155 break;
2158 break;
2160 case NT_BREF:
2161 case NT_STR:
2162 case NT_CTYPE:
2163 case NT_CCLASS:
2164 case NT_CANY:
2165 case NT_ANCHOR:
2166 default:
2167 break;
2170 return r;
2172 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2174 static int
2175 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2177 OnigDistance tmin;
2178 int r = 0;
2180 *min = 0;
2181 switch (NTYPE(node)) {
2182 case NT_BREF:
2184 int i;
2185 int* backs;
2186 Node** nodes = SCANENV_MEM_NODES(env);
2187 BRefNode* br = NBREF(node);
2188 if (br->state & NST_RECURSION) break;
2190 backs = BACKREFS_P(br);
2191 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2192 r = get_min_match_length(nodes[backs[0]], min, env);
2193 if (r != 0) break;
2194 for (i = 1; i < br->back_num; i++) {
2195 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2196 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2197 if (r != 0) break;
2198 if (*min > tmin) *min = tmin;
2201 break;
2203 #ifdef USE_SUBEXP_CALL
2204 case NT_CALL:
2205 if (IS_CALL_RECURSION(NCALL(node))) {
2206 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2207 if (IS_ENCLOSE_MIN_FIXED(en))
2208 *min = en->min_len;
2210 else
2211 r = get_min_match_length(NCALL(node)->target, min, env);
2212 break;
2213 #endif
2215 case NT_LIST:
2216 do {
2217 r = get_min_match_length(NCAR(node), &tmin, env);
2218 if (r == 0) *min += tmin;
2219 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2220 break;
2222 case NT_ALT:
2224 Node *x, *y;
2225 y = node;
2226 do {
2227 x = NCAR(y);
2228 r = get_min_match_length(x, &tmin, env);
2229 if (r != 0) break;
2230 if (y == node) *min = tmin;
2231 else if (*min > tmin) *min = tmin;
2232 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2234 break;
2236 case NT_STR:
2238 StrNode* sn = NSTR(node);
2239 *min = sn->end - sn->s;
2241 break;
2243 case NT_CTYPE:
2244 *min = 1;
2245 break;
2247 case NT_CCLASS:
2248 case NT_CANY:
2249 *min = 1;
2250 break;
2252 case NT_QTFR:
2254 QtfrNode* qn = NQTFR(node);
2256 if (qn->lower > 0) {
2257 r = get_min_match_length(qn->target, min, env);
2258 if (r == 0)
2259 *min = distance_multiply(*min, qn->lower);
2262 break;
2264 case NT_ENCLOSE:
2266 EncloseNode* en = NENCLOSE(node);
2267 switch (en->type) {
2268 case ENCLOSE_MEMORY:
2269 if (IS_ENCLOSE_MIN_FIXED(en))
2270 *min = en->min_len;
2271 else {
2272 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2273 *min = 0; /* recursive */
2274 else {
2275 SET_ENCLOSE_STATUS(node, NST_MARK1);
2276 r = get_min_match_length(en->target, min, env);
2277 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2278 if (r == 0) {
2279 en->min_len = *min;
2280 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2284 break;
2286 case ENCLOSE_OPTION:
2287 case ENCLOSE_STOP_BACKTRACK:
2288 case ENCLOSE_CONDITION:
2289 r = get_min_match_length(en->target, min, env);
2290 break;
2292 case ENCLOSE_ABSENT:
2293 break;
2296 break;
2298 case NT_ANCHOR:
2299 default:
2300 break;
2303 return r;
2306 static int
2307 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2309 OnigDistance tmax;
2310 int r = 0;
2312 *max = 0;
2313 switch (NTYPE(node)) {
2314 case NT_LIST:
2315 do {
2316 r = get_max_match_length(NCAR(node), &tmax, env);
2317 if (r == 0)
2318 *max = distance_add(*max, tmax);
2319 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2320 break;
2322 case NT_ALT:
2323 do {
2324 r = get_max_match_length(NCAR(node), &tmax, env);
2325 if (r == 0 && *max < tmax) *max = tmax;
2326 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2327 break;
2329 case NT_STR:
2331 StrNode* sn = NSTR(node);
2332 *max = sn->end - sn->s;
2334 break;
2336 case NT_CTYPE:
2337 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2338 break;
2340 case NT_CCLASS:
2341 case NT_CANY:
2342 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2343 break;
2345 case NT_BREF:
2347 int i;
2348 int* backs;
2349 Node** nodes = SCANENV_MEM_NODES(env);
2350 BRefNode* br = NBREF(node);
2351 if (br->state & NST_RECURSION) {
2352 *max = ONIG_INFINITE_DISTANCE;
2353 break;
2355 backs = BACKREFS_P(br);
2356 for (i = 0; i < br->back_num; i++) {
2357 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2358 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2359 if (r != 0) break;
2360 if (*max < tmax) *max = tmax;
2363 break;
2365 #ifdef USE_SUBEXP_CALL
2366 case NT_CALL:
2367 if (! IS_CALL_RECURSION(NCALL(node)))
2368 r = get_max_match_length(NCALL(node)->target, max, env);
2369 else
2370 *max = ONIG_INFINITE_DISTANCE;
2371 break;
2372 #endif
2374 case NT_QTFR:
2376 QtfrNode* qn = NQTFR(node);
2378 if (qn->upper != 0) {
2379 r = get_max_match_length(qn->target, max, env);
2380 if (r == 0 && *max != 0) {
2381 if (! IS_REPEAT_INFINITE(qn->upper))
2382 *max = distance_multiply(*max, qn->upper);
2383 else
2384 *max = ONIG_INFINITE_DISTANCE;
2388 break;
2390 case NT_ENCLOSE:
2392 EncloseNode* en = NENCLOSE(node);
2393 switch (en->type) {
2394 case ENCLOSE_MEMORY:
2395 if (IS_ENCLOSE_MAX_FIXED(en))
2396 *max = en->max_len;
2397 else {
2398 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2399 *max = ONIG_INFINITE_DISTANCE;
2400 else {
2401 SET_ENCLOSE_STATUS(node, NST_MARK1);
2402 r = get_max_match_length(en->target, max, env);
2403 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2404 if (r == 0) {
2405 en->max_len = *max;
2406 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2410 break;
2412 case ENCLOSE_OPTION:
2413 case ENCLOSE_STOP_BACKTRACK:
2414 case ENCLOSE_CONDITION:
2415 r = get_max_match_length(en->target, max, env);
2416 break;
2418 case ENCLOSE_ABSENT:
2419 break;
2422 break;
2424 case NT_ANCHOR:
2425 default:
2426 break;
2429 return r;
2432 #define GET_CHAR_LEN_VARLEN -1
2433 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2435 /* fixed size pattern node only */
2436 static int
2437 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2439 int tlen;
2440 int r = 0;
2442 level++;
2443 *len = 0;
2444 switch (NTYPE(node)) {
2445 case NT_LIST:
2446 do {
2447 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2448 if (r == 0)
2449 *len = (int )distance_add(*len, tlen);
2450 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2451 break;
2453 case NT_ALT:
2455 int tlen2;
2456 int varlen = 0;
2458 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2459 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2460 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2461 if (r == 0) {
2462 if (tlen != tlen2)
2463 varlen = 1;
2466 if (r == 0) {
2467 if (varlen != 0) {
2468 if (level == 1)
2469 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2470 else
2471 r = GET_CHAR_LEN_VARLEN;
2473 else
2474 *len = tlen;
2477 break;
2479 case NT_STR:
2481 StrNode* sn = NSTR(node);
2482 UChar *s = sn->s;
2483 while (s < sn->end) {
2484 s += enclen(reg->enc, s, sn->end);
2485 (*len)++;
2488 break;
2490 case NT_QTFR:
2492 QtfrNode* qn = NQTFR(node);
2493 if (qn->lower == qn->upper) {
2494 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2495 if (r == 0)
2496 *len = (int )distance_multiply(tlen, qn->lower);
2498 else
2499 r = GET_CHAR_LEN_VARLEN;
2501 break;
2503 #ifdef USE_SUBEXP_CALL
2504 case NT_CALL:
2505 if (! IS_CALL_RECURSION(NCALL(node)))
2506 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2507 else
2508 r = GET_CHAR_LEN_VARLEN;
2509 break;
2510 #endif
2512 case NT_CTYPE:
2513 *len = 1;
2514 break;
2516 case NT_CCLASS:
2517 case NT_CANY:
2518 *len = 1;
2519 break;
2521 case NT_ENCLOSE:
2523 EncloseNode* en = NENCLOSE(node);
2524 switch (en->type) {
2525 case ENCLOSE_MEMORY:
2526 #ifdef USE_SUBEXP_CALL
2527 if (IS_ENCLOSE_CLEN_FIXED(en))
2528 *len = en->char_len;
2529 else {
2530 r = get_char_length_tree1(en->target, reg, len, level);
2531 if (r == 0) {
2532 en->char_len = *len;
2533 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2536 break;
2537 #endif
2538 case ENCLOSE_OPTION:
2539 case ENCLOSE_STOP_BACKTRACK:
2540 case ENCLOSE_CONDITION:
2541 r = get_char_length_tree1(en->target, reg, len, level);
2542 break;
2543 case ENCLOSE_ABSENT:
2544 default:
2545 break;
2548 break;
2550 case NT_ANCHOR:
2551 break;
2553 default:
2554 r = GET_CHAR_LEN_VARLEN;
2555 break;
2558 return r;
2561 static int
2562 get_char_length_tree(Node* node, regex_t* reg, int* len)
2564 return get_char_length_tree1(node, reg, len, 0);
2567 /* x is not included y ==> 1 : 0 */
2568 static int
2569 is_not_included(Node* x, Node* y, regex_t* reg)
2571 int i;
2572 OnigDistance len;
2573 OnigCodePoint code;
2574 UChar *p;
2575 int ytype;
2577 retry:
2578 ytype = NTYPE(y);
2579 switch (NTYPE(x)) {
2580 case NT_CTYPE:
2582 switch (ytype) {
2583 case NT_CTYPE:
2584 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2585 NCTYPE(y)->not != NCTYPE(x)->not &&
2586 NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range)
2587 return 1;
2588 else
2589 return 0;
2590 break;
2592 case NT_CCLASS:
2593 swap:
2595 Node* tmp;
2596 tmp = x; x = y; y = tmp;
2597 goto retry;
2599 break;
2601 case NT_STR:
2602 goto swap;
2603 break;
2605 default:
2606 break;
2609 break;
2611 case NT_CCLASS:
2613 CClassNode* xc = NCCLASS(x);
2614 switch (ytype) {
2615 case NT_CTYPE:
2616 switch (NCTYPE(y)->ctype) {
2617 case ONIGENC_CTYPE_WORD:
2618 if (NCTYPE(y)->not == 0) {
2619 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2620 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2621 if (BITSET_AT(xc->bs, i)) {
2622 if (NCTYPE(y)->ascii_range) {
2623 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2625 else {
2626 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2630 return 1;
2632 return 0;
2634 else {
2635 if (IS_NOT_NULL(xc->mbuf)) return 0;
2636 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2637 int is_word;
2638 if (NCTYPE(y)->ascii_range)
2639 is_word = IS_CODE_SB_WORD(reg->enc, i);
2640 else
2641 is_word = ONIGENC_IS_CODE_WORD(reg->enc, i);
2642 if (! is_word) {
2643 if (!IS_NCCLASS_NOT(xc)) {
2644 if (BITSET_AT(xc->bs, i))
2645 return 0;
2647 else {
2648 if (! BITSET_AT(xc->bs, i))
2649 return 0;
2653 return 1;
2655 break;
2657 default:
2658 break;
2660 break;
2662 case NT_CCLASS:
2664 int v;
2665 CClassNode* yc = NCCLASS(y);
2667 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2668 v = BITSET_AT(xc->bs, i);
2669 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2670 (v == 0 && IS_NCCLASS_NOT(xc))) {
2671 v = BITSET_AT(yc->bs, i);
2672 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2673 (v == 0 && IS_NCCLASS_NOT(yc)))
2674 return 0;
2677 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2678 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2679 return 1;
2680 return 0;
2682 break;
2684 case NT_STR:
2685 goto swap;
2686 break;
2688 default:
2689 break;
2692 break;
2694 case NT_STR:
2696 StrNode* xs = NSTR(x);
2697 if (NSTRING_LEN(x) == 0)
2698 break;
2700 switch (ytype) {
2701 case NT_CTYPE:
2702 switch (NCTYPE(y)->ctype) {
2703 case ONIGENC_CTYPE_WORD:
2704 if (NCTYPE(y)->ascii_range) {
2705 if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end))
2706 return NCTYPE(y)->not;
2707 else
2708 return !(NCTYPE(y)->not);
2710 else {
2711 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2712 return NCTYPE(y)->not;
2713 else
2714 return !(NCTYPE(y)->not);
2716 break;
2717 default:
2718 break;
2720 break;
2722 case NT_CCLASS:
2724 CClassNode* cc = NCCLASS(y);
2726 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2727 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2728 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2730 break;
2732 case NT_STR:
2734 UChar *q;
2735 StrNode* ys = NSTR(y);
2736 len = NSTRING_LEN(x);
2737 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2738 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2739 /* tiny version */
2740 return 0;
2742 else {
2743 for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) {
2744 if (*p != *q) return 1;
2748 break;
2750 default:
2751 break;
2754 break;
2756 default:
2757 break;
2760 return 0;
2763 static Node*
2764 get_head_value_node(Node* node, int exact, regex_t* reg)
2766 Node* n = NULL_NODE;
2768 switch (NTYPE(node)) {
2769 case NT_BREF:
2770 case NT_ALT:
2771 case NT_CANY:
2772 #ifdef USE_SUBEXP_CALL
2773 case NT_CALL:
2774 #endif
2775 break;
2777 case NT_CTYPE:
2778 case NT_CCLASS:
2779 if (exact == 0) {
2780 n = node;
2782 break;
2784 case NT_LIST:
2785 n = get_head_value_node(NCAR(node), exact, reg);
2786 break;
2788 case NT_STR:
2790 StrNode* sn = NSTR(node);
2792 if (sn->end <= sn->s)
2793 break;
2795 if (exact != 0 &&
2796 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2798 else {
2799 n = node;
2802 break;
2804 case NT_QTFR:
2806 QtfrNode* qn = NQTFR(node);
2807 if (qn->lower > 0) {
2808 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
2809 if (IS_NOT_NULL(qn->head_exact))
2810 n = qn->head_exact;
2811 else
2812 #endif
2813 n = get_head_value_node(qn->target, exact, reg);
2816 break;
2818 case NT_ENCLOSE:
2820 EncloseNode* en = NENCLOSE(node);
2821 switch (en->type) {
2822 case ENCLOSE_OPTION:
2824 OnigOptionType options = reg->options;
2826 reg->options = NENCLOSE(node)->option;
2827 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2828 reg->options = options;
2830 break;
2832 case ENCLOSE_MEMORY:
2833 case ENCLOSE_STOP_BACKTRACK:
2834 case ENCLOSE_CONDITION:
2835 n = get_head_value_node(en->target, exact, reg);
2836 break;
2838 case ENCLOSE_ABSENT:
2839 break;
2842 break;
2844 case NT_ANCHOR:
2845 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2846 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2847 break;
2849 default:
2850 break;
2853 return n;
2856 static int
2857 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2859 int type, r = 0;
2861 type = NTYPE(node);
2862 if ((NTYPE2BIT(type) & type_mask) == 0)
2863 return 1;
2865 switch (type) {
2866 case NT_LIST:
2867 case NT_ALT:
2868 do {
2869 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2870 anchor_mask);
2871 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2872 break;
2874 case NT_QTFR:
2875 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2876 anchor_mask);
2877 break;
2879 case NT_ENCLOSE:
2881 EncloseNode* en = NENCLOSE(node);
2882 if ((en->type & enclose_mask) == 0)
2883 return 1;
2885 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2887 break;
2889 case NT_ANCHOR:
2890 type = NANCHOR(node)->type;
2891 if ((type & anchor_mask) == 0)
2892 return 1;
2894 if (NANCHOR(node)->target)
2895 r = check_type_tree(NANCHOR(node)->target,
2896 type_mask, enclose_mask, anchor_mask);
2897 break;
2899 default:
2900 break;
2902 return r;
2905 #ifdef USE_SUBEXP_CALL
2907 # define RECURSION_EXIST 1
2908 # define RECURSION_INFINITE 2
2910 static int
2911 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2913 int type;
2914 int r = 0;
2916 type = NTYPE(node);
2917 switch (type) {
2918 case NT_LIST:
2920 Node *x;
2921 OnigDistance min;
2922 int ret;
2924 x = node;
2925 do {
2926 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2927 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2928 r |= ret;
2929 if (head) {
2930 ret = get_min_match_length(NCAR(x), &min, env);
2931 if (ret != 0) return ret;
2932 if (min != 0) head = 0;
2934 } while (IS_NOT_NULL(x = NCDR(x)));
2936 break;
2938 case NT_ALT:
2940 int ret;
2941 r = RECURSION_EXIST;
2942 do {
2943 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2944 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2945 r &= ret;
2946 } while (IS_NOT_NULL(node = NCDR(node)));
2948 break;
2950 case NT_QTFR:
2951 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2952 if (r == RECURSION_EXIST) {
2953 if (NQTFR(node)->lower == 0) r = 0;
2955 break;
2957 case NT_ANCHOR:
2959 AnchorNode* an = NANCHOR(node);
2960 switch (an->type) {
2961 case ANCHOR_PREC_READ:
2962 case ANCHOR_PREC_READ_NOT:
2963 case ANCHOR_LOOK_BEHIND:
2964 case ANCHOR_LOOK_BEHIND_NOT:
2965 r = subexp_inf_recursive_check(an->target, env, head);
2966 break;
2969 break;
2971 case NT_CALL:
2972 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2973 break;
2975 case NT_ENCLOSE:
2976 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2977 return 0;
2978 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2979 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2980 else {
2981 SET_ENCLOSE_STATUS(node, NST_MARK2);
2982 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2983 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2985 break;
2987 default:
2988 break;
2991 return r;
2994 static int
2995 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2997 int type;
2998 int r = 0;
3000 type = NTYPE(node);
3001 switch (type) {
3002 case NT_LIST:
3003 case NT_ALT:
3004 do {
3005 r = subexp_inf_recursive_check_trav(NCAR(node), env);
3006 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3007 break;
3009 case NT_QTFR:
3010 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
3011 break;
3013 case NT_ANCHOR:
3015 AnchorNode* an = NANCHOR(node);
3016 switch (an->type) {
3017 case ANCHOR_PREC_READ:
3018 case ANCHOR_PREC_READ_NOT:
3019 case ANCHOR_LOOK_BEHIND:
3020 case ANCHOR_LOOK_BEHIND_NOT:
3021 r = subexp_inf_recursive_check_trav(an->target, env);
3022 break;
3025 break;
3027 case NT_ENCLOSE:
3029 EncloseNode* en = NENCLOSE(node);
3031 if (IS_ENCLOSE_RECURSION(en)) {
3032 SET_ENCLOSE_STATUS(node, NST_MARK1);
3033 r = subexp_inf_recursive_check(en->target, env, 1);
3034 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
3035 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3037 r = subexp_inf_recursive_check_trav(en->target, env);
3040 break;
3042 default:
3043 break;
3046 return r;
3049 static int
3050 subexp_recursive_check(Node* node)
3052 int r = 0;
3054 switch (NTYPE(node)) {
3055 case NT_LIST:
3056 case NT_ALT:
3057 do {
3058 r |= subexp_recursive_check(NCAR(node));
3059 } while (IS_NOT_NULL(node = NCDR(node)));
3060 break;
3062 case NT_QTFR:
3063 r = subexp_recursive_check(NQTFR(node)->target);
3064 break;
3066 case NT_ANCHOR:
3068 AnchorNode* an = NANCHOR(node);
3069 switch (an->type) {
3070 case ANCHOR_PREC_READ:
3071 case ANCHOR_PREC_READ_NOT:
3072 case ANCHOR_LOOK_BEHIND:
3073 case ANCHOR_LOOK_BEHIND_NOT:
3074 r = subexp_recursive_check(an->target);
3075 break;
3078 break;
3080 case NT_CALL:
3081 r = subexp_recursive_check(NCALL(node)->target);
3082 if (r != 0) SET_CALL_RECURSION(node);
3083 break;
3085 case NT_ENCLOSE:
3086 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
3087 return 0;
3088 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
3089 return 1; /* recursion */
3090 else {
3091 SET_ENCLOSE_STATUS(node, NST_MARK2);
3092 r = subexp_recursive_check(NENCLOSE(node)->target);
3093 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
3095 break;
3097 default:
3098 break;
3101 return r;
3105 static int
3106 subexp_recursive_check_trav(Node* node, ScanEnv* env)
3108 # define FOUND_CALLED_NODE 1
3110 int type;
3111 int r = 0;
3113 type = NTYPE(node);
3114 switch (type) {
3115 case NT_LIST:
3116 case NT_ALT:
3118 int ret;
3119 do {
3120 ret = subexp_recursive_check_trav(NCAR(node), env);
3121 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3122 else if (ret < 0) return ret;
3123 } while (IS_NOT_NULL(node = NCDR(node)));
3125 break;
3127 case NT_QTFR:
3128 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
3129 if (NQTFR(node)->upper == 0) {
3130 if (r == FOUND_CALLED_NODE)
3131 NQTFR(node)->is_referred = 1;
3133 break;
3135 case NT_ANCHOR:
3137 AnchorNode* an = NANCHOR(node);
3138 switch (an->type) {
3139 case ANCHOR_PREC_READ:
3140 case ANCHOR_PREC_READ_NOT:
3141 case ANCHOR_LOOK_BEHIND:
3142 case ANCHOR_LOOK_BEHIND_NOT:
3143 r = subexp_recursive_check_trav(an->target, env);
3144 break;
3147 break;
3149 case NT_ENCLOSE:
3151 EncloseNode* en = NENCLOSE(node);
3153 if (! IS_ENCLOSE_RECURSION(en)) {
3154 if (IS_ENCLOSE_CALLED(en)) {
3155 SET_ENCLOSE_STATUS(node, NST_MARK1);
3156 r = subexp_recursive_check(en->target);
3157 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
3158 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3161 r = subexp_recursive_check_trav(en->target, env);
3162 if (IS_ENCLOSE_CALLED(en))
3163 r |= FOUND_CALLED_NODE;
3165 break;
3167 default:
3168 break;
3171 return r;
3174 static int
3175 setup_subexp_call(Node* node, ScanEnv* env)
3177 int type;
3178 int r = 0;
3180 type = NTYPE(node);
3181 switch (type) {
3182 case NT_LIST:
3183 do {
3184 r = setup_subexp_call(NCAR(node), env);
3185 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3186 break;
3188 case NT_ALT:
3189 do {
3190 r = setup_subexp_call(NCAR(node), env);
3191 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3192 break;
3194 case NT_QTFR:
3195 r = setup_subexp_call(NQTFR(node)->target, env);
3196 break;
3197 case NT_ENCLOSE:
3198 r = setup_subexp_call(NENCLOSE(node)->target, env);
3199 break;
3201 case NT_CALL:
3203 CallNode* cn = NCALL(node);
3204 Node** nodes = SCANENV_MEM_NODES(env);
3206 if (cn->group_num != 0) {
3207 int gnum = cn->group_num;
3209 # ifdef USE_NAMED_GROUP
3210 if (env->num_named > 0 &&
3211 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3212 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3213 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3215 # endif
3216 if (gnum > env->num_mem) {
3217 onig_scan_env_set_error_string(env,
3218 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3219 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3222 # ifdef USE_NAMED_GROUP
3223 set_call_attr:
3224 # endif
3225 cn->target = nodes[cn->group_num];
3226 if (IS_NULL(cn->target)) {
3227 onig_scan_env_set_error_string(env,
3228 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3229 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3231 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3232 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3233 cn->unset_addr_list = env->unset_addr_list;
3235 # ifdef USE_NAMED_GROUP
3236 # ifdef USE_PERL_SUBEXP_CALL
3237 else if (cn->name == cn->name_end) {
3238 goto set_call_attr;
3240 # endif
3241 else {
3242 int *refs;
3244 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3245 &refs);
3246 if (n <= 0) {
3247 onig_scan_env_set_error_string(env,
3248 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3249 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3251 else if (n > 1 &&
3252 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) {
3253 onig_scan_env_set_error_string(env,
3254 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3255 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3257 else {
3258 cn->group_num = refs[0];
3259 goto set_call_attr;
3262 # endif
3264 break;
3266 case NT_ANCHOR:
3268 AnchorNode* an = NANCHOR(node);
3270 switch (an->type) {
3271 case ANCHOR_PREC_READ:
3272 case ANCHOR_PREC_READ_NOT:
3273 case ANCHOR_LOOK_BEHIND:
3274 case ANCHOR_LOOK_BEHIND_NOT:
3275 r = setup_subexp_call(an->target, env);
3276 break;
3279 break;
3281 default:
3282 break;
3285 return r;
3287 #endif
3289 /* divide different length alternatives in look-behind.
3290 (?<=A|B) ==> (?<=A)|(?<=B)
3291 (?<!A|B) ==> (?<!A)(?<!B)
3293 static int
3294 divide_look_behind_alternatives(Node* node)
3296 Node *head, *np, *insert_node;
3297 AnchorNode* an = NANCHOR(node);
3298 int anc_type = an->type;
3300 head = an->target;
3301 np = NCAR(head);
3302 swap_node(node, head);
3303 NCAR(node) = head;
3304 NANCHOR(head)->target = np;
3306 np = node;
3307 while ((np = NCDR(np)) != NULL_NODE) {
3308 insert_node = onig_node_new_anchor(anc_type);
3309 CHECK_NULL_RETURN_MEMERR(insert_node);
3310 NANCHOR(insert_node)->target = NCAR(np);
3311 NCAR(np) = insert_node;
3314 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3315 np = node;
3316 do {
3317 SET_NTYPE(np, NT_LIST); /* alt -> list */
3318 } while ((np = NCDR(np)) != NULL_NODE);
3320 return 0;
3323 static int
3324 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3326 int r, len;
3327 AnchorNode* an = NANCHOR(node);
3329 r = get_char_length_tree(an->target, reg, &len);
3330 if (r == 0)
3331 an->char_len = len;
3332 else if (r == GET_CHAR_LEN_VARLEN)
3333 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3334 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3335 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3336 r = divide_look_behind_alternatives(node);
3337 else
3338 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3341 return r;
3344 static int
3345 next_setup(Node* node, Node* next_node, regex_t* reg)
3347 int type;
3349 retry:
3350 type = NTYPE(node);
3351 if (type == NT_QTFR) {
3352 QtfrNode* qn = NQTFR(node);
3353 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3354 #ifdef USE_QTFR_PEEK_NEXT
3355 Node* n = get_head_value_node(next_node, 1, reg);
3356 /* '\0': for UTF-16BE etc... */
3357 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3358 qn->next_head_exact = n;
3360 #endif
3361 /* automatic possessification a*b ==> (?>a*)b */
3362 if (qn->lower <= 1) {
3363 int ttype = NTYPE(qn->target);
3364 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3365 Node *x, *y;
3366 x = get_head_value_node(qn->target, 0, reg);
3367 if (IS_NOT_NULL(x)) {
3368 y = get_head_value_node(next_node, 0, reg);
3369 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3370 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3371 CHECK_NULL_RETURN_MEMERR(en);
3372 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3373 swap_node(node, en);
3374 NENCLOSE(node)->target = en;
3381 else if (type == NT_ENCLOSE) {
3382 EncloseNode* en = NENCLOSE(node);
3383 if (en->type == ENCLOSE_MEMORY) {
3384 node = en->target;
3385 goto retry;
3388 return 0;
3392 static int
3393 update_string_node_case_fold(regex_t* reg, Node *node)
3395 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3396 UChar *sbuf, *ebuf, *sp;
3397 int r, i, len;
3398 OnigDistance sbuf_size;
3399 StrNode* sn = NSTR(node);
3401 end = sn->end;
3402 sbuf_size = (end - sn->s) * 2;
3403 sbuf = (UChar* )xmalloc(sbuf_size);
3404 CHECK_NULL_RETURN_MEMERR(sbuf);
3405 ebuf = sbuf + sbuf_size;
3407 sp = sbuf;
3408 p = sn->s;
3409 while (p < end) {
3410 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3411 for (i = 0; i < len; i++) {
3412 if (sp >= ebuf) {
3413 UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3414 if (IS_NULL(p)) {
3415 xfree(sbuf);
3416 return ONIGERR_MEMORY;
3418 sbuf = p;
3419 sp = sbuf + sbuf_size;
3420 sbuf_size *= 2;
3421 ebuf = sbuf + sbuf_size;
3424 *sp++ = buf[i];
3428 r = onig_node_str_set(node, sbuf, sp);
3430 xfree(sbuf);
3431 return r;
3434 static int
3435 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3436 regex_t* reg)
3438 int r;
3439 Node *node;
3441 node = onig_node_new_str(s, end);
3442 if (IS_NULL(node)) return ONIGERR_MEMORY;
3444 r = update_string_node_case_fold(reg, node);
3445 if (r != 0) {
3446 onig_node_free(node);
3447 return r;
3450 NSTRING_SET_AMBIG(node);
3451 NSTRING_SET_DONT_GET_OPT_INFO(node);
3452 *rnode = node;
3453 return 0;
3456 static int
3457 is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[],
3458 int slen)
3460 int i;
3462 for (i = 0; i < item_num; i++) {
3463 if (items[i].byte_len != slen) {
3464 return 1;
3466 if (items[i].code_len != 1) {
3467 return 1;
3470 return 0;
3473 static int
3474 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3475 UChar *p, int slen, UChar *end,
3476 regex_t* reg, Node **rnode)
3478 int r, i, j, len, varlen;
3479 Node *anode, *var_anode, *snode, *xnode, *an;
3480 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3482 *rnode = var_anode = NULL_NODE;
3484 varlen = 0;
3485 for (i = 0; i < item_num; i++) {
3486 if (items[i].byte_len != slen) {
3487 varlen = 1;
3488 break;
3492 if (varlen != 0) {
3493 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3494 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3496 xnode = onig_node_new_list(NULL, NULL);
3497 if (IS_NULL(xnode)) goto mem_err;
3498 NCAR(var_anode) = xnode;
3500 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3501 if (IS_NULL(anode)) goto mem_err;
3502 NCAR(xnode) = anode;
3504 else {
3505 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3506 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3509 snode = onig_node_new_str(p, p + slen);
3510 if (IS_NULL(snode)) goto mem_err;
3512 NCAR(anode) = snode;
3514 for (i = 0; i < item_num; i++) {
3515 snode = onig_node_new_str(NULL, NULL);
3516 if (IS_NULL(snode)) goto mem_err;
3518 for (j = 0; j < items[i].code_len; j++) {
3519 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3520 if (len < 0) {
3521 r = len;
3522 goto mem_err2;
3525 r = onig_node_str_cat(snode, buf, buf + len);
3526 if (r != 0) goto mem_err2;
3529 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3530 if (IS_NULL(an)) {
3531 goto mem_err2;
3534 if (items[i].byte_len != slen) {
3535 Node *rem;
3536 UChar *q = p + items[i].byte_len;
3538 if (q < end) {
3539 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3540 if (r != 0) {
3541 onig_node_free(an);
3542 goto mem_err2;
3545 xnode = onig_node_list_add(NULL_NODE, snode);
3546 if (IS_NULL(xnode)) {
3547 onig_node_free(an);
3548 onig_node_free(rem);
3549 goto mem_err2;
3551 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3552 onig_node_free(an);
3553 onig_node_free(xnode);
3554 onig_node_free(rem);
3555 goto mem_err;
3558 NCAR(an) = xnode;
3560 else {
3561 NCAR(an) = snode;
3564 NCDR(var_anode) = an;
3565 var_anode = an;
3567 else {
3568 NCAR(an) = snode;
3569 NCDR(anode) = an;
3570 anode = an;
3574 return varlen;
3576 mem_err2:
3577 onig_node_free(snode);
3579 mem_err:
3580 onig_node_free(*rnode);
3582 return ONIGERR_MEMORY;
3585 static int
3586 expand_case_fold_string(Node* node, regex_t* reg)
3588 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3590 int r, n, len, alt_num;
3591 int varlen = 0;
3592 UChar *start, *end, *p;
3593 Node *top_root, *root, *snode, *prev_node;
3594 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3595 StrNode* sn = NSTR(node);
3597 if (NSTRING_IS_AMBIG(node)) return 0;
3599 start = sn->s;
3600 end = sn->end;
3601 if (start >= end) return 0;
3603 r = 0;
3604 top_root = root = prev_node = snode = NULL_NODE;
3605 alt_num = 1;
3606 p = start;
3607 while (p < end) {
3608 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3609 p, end, items);
3610 if (n < 0) {
3611 r = n;
3612 goto err;
3615 len = enclen(reg->enc, p, end);
3617 varlen = is_case_fold_variable_len(n, items, len);
3618 if (n == 0 || varlen == 0) {
3619 if (IS_NULL(snode)) {
3620 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3621 onig_node_free(top_root);
3622 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3623 if (IS_NULL(root)) {
3624 onig_node_free(prev_node);
3625 goto mem_err;
3629 prev_node = snode = onig_node_new_str(NULL, NULL);
3630 if (IS_NULL(snode)) goto mem_err;
3631 if (IS_NOT_NULL(root)) {
3632 if (IS_NULL(onig_node_list_add(root, snode))) {
3633 onig_node_free(snode);
3634 goto mem_err;
3639 r = onig_node_str_cat(snode, p, p + len);
3640 if (r != 0) goto err;
3642 else {
3643 alt_num *= (n + 1);
3644 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3646 if (IS_NOT_NULL(snode)) {
3647 r = update_string_node_case_fold(reg, snode);
3648 if (r == 0) {
3649 NSTRING_SET_AMBIG(snode);
3652 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3653 onig_node_free(top_root);
3654 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3655 if (IS_NULL(root)) {
3656 onig_node_free(prev_node);
3657 goto mem_err;
3661 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3662 if (r < 0) goto mem_err;
3663 if (r == 1) {
3664 if (IS_NULL(root)) {
3665 top_root = prev_node;
3667 else {
3668 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3669 onig_node_free(prev_node);
3670 goto mem_err;
3674 root = NCAR(prev_node);
3676 else { /* r == 0 */
3677 if (IS_NOT_NULL(root)) {
3678 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3679 onig_node_free(prev_node);
3680 goto mem_err;
3685 snode = NULL_NODE;
3688 p += len;
3690 if (IS_NOT_NULL(snode)) {
3691 r = update_string_node_case_fold(reg, snode);
3692 if (r == 0) {
3693 NSTRING_SET_AMBIG(snode);
3697 if (p < end) {
3698 Node *srem;
3700 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3701 if (r != 0) goto mem_err;
3703 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3704 onig_node_free(top_root);
3705 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3706 if (IS_NULL(root)) {
3707 onig_node_free(srem);
3708 onig_node_free(prev_node);
3709 goto mem_err;
3713 if (IS_NULL(root)) {
3714 prev_node = srem;
3716 else {
3717 if (IS_NULL(onig_node_list_add(root, srem))) {
3718 onig_node_free(srem);
3719 goto mem_err;
3724 /* ending */
3725 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3726 swap_node(node, top_root);
3727 onig_node_free(top_root);
3728 return 0;
3730 mem_err:
3731 r = ONIGERR_MEMORY;
3733 err:
3734 onig_node_free(top_root);
3735 return r;
3739 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3741 # define CEC_THRES_NUM_BIG_REPEAT 512
3742 # define CEC_INFINITE_NUM 0x7fffffff
3744 # define CEC_IN_INFINITE_REPEAT (1<<0)
3745 # define CEC_IN_FINITE_REPEAT (1<<1)
3746 # define CEC_CONT_BIG_REPEAT (1<<2)
3748 static int
3749 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3751 int type;
3752 int r = state;
3754 type = NTYPE(node);
3755 switch (type) {
3756 case NT_LIST:
3758 do {
3759 r = setup_comb_exp_check(NCAR(node), r, env);
3760 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3762 break;
3764 case NT_ALT:
3766 int ret;
3767 do {
3768 ret = setup_comb_exp_check(NCAR(node), state, env);
3769 r |= ret;
3770 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3772 break;
3774 case NT_QTFR:
3776 int child_state = state;
3777 int add_state = 0;
3778 QtfrNode* qn = NQTFR(node);
3779 Node* target = qn->target;
3780 int var_num;
3782 if (! IS_REPEAT_INFINITE(qn->upper)) {
3783 if (qn->upper > 1) {
3784 /* {0,1}, {1,1} are allowed */
3785 child_state |= CEC_IN_FINITE_REPEAT;
3787 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3788 if (env->backrefed_mem == 0) {
3789 if (NTYPE(qn->target) == NT_ENCLOSE) {
3790 EncloseNode* en = NENCLOSE(qn->target);
3791 if (en->type == ENCLOSE_MEMORY) {
3792 if (NTYPE(en->target) == NT_QTFR) {
3793 QtfrNode* q = NQTFR(en->target);
3794 if (IS_REPEAT_INFINITE(q->upper)
3795 && q->greedy == qn->greedy) {
3796 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3797 if (qn->upper == 1)
3798 child_state = state;
3807 if (state & CEC_IN_FINITE_REPEAT) {
3808 qn->comb_exp_check_num = -1;
3810 else {
3811 if (IS_REPEAT_INFINITE(qn->upper)) {
3812 var_num = CEC_INFINITE_NUM;
3813 child_state |= CEC_IN_INFINITE_REPEAT;
3815 else {
3816 var_num = qn->upper - qn->lower;
3819 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3820 add_state |= CEC_CONT_BIG_REPEAT;
3822 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3823 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3824 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3825 if (qn->comb_exp_check_num == 0) {
3826 env->num_comb_exp_check++;
3827 qn->comb_exp_check_num = env->num_comb_exp_check;
3828 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3829 env->comb_exp_max_regnum = env->curr_max_regnum;
3834 r = setup_comb_exp_check(target, child_state, env);
3835 r |= add_state;
3837 break;
3839 case NT_ENCLOSE:
3841 EncloseNode* en = NENCLOSE(node);
3843 switch (en->type) {
3844 case ENCLOSE_MEMORY:
3846 if (env->curr_max_regnum < en->regnum)
3847 env->curr_max_regnum = en->regnum;
3849 r = setup_comb_exp_check(en->target, state, env);
3851 break;
3853 default:
3854 r = setup_comb_exp_check(en->target, state, env);
3855 break;
3858 break;
3860 # ifdef USE_SUBEXP_CALL
3861 case NT_CALL:
3862 if (IS_CALL_RECURSION(NCALL(node)))
3863 env->has_recursion = 1;
3864 else
3865 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3866 break;
3867 # endif
3869 default:
3870 break;
3873 return r;
3875 #endif
3877 #define IN_ALT (1<<0)
3878 #define IN_NOT (1<<1)
3879 #define IN_REPEAT (1<<2)
3880 #define IN_VAR_REPEAT (1<<3)
3881 #define IN_CALL (1<<4)
3882 #define IN_RECCALL (1<<5)
3884 /* setup_tree does the following work.
3885 1. check empty loop. (set qn->target_empty_info)
3886 2. expand ignore-case in char class.
3887 3. set memory status bit flags. (reg->mem_stats)
3888 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3889 5. find invalid patterns in look-behind.
3890 6. expand repeated string.
3892 static int
3893 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3895 int type;
3896 int r = 0;
3898 restart:
3899 type = NTYPE(node);
3900 switch (type) {
3901 case NT_LIST:
3903 Node* prev = NULL_NODE;
3904 do {
3905 r = setup_tree(NCAR(node), reg, state, env);
3906 if (IS_NOT_NULL(prev) && r == 0) {
3907 r = next_setup(prev, NCAR(node), reg);
3909 prev = NCAR(node);
3910 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3912 break;
3914 case NT_ALT:
3915 do {
3916 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3917 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3918 break;
3920 case NT_CCLASS:
3921 break;
3923 case NT_STR:
3924 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3925 r = expand_case_fold_string(node, reg);
3927 break;
3929 case NT_CTYPE:
3930 case NT_CANY:
3931 break;
3933 #ifdef USE_SUBEXP_CALL
3934 case NT_CALL:
3935 break;
3936 #endif
3938 case NT_BREF:
3940 int i;
3941 int* p;
3942 Node** nodes = SCANENV_MEM_NODES(env);
3943 BRefNode* br = NBREF(node);
3944 p = BACKREFS_P(br);
3945 for (i = 0; i < br->back_num; i++) {
3946 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3947 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3948 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3949 #ifdef USE_BACKREF_WITH_LEVEL
3950 if (IS_BACKREF_NEST_LEVEL(br)) {
3951 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3953 #endif
3954 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3957 break;
3959 case NT_QTFR:
3961 OnigDistance d;
3962 QtfrNode* qn = NQTFR(node);
3963 Node* target = qn->target;
3965 if ((state & IN_REPEAT) != 0) {
3966 qn->state |= NST_IN_REPEAT;
3969 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3970 r = get_min_match_length(target, &d, env);
3971 if (r) break;
3972 if (d == 0) {
3973 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3974 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3975 r = quantifiers_memory_node_info(target);
3976 if (r < 0) break;
3977 if (r > 0) {
3978 qn->target_empty_info = r;
3980 #endif
3981 #if 0
3982 r = get_max_match_length(target, &d, env);
3983 if (r == 0 && d == 0) {
3984 /* ()* ==> ()?, ()+ ==> () */
3985 qn->upper = 1;
3986 if (qn->lower > 1) qn->lower = 1;
3987 if (NTYPE(target) == NT_STR) {
3988 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3991 #endif
3995 state |= IN_REPEAT;
3996 if (qn->lower != qn->upper)
3997 state |= IN_VAR_REPEAT;
3998 r = setup_tree(target, reg, state, env);
3999 if (r) break;
4001 /* expand string */
4002 #define EXPAND_STRING_MAX_LENGTH 100
4003 if (NTYPE(target) == NT_STR) {
4004 if (qn->lower > 1) {
4005 int i, n = qn->lower;
4006 OnigDistance len = NSTRING_LEN(target);
4007 StrNode* sn = NSTR(target);
4008 Node* np;
4010 np = onig_node_new_str(sn->s, sn->end);
4011 if (IS_NULL(np)) return ONIGERR_MEMORY;
4012 NSTR(np)->flag = sn->flag;
4014 for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) {
4015 r = onig_node_str_cat(np, sn->s, sn->end);
4016 if (r) {
4017 onig_node_free(np);
4018 return r;
4021 if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) {
4022 Node *np1, *np2;
4024 qn->lower -= i;
4025 if (! IS_REPEAT_INFINITE(qn->upper))
4026 qn->upper -= i;
4028 np1 = onig_node_new_list(np, NULL);
4029 if (IS_NULL(np1)) {
4030 onig_node_free(np);
4031 return ONIGERR_MEMORY;
4033 swap_node(np1, node);
4034 np2 = onig_node_list_add(node, np1);
4035 if (IS_NULL(np2)) {
4036 onig_node_free(np1);
4037 return ONIGERR_MEMORY;
4040 else {
4041 swap_node(np, node);
4042 onig_node_free(np);
4044 break; /* break case NT_QTFR: */
4048 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
4049 if (qn->greedy && (qn->target_empty_info != 0)) {
4050 if (NTYPE(target) == NT_QTFR) {
4051 QtfrNode* tqn = NQTFR(target);
4052 if (IS_NOT_NULL(tqn->head_exact)) {
4053 qn->head_exact = tqn->head_exact;
4054 tqn->head_exact = NULL;
4057 else {
4058 qn->head_exact = get_head_value_node(qn->target, 1, reg);
4061 #endif
4063 break;
4065 case NT_ENCLOSE:
4067 EncloseNode* en = NENCLOSE(node);
4069 switch (en->type) {
4070 case ENCLOSE_OPTION:
4072 OnigOptionType options = reg->options;
4073 reg->options = NENCLOSE(node)->option;
4074 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4075 reg->options = options;
4077 break;
4079 case ENCLOSE_MEMORY:
4080 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
4081 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
4082 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
4084 if (IS_ENCLOSE_CALLED(en))
4085 state |= IN_CALL;
4086 if (IS_ENCLOSE_RECURSION(en))
4087 state |= IN_RECCALL;
4088 else if ((state & IN_RECCALL) != 0)
4089 SET_CALL_RECURSION(node);
4090 r = setup_tree(en->target, reg, state, env);
4091 break;
4093 case ENCLOSE_STOP_BACKTRACK:
4095 Node* target = en->target;
4096 r = setup_tree(target, reg, state, env);
4097 if (NTYPE(target) == NT_QTFR) {
4098 QtfrNode* tqn = NQTFR(target);
4099 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
4100 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
4101 int qtype = NTYPE(tqn->target);
4102 if (IS_NODE_TYPE_SIMPLE(qtype))
4103 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
4107 break;
4109 case ENCLOSE_CONDITION:
4110 #ifdef USE_NAMED_GROUP
4111 if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) &&
4112 env->num_named > 0 &&
4113 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4114 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
4115 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4117 #endif
4118 if (NENCLOSE(node)->regnum > env->num_mem)
4119 return ONIGERR_INVALID_BACKREF;
4120 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4121 break;
4123 case ENCLOSE_ABSENT:
4124 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4125 break;
4128 break;
4130 case NT_ANCHOR:
4132 AnchorNode* an = NANCHOR(node);
4134 switch (an->type) {
4135 case ANCHOR_PREC_READ:
4136 r = setup_tree(an->target, reg, state, env);
4137 break;
4138 case ANCHOR_PREC_READ_NOT:
4139 r = setup_tree(an->target, reg, (state | IN_NOT), env);
4140 break;
4142 /* allowed node types in look-behind */
4143 #define ALLOWED_TYPE_IN_LB \
4144 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
4145 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
4147 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
4148 #define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
4150 #define ALLOWED_ANCHOR_IN_LB \
4151 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4152 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4153 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4154 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4155 #define ALLOWED_ANCHOR_IN_LB_NOT \
4156 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4157 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4158 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4159 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4161 case ANCHOR_LOOK_BEHIND:
4163 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4164 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
4165 if (r < 0) return r;
4166 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4167 if (NTYPE(node) != NT_ANCHOR) goto restart;
4168 r = setup_tree(an->target, reg, state, env);
4169 if (r != 0) return r;
4170 r = setup_look_behind(node, reg, env);
4172 break;
4174 case ANCHOR_LOOK_BEHIND_NOT:
4176 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4177 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4178 if (r < 0) return r;
4179 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4180 if (NTYPE(node) != NT_ANCHOR) goto restart;
4181 r = setup_tree(an->target, reg, (state | IN_NOT), env);
4182 if (r != 0) return r;
4183 r = setup_look_behind(node, reg, env);
4185 break;
4188 break;
4190 default:
4191 break;
4194 return r;
4197 #ifndef USE_SUNDAY_QUICK_SEARCH
4198 /* set skip map for Boyer-Moore search */
4199 static int
4200 set_bm_skip(UChar* s, UChar* end, regex_t* reg,
4201 UChar skip[], int** int_skip, int ignore_case)
4203 OnigDistance i, len;
4204 int clen, flen, n, j, k;
4205 UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
4206 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4207 OnigEncoding enc = reg->enc;
4209 len = end - s;
4210 if (len < ONIG_CHAR_TABLE_SIZE) {
4211 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len;
4213 n = 0;
4214 for (i = 0; i < len - 1; i += clen) {
4215 p = s + i;
4216 if (ignore_case)
4217 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4218 p, end, items);
4219 clen = enclen(enc, p, end);
4220 if (p + clen > end)
4221 clen = (int )(end - p);
4223 for (j = 0; j < n; j++) {
4224 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4225 return 1; /* different length isn't supported. */
4226 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4227 if (flen != clen)
4228 return 1; /* different length isn't supported. */
4230 for (j = 0; j < clen; j++) {
4231 skip[s[i + j]] = (UChar )(len - 1 - i - j);
4232 for (k = 0; k < n; k++) {
4233 skip[buf[k][j]] = (UChar )(len - 1 - i - j);
4238 else {
4239 # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
4240 /* This should not happen. */
4241 return ONIGERR_TYPE_BUG;
4242 # else
4243 if (IS_NULL(*int_skip)) {
4244 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
4245 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
4247 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len;
4249 n = 0;
4250 for (i = 0; i < len - 1; i += clen) {
4251 p = s + i;
4252 if (ignore_case)
4253 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4254 p, end, items);
4255 clen = enclen(enc, p, end);
4256 if (p + clen > end)
4257 clen = (int )(end - p);
4259 for (j = 0; j < n; j++) {
4260 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4261 return 1; /* different length isn't supported. */
4262 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4263 if (flen != clen)
4264 return 1; /* different length isn't supported. */
4266 for (j = 0; j < clen; j++) {
4267 (*int_skip)[s[i + j]] = (int )(len - 1 - i - j);
4268 for (k = 0; k < n; k++) {
4269 (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j);
4273 # endif
4275 return 0;
4278 #else /* USE_SUNDAY_QUICK_SEARCH */
4280 /* set skip map for Sunday's quick search */
4281 static int
4282 set_bm_skip(UChar* s, UChar* end, regex_t* reg,
4283 UChar skip[], int** int_skip, int ignore_case)
4285 OnigDistance i, len;
4286 int clen, flen, n, j, k;
4287 UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
4288 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4289 OnigEncoding enc = reg->enc;
4291 len = end - s;
4292 if (len < ONIG_CHAR_TABLE_SIZE) {
4293 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1);
4295 n = 0;
4296 for (i = 0; i < len; i += clen) {
4297 p = s + i;
4298 if (ignore_case)
4299 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4300 p, end, items);
4301 clen = enclen(enc, p, end);
4302 if (p + clen > end)
4303 clen = (int )(end - p);
4305 for (j = 0; j < n; j++) {
4306 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4307 return 1; /* different length isn't supported. */
4308 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4309 if (flen != clen)
4310 return 1; /* different length isn't supported. */
4312 for (j = 0; j < clen; j++) {
4313 skip[s[i + j]] = (UChar )(len - i - j);
4314 for (k = 0; k < n; k++) {
4315 skip[buf[k][j]] = (UChar )(len - i - j);
4320 else {
4321 # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
4322 /* This should not happen. */
4323 return ONIGERR_TYPE_BUG;
4324 # else
4325 if (IS_NULL(*int_skip)) {
4326 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
4327 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
4329 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1);
4331 n = 0;
4332 for (i = 0; i < len; i += clen) {
4333 p = s + i;
4334 if (ignore_case)
4335 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4336 p, end, items);
4337 clen = enclen(enc, p, end);
4338 if (p + clen > end)
4339 clen = (int )(end - p);
4341 for (j = 0; j < n; j++) {
4342 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4343 return 1; /* different length isn't supported. */
4344 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4345 if (flen != clen)
4346 return 1; /* different length isn't supported. */
4348 for (j = 0; j < clen; j++) {
4349 (*int_skip)[s[i + j]] = (int )(len - i - j);
4350 for (k = 0; k < n; k++) {
4351 (*int_skip)[buf[k][j]] = (int )(len - i - j);
4355 # endif
4357 return 0;
4359 #endif /* USE_SUNDAY_QUICK_SEARCH */
4361 typedef struct {
4362 OnigDistance min; /* min byte length */
4363 OnigDistance max; /* max byte length */
4364 } MinMaxLen;
4366 typedef struct {
4367 MinMaxLen mmd;
4368 OnigEncoding enc;
4369 OnigOptionType options;
4370 OnigCaseFoldType case_fold_flag;
4371 ScanEnv* scan_env;
4372 } OptEnv;
4374 typedef struct {
4375 int left_anchor;
4376 int right_anchor;
4377 } OptAncInfo;
4379 typedef struct {
4380 MinMaxLen mmd; /* info position */
4381 OptAncInfo anc;
4383 int reach_end;
4384 int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */
4385 int len;
4386 UChar s[OPT_EXACT_MAXLEN];
4387 } OptExactInfo;
4389 typedef struct {
4390 MinMaxLen mmd; /* info position */
4391 OptAncInfo anc;
4393 int value; /* weighted value */
4394 UChar map[ONIG_CHAR_TABLE_SIZE];
4395 } OptMapInfo;
4397 typedef struct {
4398 MinMaxLen len;
4400 OptAncInfo anc;
4401 OptExactInfo exb; /* boundary */
4402 OptExactInfo exm; /* middle */
4403 OptExactInfo expr; /* prec read (?=...) */
4405 OptMapInfo map; /* boundary */
4406 } NodeOptInfo;
4409 static int
4410 map_position_value(OnigEncoding enc, int i)
4412 static const short int ByteValTable[] = {
4413 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4414 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4415 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4416 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4417 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4418 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4419 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4420 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4423 if (i < numberof(ByteValTable)) {
4424 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4425 return 20;
4426 else
4427 return (int )ByteValTable[i];
4429 else
4430 return 4; /* Take it easy. */
4433 static int
4434 distance_value(MinMaxLen* mm)
4436 /* 1000 / (min-max-dist + 1) */
4437 static const short int dist_vals[] = {
4438 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4439 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4440 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4441 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4442 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4443 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4444 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4445 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4446 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4447 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4450 OnigDistance d;
4452 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4454 d = mm->max - mm->min;
4455 if (d < numberof(dist_vals))
4456 /* return dist_vals[d] * 16 / (mm->min + 12); */
4457 return (int )dist_vals[d];
4458 else
4459 return 1;
4462 static int
4463 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4465 if (v2 <= 0) return -1;
4466 if (v1 <= 0) return 1;
4468 v1 *= distance_value(d1);
4469 v2 *= distance_value(d2);
4471 if (v2 > v1) return 1;
4472 if (v2 < v1) return -1;
4474 if (d2->min < d1->min) return 1;
4475 if (d2->min > d1->min) return -1;
4476 return 0;
4479 static int
4480 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4482 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4486 static void
4487 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4489 mml->min = min;
4490 mml->max = max;
4493 static void
4494 clear_mml(MinMaxLen* mml)
4496 mml->min = mml->max = 0;
4499 static void
4500 copy_mml(MinMaxLen* to, MinMaxLen* from)
4502 to->min = from->min;
4503 to->max = from->max;
4506 static void
4507 add_mml(MinMaxLen* to, MinMaxLen* from)
4509 to->min = distance_add(to->min, from->min);
4510 to->max = distance_add(to->max, from->max);
4513 #if 0
4514 static void
4515 add_len_mml(MinMaxLen* to, OnigDistance len)
4517 to->min = distance_add(to->min, len);
4518 to->max = distance_add(to->max, len);
4520 #endif
4522 static void
4523 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4525 if (to->min > from->min) to->min = from->min;
4526 if (to->max < from->max) to->max = from->max;
4529 static void
4530 copy_opt_env(OptEnv* to, OptEnv* from)
4532 *to = *from;
4535 static void
4536 clear_opt_anc_info(OptAncInfo* anc)
4538 anc->left_anchor = 0;
4539 anc->right_anchor = 0;
4542 static void
4543 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4545 *to = *from;
4548 static void
4549 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4550 OnigDistance left_len, OnigDistance right_len)
4552 clear_opt_anc_info(to);
4554 to->left_anchor = left->left_anchor;
4555 if (left_len == 0) {
4556 to->left_anchor |= right->left_anchor;
4559 to->right_anchor = right->right_anchor;
4560 if (right_len == 0) {
4561 to->right_anchor |= left->right_anchor;
4563 else {
4564 to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
4568 static int
4569 is_left_anchor(int anc)
4571 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4572 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4573 anc == ANCHOR_PREC_READ_NOT)
4574 return 0;
4576 return 1;
4579 static int
4580 is_set_opt_anc_info(OptAncInfo* to, int anc)
4582 if ((to->left_anchor & anc) != 0) return 1;
4584 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4587 static void
4588 add_opt_anc_info(OptAncInfo* to, int anc)
4590 if (is_left_anchor(anc))
4591 to->left_anchor |= anc;
4592 else
4593 to->right_anchor |= anc;
4596 static void
4597 remove_opt_anc_info(OptAncInfo* to, int anc)
4599 if (is_left_anchor(anc))
4600 to->left_anchor &= ~anc;
4601 else
4602 to->right_anchor &= ~anc;
4605 static void
4606 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4608 to->left_anchor &= add->left_anchor;
4609 to->right_anchor &= add->right_anchor;
4612 static int
4613 is_full_opt_exact_info(OptExactInfo* ex)
4615 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4618 static void
4619 clear_opt_exact_info(OptExactInfo* ex)
4621 clear_mml(&ex->mmd);
4622 clear_opt_anc_info(&ex->anc);
4623 ex->reach_end = 0;
4624 ex->ignore_case = -1; /* unset */
4625 ex->len = 0;
4626 ex->s[0] = '\0';
4629 static void
4630 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4632 *to = *from;
4635 static void
4636 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4638 int i, j, len;
4639 UChar *p, *end;
4640 OptAncInfo tanc;
4642 if (to->ignore_case < 0)
4643 to->ignore_case = add->ignore_case;
4644 else if (to->ignore_case != add->ignore_case)
4645 return ; /* avoid */
4647 p = add->s;
4648 end = p + add->len;
4649 for (i = to->len; p < end; ) {
4650 len = enclen(enc, p, end);
4651 if (i + len > OPT_EXACT_MAXLEN) break;
4652 for (j = 0; j < len && p < end; j++)
4653 to->s[i++] = *p++;
4656 to->len = i;
4657 to->reach_end = (p == end ? add->reach_end : 0);
4659 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4660 if (! to->reach_end) tanc.right_anchor = 0;
4661 copy_opt_anc_info(&to->anc, &tanc);
4664 static void
4665 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4666 int raw ARG_UNUSED, OnigEncoding enc)
4668 int i, j, len;
4669 UChar *p;
4671 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4672 len = enclen(enc, p, end);
4673 if (i + len > OPT_EXACT_MAXLEN) break;
4674 for (j = 0; j < len && p < end; j++)
4675 to->s[i++] = *p++;
4678 to->len = i;
4681 static void
4682 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4684 int i, j, len;
4686 if (add->len == 0 || to->len == 0) {
4687 clear_opt_exact_info(to);
4688 return ;
4691 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4692 clear_opt_exact_info(to);
4693 return ;
4696 for (i = 0; i < to->len && i < add->len; ) {
4697 if (to->s[i] != add->s[i]) break;
4698 len = enclen(env->enc, to->s + i, to->s + to->len);
4700 for (j = 1; j < len; j++) {
4701 if (to->s[i+j] != add->s[i+j]) break;
4703 if (j < len) break;
4704 i += len;
4707 if (! add->reach_end || i < add->len || i < to->len) {
4708 to->reach_end = 0;
4710 to->len = i;
4711 if (to->ignore_case < 0)
4712 to->ignore_case = add->ignore_case;
4713 else if (add->ignore_case >= 0)
4714 to->ignore_case |= add->ignore_case;
4716 alt_merge_opt_anc_info(&to->anc, &add->anc);
4717 if (! to->reach_end) to->anc.right_anchor = 0;
4720 static void
4721 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4723 int v1, v2;
4725 v1 = now->len;
4726 v2 = alt->len;
4728 if (v2 == 0) {
4729 return ;
4731 else if (v1 == 0) {
4732 copy_opt_exact_info(now, alt);
4733 return ;
4735 else if (v1 <= 2 && v2 <= 2) {
4736 /* ByteValTable[x] is big value --> low price */
4737 v2 = map_position_value(enc, now->s[0]);
4738 v1 = map_position_value(enc, alt->s[0]);
4740 if (now->len > 1) v1 += 5;
4741 if (alt->len > 1) v2 += 5;
4744 if (now->ignore_case <= 0) v1 *= 2;
4745 if (alt->ignore_case <= 0) v2 *= 2;
4747 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4748 copy_opt_exact_info(now, alt);
4751 static void
4752 clear_opt_map_info(OptMapInfo* map)
4754 static const OptMapInfo clean_info = {
4755 {0, 0}, {0, 0}, 0,
4757 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4758 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4759 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4760 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4761 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4762 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4763 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4764 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4766 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4767 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4768 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4769 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4770 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4771 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4772 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4776 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4779 static void
4780 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4782 *to = *from;
4785 static void
4786 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4788 if (map->map[c] == 0) {
4789 map->map[c] = 1;
4790 map->value += map_position_value(enc, c);
4794 static int
4795 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4796 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4798 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4799 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4800 int i, n;
4802 add_char_opt_map_info(map, p[0], enc);
4804 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4805 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4806 if (n < 0) return n;
4808 for (i = 0; i < n; i++) {
4809 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4810 add_char_opt_map_info(map, buf[0], enc);
4813 return 0;
4816 static void
4817 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4819 const int z = 1<<15; /* 32768: something big value */
4821 int v1, v2;
4823 if (alt->value == 0) return ;
4824 if (now->value == 0) {
4825 copy_opt_map_info(now, alt);
4826 return ;
4829 v1 = z / now->value;
4830 v2 = z / alt->value;
4831 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4832 copy_opt_map_info(now, alt);
4835 static int
4836 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4838 #define COMP_EM_BASE 20
4839 int ve, vm;
4841 if (m->value <= 0) return -1;
4843 ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2);
4844 vm = COMP_EM_BASE * 5 * 2 / m->value;
4845 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4848 static void
4849 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4851 int i, val;
4853 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4854 if (to->value == 0) return ;
4855 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4856 clear_opt_map_info(to);
4857 return ;
4860 alt_merge_mml(&to->mmd, &add->mmd);
4862 val = 0;
4863 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4864 if (add->map[i])
4865 to->map[i] = 1;
4867 if (to->map[i])
4868 val += map_position_value(enc, i);
4870 to->value = val;
4872 alt_merge_opt_anc_info(&to->anc, &add->anc);
4875 static void
4876 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4878 copy_mml(&(opt->exb.mmd), mmd);
4879 copy_mml(&(opt->expr.mmd), mmd);
4880 copy_mml(&(opt->map.mmd), mmd);
4883 static void
4884 clear_node_opt_info(NodeOptInfo* opt)
4886 clear_mml(&opt->len);
4887 clear_opt_anc_info(&opt->anc);
4888 clear_opt_exact_info(&opt->exb);
4889 clear_opt_exact_info(&opt->exm);
4890 clear_opt_exact_info(&opt->expr);
4891 clear_opt_map_info(&opt->map);
4894 static void
4895 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4897 *to = *from;
4900 static void
4901 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4903 int exb_reach, exm_reach;
4904 OptAncInfo tanc;
4906 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4907 copy_opt_anc_info(&to->anc, &tanc);
4909 if (add->exb.len > 0 && to->len.max == 0) {
4910 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4911 to->len.max, add->len.max);
4912 copy_opt_anc_info(&add->exb.anc, &tanc);
4915 if (add->map.value > 0 && to->len.max == 0) {
4916 if (add->map.mmd.max == 0)
4917 add->map.anc.left_anchor |= to->anc.left_anchor;
4920 exb_reach = to->exb.reach_end;
4921 exm_reach = to->exm.reach_end;
4923 if (add->len.max != 0)
4924 to->exb.reach_end = to->exm.reach_end = 0;
4926 if (add->exb.len > 0) {
4927 if (exb_reach) {
4928 concat_opt_exact_info(&to->exb, &add->exb, enc);
4929 clear_opt_exact_info(&add->exb);
4931 else if (exm_reach) {
4932 concat_opt_exact_info(&to->exm, &add->exb, enc);
4933 clear_opt_exact_info(&add->exb);
4936 select_opt_exact_info(enc, &to->exm, &add->exb);
4937 select_opt_exact_info(enc, &to->exm, &add->exm);
4939 if (to->expr.len > 0) {
4940 if (add->len.max > 0) {
4941 if (to->expr.len > (int )add->len.max)
4942 to->expr.len = (int )add->len.max;
4944 if (to->expr.mmd.max == 0)
4945 select_opt_exact_info(enc, &to->exb, &to->expr);
4946 else
4947 select_opt_exact_info(enc, &to->exm, &to->expr);
4950 else if (add->expr.len > 0) {
4951 copy_opt_exact_info(&to->expr, &add->expr);
4954 select_opt_map_info(&to->map, &add->map);
4956 add_mml(&to->len, &add->len);
4959 static void
4960 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4962 alt_merge_opt_anc_info (&to->anc, &add->anc);
4963 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4964 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4965 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4966 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4968 alt_merge_mml(&to->len, &add->len);
4972 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4974 static int
4975 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4977 int type;
4978 int r = 0;
4980 clear_node_opt_info(opt);
4981 set_bound_node_opt_info(opt, &env->mmd);
4983 type = NTYPE(node);
4984 switch (type) {
4985 case NT_LIST:
4987 OptEnv nenv;
4988 NodeOptInfo nopt;
4989 Node* nd = node;
4991 copy_opt_env(&nenv, env);
4992 do {
4993 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4994 if (r == 0) {
4995 add_mml(&nenv.mmd, &nopt.len);
4996 concat_left_node_opt_info(env->enc, opt, &nopt);
4998 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
5000 break;
5002 case NT_ALT:
5004 NodeOptInfo nopt;
5005 Node* nd = node;
5007 do {
5008 r = optimize_node_left(NCAR(nd), &nopt, env);
5009 if (r == 0) {
5010 if (nd == node) copy_node_opt_info(opt, &nopt);
5011 else alt_merge_node_opt_info(opt, &nopt, env);
5013 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
5015 break;
5017 case NT_STR:
5019 StrNode* sn = NSTR(node);
5020 OnigDistance slen = sn->end - sn->s;
5021 int is_raw = NSTRING_IS_RAW(node);
5023 if (! NSTRING_IS_AMBIG(node)) {
5024 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
5025 is_raw, env->enc);
5026 opt->exb.ignore_case = 0;
5027 if (slen > 0) {
5028 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
5030 set_mml(&opt->len, slen, slen);
5032 else {
5033 OnigDistance max;
5035 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
5036 int n = onigenc_strlen(env->enc, sn->s, sn->end);
5037 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n;
5039 else {
5040 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
5041 is_raw, env->enc);
5042 opt->exb.ignore_case = 1;
5044 if (slen > 0) {
5045 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
5046 env->enc, env->case_fold_flag);
5047 if (r != 0) break;
5050 max = slen;
5053 set_mml(&opt->len, slen, max);
5056 if ((OnigDistance )opt->exb.len == slen)
5057 opt->exb.reach_end = 1;
5059 break;
5061 case NT_CCLASS:
5063 int i, z;
5064 CClassNode* cc = NCCLASS(node);
5066 /* no need to check ignore case. (set in setup_tree()) */
5068 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
5069 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
5070 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5072 set_mml(&opt->len, min, max);
5074 else {
5075 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5076 z = BITSET_AT(cc->bs, i);
5077 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
5078 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5081 set_mml(&opt->len, 1, 1);
5084 break;
5086 case NT_CTYPE:
5088 int i, min, max;
5089 int maxcode;
5091 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5093 if (max == 1) {
5094 min = 1;
5096 maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
5097 switch (NCTYPE(node)->ctype) {
5098 case ONIGENC_CTYPE_WORD:
5099 if (NCTYPE(node)->not != 0) {
5100 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5101 if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) {
5102 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5106 else {
5107 for (i = 0; i < maxcode; i++) {
5108 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
5109 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5113 break;
5116 else {
5117 min = ONIGENC_MBC_MINLEN(env->enc);
5119 set_mml(&opt->len, min, max);
5121 break;
5123 case NT_CANY:
5125 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
5126 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5127 set_mml(&opt->len, min, max);
5129 break;
5131 case NT_ANCHOR:
5132 switch (NANCHOR(node)->type) {
5133 case ANCHOR_BEGIN_BUF:
5134 case ANCHOR_BEGIN_POSITION:
5135 case ANCHOR_BEGIN_LINE:
5136 case ANCHOR_END_BUF:
5137 case ANCHOR_SEMI_END_BUF:
5138 case ANCHOR_END_LINE:
5139 case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */
5140 case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */
5141 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
5142 break;
5144 case ANCHOR_PREC_READ:
5146 NodeOptInfo nopt;
5148 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
5149 if (r == 0) {
5150 if (nopt.exb.len > 0)
5151 copy_opt_exact_info(&opt->expr, &nopt.exb);
5152 else if (nopt.exm.len > 0)
5153 copy_opt_exact_info(&opt->expr, &nopt.exm);
5155 opt->expr.reach_end = 0;
5157 if (nopt.map.value > 0)
5158 copy_opt_map_info(&opt->map, &nopt.map);
5161 break;
5163 case ANCHOR_LOOK_BEHIND_NOT:
5164 break;
5166 break;
5168 case NT_BREF:
5170 int i;
5171 int* backs;
5172 OnigDistance min, max, tmin, tmax;
5173 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
5174 BRefNode* br = NBREF(node);
5176 if (br->state & NST_RECURSION) {
5177 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5178 break;
5180 backs = BACKREFS_P(br);
5181 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
5182 if (r != 0) break;
5183 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
5184 if (r != 0) break;
5185 for (i = 1; i < br->back_num; i++) {
5186 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
5187 if (r != 0) break;
5188 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
5189 if (r != 0) break;
5190 if (min > tmin) min = tmin;
5191 if (max < tmax) max = tmax;
5193 if (r == 0) set_mml(&opt->len, min, max);
5195 break;
5197 #ifdef USE_SUBEXP_CALL
5198 case NT_CALL:
5199 if (IS_CALL_RECURSION(NCALL(node)))
5200 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5201 else {
5202 OnigOptionType save = env->options;
5203 env->options = NENCLOSE(NCALL(node)->target)->option;
5204 r = optimize_node_left(NCALL(node)->target, opt, env);
5205 env->options = save;
5207 break;
5208 #endif
5210 case NT_QTFR:
5212 int i;
5213 OnigDistance min, max;
5214 NodeOptInfo nopt;
5215 QtfrNode* qn = NQTFR(node);
5217 r = optimize_node_left(qn->target, &nopt, env);
5218 if (r) break;
5220 if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) {
5221 if (env->mmd.max == 0 &&
5222 NTYPE(qn->target) == NT_CANY && qn->greedy) {
5223 if (IS_MULTILINE(env->options))
5224 /* implicit anchor: /.*a/ ==> /\A.*a/ */
5225 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
5226 else
5227 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
5230 else {
5231 if (qn->lower > 0) {
5232 copy_node_opt_info(opt, &nopt);
5233 if (nopt.exb.len > 0) {
5234 if (nopt.exb.reach_end) {
5235 for (i = 2; i <= qn->lower &&
5236 ! is_full_opt_exact_info(&opt->exb); i++) {
5237 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
5239 if (i < qn->lower) {
5240 opt->exb.reach_end = 0;
5245 if (qn->lower != qn->upper) {
5246 opt->exb.reach_end = 0;
5247 opt->exm.reach_end = 0;
5249 if (qn->lower > 1)
5250 opt->exm.reach_end = 0;
5254 min = distance_multiply(nopt.len.min, qn->lower);
5255 if (IS_REPEAT_INFINITE(qn->upper))
5256 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
5257 else
5258 max = distance_multiply(nopt.len.max, qn->upper);
5260 set_mml(&opt->len, min, max);
5262 break;
5264 case NT_ENCLOSE:
5266 EncloseNode* en = NENCLOSE(node);
5268 switch (en->type) {
5269 case ENCLOSE_OPTION:
5271 OnigOptionType save = env->options;
5273 env->options = en->option;
5274 r = optimize_node_left(en->target, opt, env);
5275 env->options = save;
5277 break;
5279 case ENCLOSE_MEMORY:
5280 #ifdef USE_SUBEXP_CALL
5281 en->opt_count++;
5282 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
5283 OnigDistance min, max;
5285 min = 0;
5286 max = ONIG_INFINITE_DISTANCE;
5287 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
5288 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
5289 set_mml(&opt->len, min, max);
5291 else
5292 #endif
5294 r = optimize_node_left(en->target, opt, env);
5296 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
5297 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
5298 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
5301 break;
5303 case ENCLOSE_STOP_BACKTRACK:
5304 case ENCLOSE_CONDITION:
5305 r = optimize_node_left(en->target, opt, env);
5306 break;
5308 case ENCLOSE_ABSENT:
5309 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5310 break;
5313 break;
5315 default:
5316 #ifdef ONIG_DEBUG
5317 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
5318 NTYPE(node));
5319 #endif
5320 r = ONIGERR_TYPE_BUG;
5321 break;
5324 return r;
5327 static int
5328 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
5330 int r;
5331 int allow_reverse;
5333 if (e->len == 0) return 0;
5335 reg->exact = (UChar* )xmalloc(e->len);
5336 CHECK_NULL_RETURN_MEMERR(reg->exact);
5337 xmemcpy(reg->exact, e->s, e->len);
5338 reg->exact_end = reg->exact + e->len;
5340 allow_reverse =
5341 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
5343 if (e->ignore_case > 0) {
5344 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5345 r = set_bm_skip(reg->exact, reg->exact_end, reg,
5346 reg->map, &(reg->int_map), 1);
5347 if (r == 0) {
5348 reg->optimize = (allow_reverse != 0
5349 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
5351 else {
5352 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5355 else {
5356 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5359 else {
5360 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5361 r = set_bm_skip(reg->exact, reg->exact_end, reg,
5362 reg->map, &(reg->int_map), 0);
5363 if (r == 0) {
5364 reg->optimize = (allow_reverse != 0
5365 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
5367 else {
5368 reg->optimize = ONIG_OPTIMIZE_EXACT;
5371 else {
5372 reg->optimize = ONIG_OPTIMIZE_EXACT;
5376 reg->dmin = e->mmd.min;
5377 reg->dmax = e->mmd.max;
5379 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5380 reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact));
5383 return 0;
5386 static void
5387 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
5389 int i;
5391 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5392 reg->map[i] = m->map[i];
5394 reg->optimize = ONIG_OPTIMIZE_MAP;
5395 reg->dmin = m->mmd.min;
5396 reg->dmax = m->mmd.max;
5398 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5399 reg->threshold_len = (int )(reg->dmin + 1);
5403 static void
5404 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
5406 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
5407 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
5410 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5411 static void print_optimize_info(FILE* f, regex_t* reg);
5412 #endif
5414 static int
5415 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5418 int r;
5419 NodeOptInfo opt;
5420 OptEnv env;
5422 env.enc = reg->enc;
5423 env.options = reg->options;
5424 env.case_fold_flag = reg->case_fold_flag;
5425 env.scan_env = scan_env;
5426 clear_mml(&env.mmd);
5428 r = optimize_node_left(node, &opt, &env);
5429 if (r) return r;
5431 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5432 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
5433 ANCHOR_LOOK_BEHIND);
5435 if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
5436 reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
5438 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
5439 ANCHOR_PREC_READ_NOT);
5441 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5442 reg->anchor_dmin = opt.len.min;
5443 reg->anchor_dmax = opt.len.max;
5446 if (opt.exb.len > 0 || opt.exm.len > 0) {
5447 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5448 if (opt.map.value > 0 &&
5449 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5450 goto set_map;
5452 else {
5453 r = set_optimize_exact_info(reg, &opt.exb);
5454 set_sub_anchor(reg, &opt.exb.anc);
5457 else if (opt.map.value > 0) {
5458 set_map:
5459 set_optimize_map_info(reg, &opt.map);
5460 set_sub_anchor(reg, &opt.map.anc);
5462 else {
5463 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5464 if (opt.len.max == 0)
5465 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5468 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5469 print_optimize_info(stderr, reg);
5470 #endif
5471 return r;
5474 static void
5475 clear_optimize_info(regex_t* reg)
5477 reg->optimize = ONIG_OPTIMIZE_NONE;
5478 reg->anchor = 0;
5479 reg->anchor_dmin = 0;
5480 reg->anchor_dmax = 0;
5481 reg->sub_anchor = 0;
5482 reg->exact_end = (UChar* )NULL;
5483 reg->threshold_len = 0;
5484 if (IS_NOT_NULL(reg->exact)) {
5485 xfree(reg->exact);
5486 reg->exact = (UChar* )NULL;
5490 #ifdef ONIG_DEBUG
5492 static void print_enc_string(FILE* fp, OnigEncoding enc,
5493 const UChar *s, const UChar *end)
5495 fprintf(fp, "\nPATTERN: /");
5497 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5498 const UChar *p;
5499 OnigCodePoint code;
5501 p = s;
5502 while (p < end) {
5503 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5504 if (code >= 0x80) {
5505 fprintf(fp, " 0x%04x ", (int )code);
5507 else {
5508 fputc((int )code, fp);
5511 p += enclen(enc, p, end);
5514 else {
5515 while (s < end) {
5516 fputc((int )*s, fp);
5517 s++;
5521 fprintf(fp, "/ (%s)\n", enc->name);
5523 #endif /* ONIG_DEBUG */
5525 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5526 static void
5527 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5529 if (a == ONIG_INFINITE_DISTANCE)
5530 fputs("inf", f);
5531 else
5532 fprintf(f, "(%"PRIuPTR")", a);
5534 fputs("-", f);
5536 if (b == ONIG_INFINITE_DISTANCE)
5537 fputs("inf", f);
5538 else
5539 fprintf(f, "(%"PRIuPTR")", b);
5542 static void
5543 print_anchor(FILE* f, int anchor)
5545 int q = 0;
5547 fprintf(f, "[");
5549 if (anchor & ANCHOR_BEGIN_BUF) {
5550 fprintf(f, "begin-buf");
5551 q = 1;
5553 if (anchor & ANCHOR_BEGIN_LINE) {
5554 if (q) fprintf(f, ", ");
5555 q = 1;
5556 fprintf(f, "begin-line");
5558 if (anchor & ANCHOR_BEGIN_POSITION) {
5559 if (q) fprintf(f, ", ");
5560 q = 1;
5561 fprintf(f, "begin-pos");
5563 if (anchor & ANCHOR_END_BUF) {
5564 if (q) fprintf(f, ", ");
5565 q = 1;
5566 fprintf(f, "end-buf");
5568 if (anchor & ANCHOR_SEMI_END_BUF) {
5569 if (q) fprintf(f, ", ");
5570 q = 1;
5571 fprintf(f, "semi-end-buf");
5573 if (anchor & ANCHOR_END_LINE) {
5574 if (q) fprintf(f, ", ");
5575 q = 1;
5576 fprintf(f, "end-line");
5578 if (anchor & ANCHOR_ANYCHAR_STAR) {
5579 if (q) fprintf(f, ", ");
5580 q = 1;
5581 fprintf(f, "anychar-star");
5583 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5584 if (q) fprintf(f, ", ");
5585 fprintf(f, "anychar-star-ml");
5588 fprintf(f, "]");
5591 static void
5592 print_optimize_info(FILE* f, regex_t* reg)
5594 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5595 "EXACT_IC", "MAP",
5596 "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" };
5598 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5599 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5600 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5601 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5602 fprintf(f, "\n");
5604 if (reg->optimize) {
5605 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5606 fprintf(f, "\n");
5608 fprintf(f, "\n");
5610 if (reg->exact) {
5611 UChar *p;
5612 fprintf(f, "exact: [");
5613 for (p = reg->exact; p < reg->exact_end; p++) {
5614 fputc(*p, f);
5616 fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact));
5618 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5619 int c, i, n = 0;
5621 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5622 if (reg->map[i]) n++;
5624 fprintf(f, "map: n=%d\n", n);
5625 if (n > 0) {
5626 c = 0;
5627 fputc('[', f);
5628 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5629 if (reg->map[i] != 0) {
5630 if (c > 0) fputs(", ", f);
5631 c++;
5632 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5633 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5634 fputc(i, f);
5635 else
5636 fprintf(f, "%d", i);
5639 fprintf(f, "]\n");
5643 #endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */
5646 extern void
5647 onig_free_body(regex_t* reg)
5649 if (IS_NOT_NULL(reg)) {
5650 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5651 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5652 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5653 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5654 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5655 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5657 #ifdef USE_NAMED_GROUP
5658 onig_names_free(reg);
5659 #endif
5663 extern void
5664 onig_free(regex_t* reg)
5666 if (IS_NOT_NULL(reg)) {
5667 onig_free_body(reg);
5668 xfree(reg);
5672 #ifdef RUBY
5673 size_t
5674 onig_memsize(const regex_t *reg)
5676 size_t size = sizeof(regex_t);
5677 if (IS_NULL(reg)) return 0;
5678 if (IS_NOT_NULL(reg->p)) size += reg->alloc;
5679 if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
5680 if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5681 if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5682 if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
5683 if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
5685 return size;
5688 size_t
5689 onig_region_memsize(const OnigRegion *regs)
5691 size_t size = sizeof(*regs);
5692 if (IS_NULL(regs)) return 0;
5693 size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end));
5694 return size;
5696 #endif
5698 #define REGEX_TRANSFER(to,from) do {\
5699 onig_free_body(to);\
5700 xmemcpy(to, from, sizeof(regex_t));\
5701 xfree(from);\
5702 } while (0)
5704 #if 0
5705 extern void
5706 onig_transfer(regex_t* to, regex_t* from)
5708 REGEX_TRANSFER(to, from);
5710 #endif
5712 #ifdef ONIG_DEBUG_COMPILE
5713 static void print_compiled_byte_code_list(FILE* f, regex_t* reg);
5714 #endif
5715 #ifdef ONIG_DEBUG_PARSE_TREE
5716 static void print_tree(FILE* f, Node* node);
5717 #endif
5719 #ifdef RUBY
5720 extern int
5721 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5722 OnigErrorInfo* einfo)
5724 return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0);
5726 #endif
5728 #ifdef RUBY
5729 extern int
5730 onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5731 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
5732 #else
5733 extern int
5734 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5735 OnigErrorInfo* einfo)
5736 #endif
5738 #define COMPILE_INIT_SIZE 20
5740 int r;
5741 OnigDistance init_size;
5742 Node* root;
5743 ScanEnv scan_env = {0};
5744 #ifdef USE_SUBEXP_CALL
5745 UnsetAddrList uslist;
5746 #endif
5748 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5750 #ifdef RUBY
5751 scan_env.sourcefile = sourcefile;
5752 scan_env.sourceline = sourceline;
5753 #endif
5755 #ifdef ONIG_DEBUG
5756 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5757 #endif
5759 if (reg->alloc == 0) {
5760 init_size = (pattern_end - pattern) * 2;
5761 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5762 r = BBUF_INIT(reg, init_size);
5763 if (r != 0) goto end;
5765 else
5766 reg->used = 0;
5768 reg->num_mem = 0;
5769 reg->num_repeat = 0;
5770 reg->num_null_check = 0;
5771 reg->repeat_range_alloc = 0;
5772 reg->repeat_range = (OnigRepeatRange* )NULL;
5773 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5774 reg->num_comb_exp_check = 0;
5775 #endif
5777 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5778 if (r != 0) goto err;
5780 #ifdef ONIG_DEBUG_PARSE_TREE
5781 # if 0
5782 fprintf(stderr, "ORIGINAL PARSE TREE:\n");
5783 print_tree(stderr, root);
5784 # endif
5785 #endif
5787 #ifdef USE_NAMED_GROUP
5788 /* mixed use named group and no-named group */
5789 if (scan_env.num_named > 0 &&
5790 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5791 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5792 if (scan_env.num_named != scan_env.num_mem)
5793 r = disable_noname_group_capture(&root, reg, &scan_env);
5794 else
5795 r = numbered_ref_check(root);
5797 if (r != 0) goto err;
5799 #endif
5801 #ifdef USE_SUBEXP_CALL
5802 if (scan_env.num_call > 0) {
5803 r = unset_addr_list_init(&uslist, scan_env.num_call);
5804 if (r != 0) goto err;
5805 scan_env.unset_addr_list = &uslist;
5806 r = setup_subexp_call(root, &scan_env);
5807 if (r != 0) goto err_unset;
5808 r = subexp_recursive_check_trav(root, &scan_env);
5809 if (r < 0) goto err_unset;
5810 r = subexp_inf_recursive_check_trav(root, &scan_env);
5811 if (r != 0) goto err_unset;
5813 reg->num_call = scan_env.num_call;
5815 else
5816 reg->num_call = 0;
5817 #endif
5819 r = setup_tree(root, reg, 0, &scan_env);
5820 if (r != 0) goto err_unset;
5822 #ifdef ONIG_DEBUG_PARSE_TREE
5823 print_tree(stderr, root);
5824 #endif
5826 reg->capture_history = scan_env.capture_history;
5827 reg->bt_mem_start = scan_env.bt_mem_start;
5828 reg->bt_mem_start |= reg->capture_history;
5829 if (IS_FIND_CONDITION(reg->options))
5830 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5831 else {
5832 reg->bt_mem_end = scan_env.bt_mem_end;
5833 reg->bt_mem_end |= reg->capture_history;
5836 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5837 if (scan_env.backrefed_mem == 0
5838 # ifdef USE_SUBEXP_CALL
5839 || scan_env.num_call == 0
5840 # endif
5842 setup_comb_exp_check(root, 0, &scan_env);
5843 # ifdef USE_SUBEXP_CALL
5844 if (scan_env.has_recursion != 0) {
5845 scan_env.num_comb_exp_check = 0;
5847 else
5848 # endif
5849 if (scan_env.comb_exp_max_regnum > 0) {
5850 int i;
5851 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5852 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5853 scan_env.num_comb_exp_check = 0;
5854 break;
5860 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5861 #endif
5863 clear_optimize_info(reg);
5864 #ifndef ONIG_DONT_OPTIMIZE
5865 r = set_optimize_info_from_tree(root, reg, &scan_env);
5866 if (r != 0) goto err_unset;
5867 #endif
5869 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5870 xfree(scan_env.mem_nodes_dynamic);
5871 scan_env.mem_nodes_dynamic = (Node** )NULL;
5874 r = compile_tree(root, reg);
5875 if (r == 0) {
5876 r = add_opcode(reg, OP_END);
5877 #ifdef USE_SUBEXP_CALL
5878 if (scan_env.num_call > 0) {
5879 r = unset_addr_list_fix(&uslist, reg);
5880 unset_addr_list_end(&uslist);
5881 if (r) goto err;
5883 #endif
5885 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5886 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5887 else {
5888 if (reg->bt_mem_start != 0)
5889 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5890 else
5891 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5894 #ifdef USE_SUBEXP_CALL
5895 else if (scan_env.num_call > 0) {
5896 unset_addr_list_end(&uslist);
5898 #endif
5899 onig_node_free(root);
5901 #ifdef ONIG_DEBUG_COMPILE
5902 # ifdef USE_NAMED_GROUP
5903 onig_print_names(stderr, reg);
5904 # endif
5905 print_compiled_byte_code_list(stderr, reg);
5906 #endif
5908 end:
5909 onig_reg_resize(reg);
5910 return r;
5912 err_unset:
5913 #ifdef USE_SUBEXP_CALL
5914 if (scan_env.num_call > 0) {
5915 unset_addr_list_end(&uslist);
5917 #endif
5918 err:
5919 if (IS_NOT_NULL(scan_env.error)) {
5920 if (IS_NOT_NULL(einfo)) {
5921 einfo->enc = scan_env.enc;
5922 einfo->par = scan_env.error;
5923 einfo->par_end = scan_env.error_end;
5927 onig_node_free(root);
5928 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5929 xfree(scan_env.mem_nodes_dynamic);
5930 return r;
5933 static int onig_inited = 0;
5935 extern int
5936 onig_reg_init(regex_t* reg, OnigOptionType option,
5937 OnigCaseFoldType case_fold_flag,
5938 OnigEncoding enc, const OnigSyntaxType* syntax)
5940 if (! onig_inited)
5941 onig_init();
5943 if (IS_NULL(reg))
5944 return ONIGERR_INVALID_ARGUMENT;
5946 if (ONIGENC_IS_UNDEF(enc))
5947 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;
5949 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5950 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5951 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5954 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5955 option |= syntax->options;
5956 option &= ~ONIG_OPTION_SINGLELINE;
5958 else
5959 option |= syntax->options;
5961 (reg)->enc = enc;
5962 (reg)->options = option;
5963 (reg)->syntax = syntax;
5964 (reg)->optimize = 0;
5965 (reg)->exact = (UChar* )NULL;
5966 (reg)->int_map = (int* )NULL;
5967 (reg)->int_map_backward = (int* )NULL;
5968 (reg)->chain = (regex_t* )NULL;
5970 (reg)->p = (UChar* )NULL;
5971 (reg)->alloc = 0;
5972 (reg)->used = 0;
5973 (reg)->name_table = (void* )NULL;
5975 (reg)->case_fold_flag = case_fold_flag;
5976 return 0;
5979 extern int
5980 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5981 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5982 const OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5984 int r;
5986 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5987 if (r) return r;
5989 r = onig_compile(reg, pattern, pattern_end, einfo);
5990 return r;
5993 extern int
5994 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5995 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
5996 OnigErrorInfo* einfo)
5998 int r;
6000 *reg = (regex_t* )xmalloc(sizeof(regex_t));
6001 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
6003 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6004 if (r) goto err;
6006 r = onig_compile(*reg, pattern, pattern_end, einfo);
6007 if (r) {
6008 err:
6009 onig_free(*reg);
6010 *reg = NULL;
6012 return r;
6015 extern int
6016 onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED)
6018 return onig_init();
6021 extern int
6022 onig_init(void)
6024 if (onig_inited != 0)
6025 return 0;
6027 onig_inited = 1;
6029 #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6030 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
6031 #endif
6033 onigenc_init();
6034 /* onigenc_set_default_caseconv_table((UChar* )0); */
6036 #ifdef ONIG_DEBUG_STATISTICS
6037 onig_statistics_init();
6038 #endif
6040 return 0;
6044 static OnigEndCallListItemType* EndCallTop;
6046 extern void onig_add_end_call(void (*func)(void))
6048 OnigEndCallListItemType* item;
6050 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
6051 if (item == 0) return ;
6053 item->next = EndCallTop;
6054 item->func = func;
6056 EndCallTop = item;
6059 static void
6060 exec_end_call_list(void)
6062 OnigEndCallListItemType* prev;
6063 void (*func)(void);
6065 while (EndCallTop != 0) {
6066 func = EndCallTop->func;
6067 (*func)();
6069 prev = EndCallTop;
6070 EndCallTop = EndCallTop->next;
6071 xfree(prev);
6075 extern int
6076 onig_end(void)
6078 exec_end_call_list();
6080 #ifdef ONIG_DEBUG_STATISTICS
6081 onig_print_statistics(stderr);
6082 #endif
6084 #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6085 _CrtDumpMemoryLeaks();
6086 #endif
6088 onig_inited = 0;
6090 return 0;
6093 extern int
6094 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
6096 OnigCodePoint n, *data;
6097 OnigCodePoint low, high, x;
6099 GET_CODE_POINT(n, p);
6100 data = (OnigCodePoint* )p;
6101 data++;
6103 for (low = 0, high = n; low < high; ) {
6104 x = (low + high) >> 1;
6105 if (code > data[x * 2 + 1])
6106 low = x + 1;
6107 else
6108 high = x;
6111 return ((low < n && code >= data[low * 2]) ? 1 : 0);
6114 extern int
6115 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
6117 int found;
6119 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
6120 if (IS_NULL(cc->mbuf)) {
6121 found = 0;
6123 else {
6124 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
6127 else {
6128 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
6131 if (IS_NCCLASS_NOT(cc))
6132 return !found;
6133 else
6134 return found;
6137 extern int
6138 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
6140 int len;
6142 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6143 len = 2;
6145 else {
6146 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
6148 return onig_is_code_in_cc_len(len, code, cc);
6152 #ifdef ONIG_DEBUG
6154 /* arguments type */
6155 # define ARG_SPECIAL -1
6156 # define ARG_NON 0
6157 # define ARG_RELADDR 1
6158 # define ARG_ABSADDR 2
6159 # define ARG_LENGTH 3
6160 # define ARG_MEMNUM 4
6161 # define ARG_OPTION 5
6162 # define ARG_STATE_CHECK 6
6164 OnigOpInfoType OnigOpInfo[] = {
6165 { OP_FINISH, "finish", ARG_NON },
6166 { OP_END, "end", ARG_NON },
6167 { OP_EXACT1, "exact1", ARG_SPECIAL },
6168 { OP_EXACT2, "exact2", ARG_SPECIAL },
6169 { OP_EXACT3, "exact3", ARG_SPECIAL },
6170 { OP_EXACT4, "exact4", ARG_SPECIAL },
6171 { OP_EXACT5, "exact5", ARG_SPECIAL },
6172 { OP_EXACTN, "exactn", ARG_SPECIAL },
6173 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
6174 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
6175 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
6176 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
6177 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
6178 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
6179 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
6180 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
6181 { OP_CCLASS, "cclass", ARG_SPECIAL },
6182 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
6183 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
6184 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
6185 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
6186 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
6187 { OP_ANYCHAR, "anychar", ARG_NON },
6188 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
6189 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
6190 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
6191 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
6192 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
6193 { OP_WORD, "word", ARG_NON },
6194 { OP_NOT_WORD, "not-word", ARG_NON },
6195 { OP_WORD_BOUND, "word-bound", ARG_NON },
6196 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
6197 { OP_WORD_BEGIN, "word-begin", ARG_NON },
6198 { OP_WORD_END, "word-end", ARG_NON },
6199 { OP_ASCII_WORD, "ascii-word", ARG_NON },
6200 { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON },
6201 { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON },
6202 { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON },
6203 { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON },
6204 { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON },
6205 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
6206 { OP_END_BUF, "end-buf", ARG_NON },
6207 { OP_BEGIN_LINE, "begin-line", ARG_NON },
6208 { OP_END_LINE, "end-line", ARG_NON },
6209 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
6210 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
6211 { OP_BACKREF1, "backref1", ARG_NON },
6212 { OP_BACKREF2, "backref2", ARG_NON },
6213 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
6214 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
6215 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
6216 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
6217 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
6218 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
6219 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
6220 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
6221 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
6222 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
6223 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
6224 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
6225 { OP_SET_OPTION, "set-option", ARG_OPTION },
6226 { OP_KEEP, "keep", ARG_NON },
6227 { OP_FAIL, "fail", ARG_NON },
6228 { OP_JUMP, "jump", ARG_RELADDR },
6229 { OP_PUSH, "push", ARG_RELADDR },
6230 { OP_POP, "pop", ARG_NON },
6231 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
6232 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
6233 { OP_REPEAT, "repeat", ARG_SPECIAL },
6234 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
6235 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
6236 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
6237 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
6238 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
6239 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
6240 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
6241 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
6242 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
6243 { OP_PUSH_POS, "push-pos", ARG_NON },
6244 { OP_POP_POS, "pop-pos", ARG_NON },
6245 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
6246 { OP_FAIL_POS, "fail-pos", ARG_NON },
6247 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
6248 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
6249 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
6250 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
6251 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
6252 { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON },
6253 { OP_ABSENT, "absent", ARG_RELADDR },
6254 { OP_ABSENT_END, "absent-end", ARG_NON },
6255 { OP_CALL, "call", ARG_ABSADDR },
6256 { OP_RETURN, "return", ARG_NON },
6257 { OP_CONDITION, "condition", ARG_SPECIAL },
6258 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
6259 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
6260 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
6261 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
6262 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
6263 "state-check-anychar-ml*", ARG_STATE_CHECK },
6264 { -1, "", ARG_NON }
6267 static const char*
6268 op2name(int opcode)
6270 int i;
6272 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6273 if (opcode == OnigOpInfo[i].opcode)
6274 return OnigOpInfo[i].name;
6276 return "";
6279 static int
6280 op2arg_type(int opcode)
6282 int i;
6284 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6285 if (opcode == OnigOpInfo[i].opcode)
6286 return OnigOpInfo[i].arg_type;
6288 return ARG_SPECIAL;
6291 # ifdef ONIG_DEBUG_PARSE_TREE
6292 static void
6293 Indent(FILE* f, int indent)
6295 int i;
6296 for (i = 0; i < indent; i++) putc(' ', f);
6298 # endif /* ONIG_DEBUG_PARSE_TREE */
6300 static void
6301 p_string(FILE* f, ptrdiff_t len, UChar* s)
6303 fputs(":", f);
6304 while (len-- > 0) { fputc(*s++, f); }
6307 static void
6308 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
6310 int x = len * mb_len;
6312 fprintf(f, ":%d:", len);
6313 while (x-- > 0) { fputc(*s++, f); }
6316 extern void
6317 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
6318 OnigEncoding enc)
6320 int i, n, arg_type;
6321 RelAddrType addr;
6322 LengthType len;
6323 MemNumType mem;
6324 StateCheckNumType scn;
6325 OnigCodePoint code;
6326 UChar *q;
6328 fprintf(f, "[%s", op2name(*bp));
6329 arg_type = op2arg_type(*bp);
6330 if (arg_type != ARG_SPECIAL) {
6331 bp++;
6332 switch (arg_type) {
6333 case ARG_NON:
6334 break;
6335 case ARG_RELADDR:
6336 GET_RELADDR_INC(addr, bp);
6337 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6338 break;
6339 case ARG_ABSADDR:
6340 GET_ABSADDR_INC(addr, bp);
6341 fprintf(f, ":(%d)", addr);
6342 break;
6343 case ARG_LENGTH:
6344 GET_LENGTH_INC(len, bp);
6345 fprintf(f, ":%d", len);
6346 break;
6347 case ARG_MEMNUM:
6348 mem = *((MemNumType* )bp);
6349 bp += SIZE_MEMNUM;
6350 fprintf(f, ":%d", mem);
6351 break;
6352 case ARG_OPTION:
6354 OnigOptionType option = *((OnigOptionType* )bp);
6355 bp += SIZE_OPTION;
6356 fprintf(f, ":%d", option);
6358 break;
6360 case ARG_STATE_CHECK:
6361 scn = *((StateCheckNumType* )bp);
6362 bp += SIZE_STATE_CHECK_NUM;
6363 fprintf(f, ":%d", scn);
6364 break;
6367 else {
6368 switch (*bp++) {
6369 case OP_EXACT1:
6370 case OP_ANYCHAR_STAR_PEEK_NEXT:
6371 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
6372 p_string(f, 1, bp++); break;
6373 case OP_EXACT2:
6374 p_string(f, 2, bp); bp += 2; break;
6375 case OP_EXACT3:
6376 p_string(f, 3, bp); bp += 3; break;
6377 case OP_EXACT4:
6378 p_string(f, 4, bp); bp += 4; break;
6379 case OP_EXACT5:
6380 p_string(f, 5, bp); bp += 5; break;
6381 case OP_EXACTN:
6382 GET_LENGTH_INC(len, bp);
6383 p_len_string(f, len, 1, bp);
6384 bp += len;
6385 break;
6387 case OP_EXACTMB2N1:
6388 p_string(f, 2, bp); bp += 2; break;
6389 case OP_EXACTMB2N2:
6390 p_string(f, 4, bp); bp += 4; break;
6391 case OP_EXACTMB2N3:
6392 p_string(f, 6, bp); bp += 6; break;
6393 case OP_EXACTMB2N:
6394 GET_LENGTH_INC(len, bp);
6395 p_len_string(f, len, 2, bp);
6396 bp += len * 2;
6397 break;
6398 case OP_EXACTMB3N:
6399 GET_LENGTH_INC(len, bp);
6400 p_len_string(f, len, 3, bp);
6401 bp += len * 3;
6402 break;
6403 case OP_EXACTMBN:
6405 int mb_len;
6407 GET_LENGTH_INC(mb_len, bp);
6408 GET_LENGTH_INC(len, bp);
6409 fprintf(f, ":%d:%d:", mb_len, len);
6410 n = len * mb_len;
6411 while (n-- > 0) { fputc(*bp++, f); }
6413 break;
6415 case OP_EXACT1_IC:
6416 len = enclen(enc, bp, bpend);
6417 p_string(f, len, bp);
6418 bp += len;
6419 break;
6420 case OP_EXACTN_IC:
6421 GET_LENGTH_INC(len, bp);
6422 p_len_string(f, len, 1, bp);
6423 bp += len;
6424 break;
6426 case OP_CCLASS:
6427 n = bitset_on_num((BitSetRef )bp);
6428 bp += SIZE_BITSET;
6429 fprintf(f, ":%d", n);
6430 break;
6432 case OP_CCLASS_NOT:
6433 n = bitset_on_num((BitSetRef )bp);
6434 bp += SIZE_BITSET;
6435 fprintf(f, ":%d", n);
6436 break;
6438 case OP_CCLASS_MB:
6439 case OP_CCLASS_MB_NOT:
6440 GET_LENGTH_INC(len, bp);
6441 q = bp;
6442 # ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6443 ALIGNMENT_RIGHT(q);
6444 # endif
6445 GET_CODE_POINT(code, q);
6446 bp += len;
6447 fprintf(f, ":%d:%d", (int )code, len);
6448 break;
6450 case OP_CCLASS_MIX:
6451 case OP_CCLASS_MIX_NOT:
6452 n = bitset_on_num((BitSetRef )bp);
6453 bp += SIZE_BITSET;
6454 GET_LENGTH_INC(len, bp);
6455 q = bp;
6456 # ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6457 ALIGNMENT_RIGHT(q);
6458 # endif
6459 GET_CODE_POINT(code, q);
6460 bp += len;
6461 fprintf(f, ":%d:%d:%d", n, (int )code, len);
6462 break;
6464 case OP_BACKREFN_IC:
6465 mem = *((MemNumType* )bp);
6466 bp += SIZE_MEMNUM;
6467 fprintf(f, ":%d", mem);
6468 break;
6470 case OP_BACKREF_MULTI_IC:
6471 case OP_BACKREF_MULTI:
6472 fputs(" ", f);
6473 GET_LENGTH_INC(len, bp);
6474 for (i = 0; i < len; i++) {
6475 GET_MEMNUM_INC(mem, bp);
6476 if (i > 0) fputs(", ", f);
6477 fprintf(f, "%d", mem);
6479 break;
6481 case OP_BACKREF_WITH_LEVEL:
6483 OnigOptionType option;
6484 LengthType level;
6486 GET_OPTION_INC(option, bp);
6487 fprintf(f, ":%d", option);
6488 GET_LENGTH_INC(level, bp);
6489 fprintf(f, ":%d", level);
6491 fputs(" ", f);
6492 GET_LENGTH_INC(len, bp);
6493 for (i = 0; i < len; i++) {
6494 GET_MEMNUM_INC(mem, bp);
6495 if (i > 0) fputs(", ", f);
6496 fprintf(f, "%d", mem);
6499 break;
6501 case OP_REPEAT:
6502 case OP_REPEAT_NG:
6504 mem = *((MemNumType* )bp);
6505 bp += SIZE_MEMNUM;
6506 addr = *((RelAddrType* )bp);
6507 bp += SIZE_RELADDR;
6508 fprintf(f, ":%d:%d", mem, addr);
6510 break;
6512 case OP_PUSH_OR_JUMP_EXACT1:
6513 case OP_PUSH_IF_PEEK_NEXT:
6514 addr = *((RelAddrType* )bp);
6515 bp += SIZE_RELADDR;
6516 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6517 p_string(f, 1, bp);
6518 bp += 1;
6519 break;
6521 case OP_LOOK_BEHIND:
6522 GET_LENGTH_INC(len, bp);
6523 fprintf(f, ":%d", len);
6524 break;
6526 case OP_PUSH_LOOK_BEHIND_NOT:
6527 GET_RELADDR_INC(addr, bp);
6528 GET_LENGTH_INC(len, bp);
6529 fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr);
6530 break;
6532 case OP_STATE_CHECK_PUSH:
6533 case OP_STATE_CHECK_PUSH_OR_JUMP:
6534 scn = *((StateCheckNumType* )bp);
6535 bp += SIZE_STATE_CHECK_NUM;
6536 addr = *((RelAddrType* )bp);
6537 bp += SIZE_RELADDR;
6538 fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr);
6539 break;
6541 case OP_CONDITION:
6542 GET_MEMNUM_INC(mem, bp);
6543 GET_RELADDR_INC(addr, bp);
6544 fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr);
6545 break;
6547 default:
6548 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6549 bp[-1]);
6552 fputs("]", f);
6553 if (nextp) *nextp = bp;
6556 # ifdef ONIG_DEBUG_COMPILE
6557 static void
6558 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6560 int ncode;
6561 UChar* bp = reg->p;
6562 UChar* end = reg->p + reg->used;
6564 fprintf(f, "code length: %d", reg->used);
6566 ncode = -1;
6567 while (bp < end) {
6568 ncode++;
6569 if (ncode % 5 == 0)
6570 fprintf(f, "\n%ld:", bp - reg->p);
6571 else
6572 fprintf(f, " %ld:", bp - reg->p);
6573 onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
6576 fprintf(f, "\n");
6578 # endif /* ONIG_DEBUG_COMPILE */
6580 # ifdef ONIG_DEBUG_PARSE_TREE
6581 static void
6582 print_indent_tree(FILE* f, Node* node, int indent)
6584 int i, type, container_p = 0;
6585 int add = 3;
6586 UChar* p;
6588 Indent(f, indent);
6589 if (IS_NULL(node)) {
6590 fprintf(f, "ERROR: null node!!!\n");
6591 exit (0);
6594 type = NTYPE(node);
6595 switch (type) {
6596 case NT_LIST:
6597 case NT_ALT:
6598 if (NTYPE(node) == NT_LIST)
6599 fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node);
6600 else
6601 fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node);
6603 print_indent_tree(f, NCAR(node), indent + add);
6604 while (IS_NOT_NULL(node = NCDR(node))) {
6605 if (NTYPE(node) != type) {
6606 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6607 exit(0);
6609 print_indent_tree(f, NCAR(node), indent + add);
6611 break;
6613 case NT_STR:
6614 fprintf(f, "<string%s:%"PRIxPTR">",
6615 (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node);
6616 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6617 if (*p >= 0x20 && *p < 0x7f)
6618 fputc(*p, f);
6619 else {
6620 fprintf(f, " 0x%02x", *p);
6623 break;
6625 case NT_CCLASS:
6626 fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node);
6627 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f);
6628 if (NCCLASS(node)->mbuf) {
6629 BBuf* bbuf = NCCLASS(node)->mbuf;
6630 OnigCodePoint* data = (OnigCodePoint* )bbuf->p;
6631 OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used);
6632 fprintf(f, "%d", *data++);
6633 for (; data < end; data+=2) {
6634 fprintf(f, ",");
6635 fprintf(f, "%04x-%04x", data[0], data[1]);
6638 break;
6640 case NT_CTYPE:
6641 fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node);
6642 switch (NCTYPE(node)->ctype) {
6643 case ONIGENC_CTYPE_WORD:
6644 if (NCTYPE(node)->not != 0)
6645 fputs("not word", f);
6646 else
6647 fputs("word", f);
6648 break;
6650 default:
6651 fprintf(f, "ERROR: undefined ctype.\n");
6652 exit(0);
6654 break;
6656 case NT_CANY:
6657 fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node);
6658 break;
6660 case NT_ANCHOR:
6661 fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node);
6662 switch (NANCHOR(node)->type) {
6663 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6664 case ANCHOR_END_BUF: fputs("end buf", f); break;
6665 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6666 case ANCHOR_END_LINE: fputs("end line", f); break;
6667 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6668 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6670 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6671 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6672 # ifdef USE_WORD_BEGIN_END
6673 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6674 case ANCHOR_WORD_END: fputs("word end", f); break;
6675 # endif
6676 case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break;
6677 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break;
6678 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break;
6679 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break;
6680 case ANCHOR_KEEP: fputs("keep",f); break;
6682 default:
6683 fprintf(f, "ERROR: undefined anchor type.\n");
6684 break;
6686 break;
6688 case NT_BREF:
6690 int* p;
6691 BRefNode* br = NBREF(node);
6692 p = BACKREFS_P(br);
6693 fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node);
6694 for (i = 0; i < br->back_num; i++) {
6695 if (i > 0) fputs(", ", f);
6696 fprintf(f, "%d", p[i]);
6699 break;
6701 # ifdef USE_SUBEXP_CALL
6702 case NT_CALL:
6704 CallNode* cn = NCALL(node);
6705 fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node);
6706 p_string(f, cn->name_end - cn->name, cn->name);
6708 break;
6709 # endif
6711 case NT_QTFR:
6712 fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node,
6713 NQTFR(node)->lower, NQTFR(node)->upper,
6714 (NQTFR(node)->greedy ? "" : "?"));
6715 print_indent_tree(f, NQTFR(node)->target, indent + add);
6716 break;
6718 case NT_ENCLOSE:
6719 fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node);
6720 switch (NENCLOSE(node)->type) {
6721 case ENCLOSE_OPTION:
6722 fprintf(f, "option:%d", NENCLOSE(node)->option);
6723 break;
6724 case ENCLOSE_MEMORY:
6725 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6726 break;
6727 case ENCLOSE_STOP_BACKTRACK:
6728 fprintf(f, "stop-bt");
6729 break;
6730 case ENCLOSE_CONDITION:
6731 fprintf(f, "condition:%d", NENCLOSE(node)->regnum);
6732 break;
6733 case ENCLOSE_ABSENT:
6734 fprintf(f, "absent");
6735 break;
6737 default:
6738 break;
6740 fprintf(f, "\n");
6741 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6742 break;
6744 default:
6745 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6746 break;
6749 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6750 type != NT_ENCLOSE)
6751 fprintf(f, "\n");
6753 if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add);
6755 fflush(f);
6758 static void
6759 print_tree(FILE* f, Node* node)
6761 print_indent_tree(f, node, 0);
6763 # endif /* ONIG_DEBUG_PARSE_TREE */
6764 #endif /* ONIG_DEBUG */