[rubygems/rubygems] Use a constant empty tar header to avoid extra allocations
[ruby.git] / regcomp.c
blob38bfed5631d643465b1dcc9ae4bc8ade9bd350b7
1 /**********************************************************************
2 regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
31 #include "regparse.h"
33 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
35 extern OnigCaseFoldType
36 onig_get_default_case_fold_flag(void)
38 return OnigDefaultCaseFoldFlag;
41 extern int
42 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
44 OnigDefaultCaseFoldFlag = case_fold_flag;
45 return 0;
49 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
50 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
51 #endif
53 #if 0
54 static UChar*
55 str_dup(UChar* s, UChar* end)
57 ptrdiff_t len = end - s;
59 if (len > 0) {
60 UChar* r = (UChar* )xmalloc(len + 1);
61 CHECK_NULL_RETURN(r);
62 xmemcpy(r, s, len);
63 r[len] = (UChar )0;
64 return r;
66 else return NULL;
68 #endif
70 static void
71 swap_node(Node* a, Node* b)
73 Node c;
74 c = *a; *a = *b; *b = c;
76 if (NTYPE(a) == NT_STR) {
77 StrNode* sn = NSTR(a);
78 if (sn->capa == 0) {
79 size_t len = sn->end - sn->s;
80 sn->s = sn->buf;
81 sn->end = sn->s + len;
85 if (NTYPE(b) == NT_STR) {
86 StrNode* sn = NSTR(b);
87 if (sn->capa == 0) {
88 size_t len = sn->end - sn->s;
89 sn->s = sn->buf;
90 sn->end = sn->s + len;
95 static OnigDistance
96 distance_add(OnigDistance d1, OnigDistance d2)
98 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
99 return ONIG_INFINITE_DISTANCE;
100 else {
101 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
102 else return ONIG_INFINITE_DISTANCE;
106 static OnigDistance
107 distance_multiply(OnigDistance d, int m)
109 if (m == 0) return 0;
111 if (d < ONIG_INFINITE_DISTANCE / m)
112 return d * m;
113 else
114 return ONIG_INFINITE_DISTANCE;
117 static int
118 bitset_is_empty(BitSetRef bs)
120 int i;
121 for (i = 0; i < BITSET_SIZE; i++) {
122 if (bs[i] != 0) return 0;
124 return 1;
127 #ifdef ONIG_DEBUG
128 static int
129 bitset_on_num(BitSetRef bs)
131 int i, n;
133 n = 0;
134 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
135 if (BITSET_AT(bs, i)) n++;
137 return n;
139 #endif
141 // Attempt to right size allocated buffers for a regex post compile
142 static void
143 onig_reg_resize(regex_t *reg)
145 do {
146 if (!reg->used) {
147 xfree(reg->p);
148 reg->alloc = 0;
149 reg->p = 0;
151 else if (reg->alloc > reg->used) {
152 unsigned char *new_ptr = xrealloc(reg->p, reg->used);
153 // Skip the right size optimization if memory allocation fails
154 if (new_ptr) {
155 reg->alloc = reg->used;
156 reg->p = new_ptr;
159 } while ((reg = reg->chain) != 0);
162 extern int
163 onig_bbuf_init(BBuf* buf, OnigDistance size)
165 if (size <= 0) {
166 size = 0;
167 buf->p = NULL;
169 else {
170 buf->p = (UChar* )xmalloc(size);
171 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
174 buf->alloc = (unsigned int )size;
175 buf->used = 0;
176 return 0;
180 #ifdef USE_SUBEXP_CALL
182 static int
183 unset_addr_list_init(UnsetAddrList* uslist, int size)
185 UnsetAddr* p;
187 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
188 CHECK_NULL_RETURN_MEMERR(p);
189 uslist->num = 0;
190 uslist->alloc = size;
191 uslist->us = p;
192 return 0;
195 static void
196 unset_addr_list_end(UnsetAddrList* uslist)
198 xfree(uslist->us);
201 static int
202 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
204 UnsetAddr* p;
205 int size;
207 if (uslist->num >= uslist->alloc) {
208 size = uslist->alloc * 2;
209 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
210 CHECK_NULL_RETURN_MEMERR(p);
211 uslist->alloc = size;
212 uslist->us = p;
215 uslist->us[uslist->num].offset = offset;
216 uslist->us[uslist->num].target = node;
217 uslist->num++;
218 return 0;
220 #endif /* USE_SUBEXP_CALL */
223 static int
224 add_opcode(regex_t* reg, int opcode)
226 BBUF_ADD1(reg, opcode);
227 return 0;
230 #ifdef USE_COMBINATION_EXPLOSION_CHECK
231 static int
232 add_state_check_num(regex_t* reg, int num)
234 StateCheckNumType n = (StateCheckNumType )num;
236 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
237 return 0;
239 #endif
241 static int
242 add_rel_addr(regex_t* reg, int addr)
244 RelAddrType ra = (RelAddrType )addr;
246 BBUF_ADD(reg, &ra, SIZE_RELADDR);
247 return 0;
250 static int
251 add_abs_addr(regex_t* reg, int addr)
253 AbsAddrType ra = (AbsAddrType )addr;
255 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
256 return 0;
259 static int
260 add_length(regex_t* reg, OnigDistance len)
262 LengthType l = (LengthType )len;
264 BBUF_ADD(reg, &l, SIZE_LENGTH);
265 return 0;
268 static int
269 add_mem_num(regex_t* reg, int num)
271 MemNumType n = (MemNumType )num;
273 BBUF_ADD(reg, &n, SIZE_MEMNUM);
274 return 0;
277 #if 0
278 static int
279 add_pointer(regex_t* reg, void* addr)
281 PointerType ptr = (PointerType )addr;
283 BBUF_ADD(reg, &ptr, SIZE_POINTER);
284 return 0;
286 #endif
288 static int
289 add_option(regex_t* reg, OnigOptionType option)
291 BBUF_ADD(reg, &option, SIZE_OPTION);
292 return 0;
295 static int
296 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
298 int r;
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_rel_addr(reg, addr);
303 return r;
306 static int
307 add_bytes(regex_t* reg, UChar* bytes, OnigDistance len)
309 BBUF_ADD(reg, bytes, len);
310 return 0;
313 static int
314 add_bitset(regex_t* reg, BitSetRef bs)
316 BBUF_ADD(reg, bs, SIZE_BITSET);
317 return 0;
320 static int
321 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
323 int r;
325 r = add_opcode(reg, opcode);
326 if (r) return r;
327 r = add_option(reg, option);
328 return r;
331 static int compile_length_tree(Node* node, regex_t* reg);
332 static int compile_tree(Node* node, regex_t* reg);
335 #define IS_NEED_STR_LEN_OP_EXACT(op) \
336 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
337 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
339 static int
340 select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case)
342 int op;
343 OnigDistance str_len = roomof(byte_len, mb_len);
345 if (ignore_case) {
346 switch (str_len) {
347 case 1: op = OP_EXACT1_IC; break;
348 default: op = OP_EXACTN_IC; break;
351 else {
352 switch (mb_len) {
353 case 1:
354 switch (str_len) {
355 case 1: op = OP_EXACT1; break;
356 case 2: op = OP_EXACT2; break;
357 case 3: op = OP_EXACT3; break;
358 case 4: op = OP_EXACT4; break;
359 case 5: op = OP_EXACT5; break;
360 default: op = OP_EXACTN; break;
362 break;
364 case 2:
365 switch (str_len) {
366 case 1: op = OP_EXACTMB2N1; break;
367 case 2: op = OP_EXACTMB2N2; break;
368 case 3: op = OP_EXACTMB2N3; break;
369 default: op = OP_EXACTMB2N; break;
371 break;
373 case 3:
374 op = OP_EXACTMB3N;
375 break;
377 default:
378 op = OP_EXACTMBN;
379 break;
382 return op;
385 static int
386 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
388 int r;
389 int saved_num_null_check = reg->num_null_check;
391 if (empty_info != 0) {
392 r = add_opcode(reg, OP_NULL_CHECK_START);
393 if (r) return r;
394 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
395 if (r) return r;
396 reg->num_null_check++;
399 r = compile_tree(node, reg);
400 if (r) return r;
402 if (empty_info != 0) {
403 if (empty_info == NQ_TARGET_IS_EMPTY)
404 r = add_opcode(reg, OP_NULL_CHECK_END);
405 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
406 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
407 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
408 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
410 if (r) return r;
411 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
413 return r;
416 #ifdef USE_SUBEXP_CALL
417 static int
418 compile_call(CallNode* node, regex_t* reg)
420 int r;
422 r = add_opcode(reg, OP_CALL);
423 if (r) return r;
424 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
425 node->target);
426 if (r) return r;
427 r = add_abs_addr(reg, 0 /*dummy addr.*/);
428 return r;
430 #endif
432 static int
433 compile_tree_n_times(Node* node, int n, regex_t* reg)
435 int i, r;
437 for (i = 0; i < n; i++) {
438 r = compile_tree(node, reg);
439 if (r) return r;
441 return 0;
444 static int
445 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len,
446 regex_t* reg ARG_UNUSED, int ignore_case)
448 int len;
449 int op = select_str_opcode(mb_len, byte_len, ignore_case);
451 len = SIZE_OPCODE;
453 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
454 if (IS_NEED_STR_LEN_OP_EXACT(op))
455 len += SIZE_LENGTH;
457 len += (int )byte_len;
458 return len;
461 static int
462 add_compile_string(UChar* s, int mb_len, OnigDistance byte_len,
463 regex_t* reg, int ignore_case)
465 int op = select_str_opcode(mb_len, byte_len, ignore_case);
466 add_opcode(reg, op);
468 if (op == OP_EXACTMBN)
469 add_length(reg, mb_len);
471 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
472 if (op == OP_EXACTN_IC)
473 add_length(reg, byte_len);
474 else
475 add_length(reg, byte_len / mb_len);
478 add_bytes(reg, s, byte_len);
479 return 0;
483 static int
484 compile_length_string_node(Node* node, regex_t* reg)
486 int rlen, r, len, prev_len, blen, ambig;
487 OnigEncoding enc = reg->enc;
488 UChar *p, *prev;
489 StrNode* sn;
491 sn = NSTR(node);
492 if (sn->end <= sn->s)
493 return 0;
495 ambig = NSTRING_IS_AMBIG(node);
497 p = prev = sn->s;
498 prev_len = enclen(enc, p, sn->end);
499 p += prev_len;
500 blen = prev_len;
501 rlen = 0;
503 for (; p < sn->end; ) {
504 len = enclen(enc, p, sn->end);
505 if (len == prev_len || ambig) {
506 blen += len;
508 else {
509 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
510 rlen += r;
511 prev = p;
512 blen = len;
513 prev_len = len;
515 p += len;
517 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
518 rlen += r;
519 return rlen;
522 static int
523 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
525 if (sn->end <= sn->s)
526 return 0;
528 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
531 static int
532 compile_string_node(Node* node, regex_t* reg)
534 int r, len, prev_len, blen, ambig;
535 OnigEncoding enc = reg->enc;
536 UChar *p, *prev, *end;
537 StrNode* sn;
539 sn = NSTR(node);
540 if (sn->end <= sn->s)
541 return 0;
543 end = sn->end;
544 ambig = NSTRING_IS_AMBIG(node);
546 p = prev = sn->s;
547 prev_len = enclen(enc, p, end);
548 p += prev_len;
549 blen = prev_len;
551 for (; p < end; ) {
552 len = enclen(enc, p, end);
553 if (len == prev_len || ambig) {
554 blen += len;
556 else {
557 r = add_compile_string(prev, prev_len, blen, reg, ambig);
558 if (r) return r;
560 prev = p;
561 blen = len;
562 prev_len = len;
565 p += len;
567 return add_compile_string(prev, prev_len, blen, reg, ambig);
570 static int
571 compile_string_raw_node(StrNode* sn, regex_t* reg)
573 if (sn->end <= sn->s)
574 return 0;
576 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
579 static int
580 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
582 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
583 add_length(reg, mbuf->used);
584 return add_bytes(reg, mbuf->p, mbuf->used);
585 #else
586 int r, pad_size;
587 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
589 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
590 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
591 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
593 r = add_bytes(reg, mbuf->p, mbuf->used);
595 /* padding for return value from compile_length_cclass_node() to be fix. */
596 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
597 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
598 return r;
599 #endif
602 static int
603 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
605 int len;
607 if (IS_NULL(cc->mbuf)) {
608 len = SIZE_OPCODE + SIZE_BITSET;
610 else {
611 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
612 len = SIZE_OPCODE;
614 else {
615 len = SIZE_OPCODE + SIZE_BITSET;
617 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
618 len += SIZE_LENGTH + cc->mbuf->used;
619 #else
620 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
621 #endif
624 return len;
627 static int
628 compile_cclass_node(CClassNode* cc, regex_t* reg)
630 int r;
632 if (IS_NULL(cc->mbuf)) {
633 if (IS_NCCLASS_NOT(cc))
634 add_opcode(reg, OP_CCLASS_NOT);
635 else
636 add_opcode(reg, OP_CCLASS);
638 r = add_bitset(reg, cc->bs);
640 else {
641 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
642 if (IS_NCCLASS_NOT(cc))
643 add_opcode(reg, OP_CCLASS_MB_NOT);
644 else
645 add_opcode(reg, OP_CCLASS_MB);
647 r = add_multi_byte_cclass(cc->mbuf, reg);
649 else {
650 if (IS_NCCLASS_NOT(cc))
651 add_opcode(reg, OP_CCLASS_MIX_NOT);
652 else
653 add_opcode(reg, OP_CCLASS_MIX);
655 r = add_bitset(reg, cc->bs);
656 if (r) return r;
657 r = add_multi_byte_cclass(cc->mbuf, reg);
661 return r;
664 static int
665 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
667 #define REPEAT_RANGE_ALLOC 4
669 OnigRepeatRange* p;
671 if (reg->repeat_range_alloc == 0) {
672 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
673 CHECK_NULL_RETURN_MEMERR(p);
674 reg->repeat_range = p;
675 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
677 else if (reg->repeat_range_alloc <= id) {
678 int n;
679 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
680 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
681 sizeof(OnigRepeatRange) * n);
682 CHECK_NULL_RETURN_MEMERR(p);
683 reg->repeat_range = p;
684 reg->repeat_range_alloc = n;
686 else {
687 p = reg->repeat_range;
690 p[id].lower = lower;
691 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
692 return 0;
695 static int
696 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
697 regex_t* reg)
699 int r;
700 int num_repeat = reg->num_repeat;
702 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
703 if (r) return r;
704 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
705 reg->num_repeat++;
706 if (r) return r;
707 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
708 if (r) return r;
710 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
711 if (r) return r;
713 r = compile_tree_empty_check(qn->target, reg, empty_info);
714 if (r) return r;
716 if (
717 #ifdef USE_SUBEXP_CALL
718 reg->num_call > 0 ||
719 #endif
720 IS_QUANTIFIER_IN_REPEAT(qn)) {
721 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
723 else {
724 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
726 if (r) return r;
727 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
728 return r;
731 static int
732 is_anychar_star_quantifier(QtfrNode* qn)
734 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
735 NTYPE(qn->target) == NT_CANY)
736 return 1;
737 else
738 return 0;
741 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
742 #define CKN_ON (ckn > 0)
744 #ifdef USE_COMBINATION_EXPLOSION_CHECK
746 static int
747 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
749 int len, mod_tlen, cklen;
750 int ckn;
751 int infinite = IS_REPEAT_INFINITE(qn->upper);
752 int empty_info = qn->target_empty_info;
753 int tlen = compile_length_tree(qn->target, reg);
755 if (tlen < 0) return tlen;
757 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
759 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
761 /* anychar repeat */
762 if (NTYPE(qn->target) == NT_CANY) {
763 if (qn->greedy && infinite) {
764 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
765 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
766 else
767 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
771 if (empty_info != 0)
772 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
773 else
774 mod_tlen = tlen;
776 if (infinite && qn->lower <= 1) {
777 if (qn->greedy) {
778 if (qn->lower == 1)
779 len = SIZE_OP_JUMP;
780 else
781 len = 0;
783 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
785 else {
786 if (qn->lower == 0)
787 len = SIZE_OP_JUMP;
788 else
789 len = 0;
791 len += mod_tlen + SIZE_OP_PUSH + cklen;
794 else if (qn->upper == 0) {
795 if (qn->is_referred != 0) /* /(?<n>..){0}/ */
796 len = SIZE_OP_JUMP + tlen;
797 else
798 len = 0;
800 else if (qn->upper == 1 && qn->greedy) {
801 if (qn->lower == 0) {
802 if (CKN_ON) {
803 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
805 else {
806 len = SIZE_OP_PUSH + tlen;
809 else {
810 len = tlen;
813 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
814 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
816 else {
817 len = SIZE_OP_REPEAT_INC
818 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
819 if (CKN_ON)
820 len += SIZE_OP_STATE_CHECK;
823 return len;
826 static int
827 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
829 int r, mod_tlen;
830 int ckn;
831 int infinite = IS_REPEAT_INFINITE(qn->upper);
832 int empty_info = qn->target_empty_info;
833 int tlen = compile_length_tree(qn->target, reg);
835 if (tlen < 0) return tlen;
837 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
839 if (is_anychar_star_quantifier(qn)) {
840 r = compile_tree_n_times(qn->target, qn->lower, reg);
841 if (r) return r;
842 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
843 if (IS_MULTILINE(reg->options))
844 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
845 else
846 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
847 if (r) return r;
848 if (CKN_ON) {
849 r = add_state_check_num(reg, ckn);
850 if (r) return r;
853 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
855 else {
856 if (IS_MULTILINE(reg->options)) {
857 r = add_opcode(reg, (CKN_ON ?
858 OP_STATE_CHECK_ANYCHAR_ML_STAR
859 : OP_ANYCHAR_ML_STAR));
861 else {
862 r = add_opcode(reg, (CKN_ON ?
863 OP_STATE_CHECK_ANYCHAR_STAR
864 : OP_ANYCHAR_STAR));
866 if (r) return r;
867 if (CKN_ON)
868 r = add_state_check_num(reg, ckn);
870 return r;
874 if (empty_info != 0)
875 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
876 else
877 mod_tlen = tlen;
879 if (infinite && qn->lower <= 1) {
880 if (qn->greedy) {
881 if (qn->lower == 1) {
882 r = add_opcode_rel_addr(reg, OP_JUMP,
883 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
884 if (r) return r;
887 if (CKN_ON) {
888 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
889 if (r) return r;
890 r = add_state_check_num(reg, ckn);
891 if (r) return r;
892 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
894 else {
895 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
897 if (r) return r;
898 r = compile_tree_empty_check(qn->target, reg, empty_info);
899 if (r) return r;
900 r = add_opcode_rel_addr(reg, OP_JUMP,
901 -(mod_tlen + (int )SIZE_OP_JUMP
902 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
904 else {
905 if (qn->lower == 0) {
906 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
907 if (r) return r;
909 r = compile_tree_empty_check(qn->target, reg, empty_info);
910 if (r) return r;
911 if (CKN_ON) {
912 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
913 if (r) return r;
914 r = add_state_check_num(reg, ckn);
915 if (r) return r;
916 r = add_rel_addr(reg,
917 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
919 else
920 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
923 else if (qn->upper == 0) {
924 if (qn->is_referred != 0) { /* /(?<n>..){0}/ */
925 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
926 if (r) return r;
927 r = compile_tree(qn->target, reg);
929 else
930 r = 0;
932 else if (qn->upper == 1 && qn->greedy) {
933 if (qn->lower == 0) {
934 if (CKN_ON) {
935 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
936 if (r) return r;
937 r = add_state_check_num(reg, ckn);
938 if (r) return r;
939 r = add_rel_addr(reg, tlen);
941 else {
942 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
944 if (r) return r;
947 r = compile_tree(qn->target, reg);
949 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
950 if (CKN_ON) {
951 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
952 if (r) return r;
953 r = add_state_check_num(reg, ckn);
954 if (r) return r;
955 r = add_rel_addr(reg, SIZE_OP_JUMP);
957 else {
958 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
961 if (r) return r;
962 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
963 if (r) return r;
964 r = compile_tree(qn->target, reg);
966 else {
967 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
968 if (CKN_ON) {
969 if (r) return r;
970 r = add_opcode(reg, OP_STATE_CHECK);
971 if (r) return r;
972 r = add_state_check_num(reg, ckn);
975 return r;
978 #else /* USE_COMBINATION_EXPLOSION_CHECK */
980 static int
981 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
983 int len, mod_tlen;
984 int infinite = IS_REPEAT_INFINITE(qn->upper);
985 int empty_info = qn->target_empty_info;
986 int tlen = compile_length_tree(qn->target, reg);
988 if (tlen < 0) return tlen;
990 /* anychar repeat */
991 if (NTYPE(qn->target) == NT_CANY) {
992 if (qn->greedy && infinite) {
993 if (IS_NOT_NULL(qn->next_head_exact))
994 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
995 else
996 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
1000 if (empty_info != 0)
1001 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1002 else
1003 mod_tlen = tlen;
1005 if (infinite &&
1006 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1007 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1008 len = SIZE_OP_JUMP;
1010 else {
1011 len = tlen * qn->lower;
1014 if (qn->greedy) {
1015 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1016 if (IS_NOT_NULL(qn->head_exact))
1017 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1018 else
1019 #endif
1020 if (IS_NOT_NULL(qn->next_head_exact))
1021 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1022 else
1023 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1025 else
1026 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1028 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1029 len = SIZE_OP_JUMP + tlen;
1031 else if (!infinite && qn->greedy &&
1032 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1033 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1034 len = tlen * qn->lower;
1035 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1037 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1038 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1040 else {
1041 len = SIZE_OP_REPEAT_INC
1042 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1045 return len;
1048 static int
1049 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1051 int i, r, mod_tlen;
1052 int infinite = IS_REPEAT_INFINITE(qn->upper);
1053 int empty_info = qn->target_empty_info;
1054 int tlen = compile_length_tree(qn->target, reg);
1056 if (tlen < 0) return tlen;
1058 if (is_anychar_star_quantifier(qn)) {
1059 r = compile_tree_n_times(qn->target, qn->lower, reg);
1060 if (r) return r;
1061 if (IS_NOT_NULL(qn->next_head_exact)) {
1062 if (IS_MULTILINE(reg->options))
1063 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1064 else
1065 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1066 if (r) return r;
1067 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1069 else {
1070 if (IS_MULTILINE(reg->options))
1071 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1072 else
1073 return add_opcode(reg, OP_ANYCHAR_STAR);
1077 if (empty_info != 0)
1078 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1079 else
1080 mod_tlen = tlen;
1082 if (infinite &&
1083 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1084 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1085 if (qn->greedy) {
1086 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1087 if (IS_NOT_NULL(qn->head_exact))
1088 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1089 else
1090 #endif
1091 if (IS_NOT_NULL(qn->next_head_exact))
1092 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1093 else
1094 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1096 else {
1097 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1099 if (r) return r;
1101 else {
1102 r = compile_tree_n_times(qn->target, qn->lower, reg);
1103 if (r) return r;
1106 if (qn->greedy) {
1107 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
1108 if (IS_NOT_NULL(qn->head_exact)) {
1109 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1110 mod_tlen + SIZE_OP_JUMP);
1111 if (r) return r;
1112 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1113 r = compile_tree_empty_check(qn->target, reg, empty_info);
1114 if (r) return r;
1115 r = add_opcode_rel_addr(reg, OP_JUMP,
1116 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1118 else
1119 #endif
1120 if (IS_NOT_NULL(qn->next_head_exact)) {
1121 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1122 mod_tlen + SIZE_OP_JUMP);
1123 if (r) return r;
1124 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1125 r = compile_tree_empty_check(qn->target, reg, empty_info);
1126 if (r) return r;
1127 r = add_opcode_rel_addr(reg, OP_JUMP,
1128 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1130 else {
1131 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1132 if (r) return r;
1133 r = compile_tree_empty_check(qn->target, reg, empty_info);
1134 if (r) return r;
1135 r = add_opcode_rel_addr(reg, OP_JUMP,
1136 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1139 else {
1140 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1141 if (r) return r;
1142 r = compile_tree_empty_check(qn->target, reg, empty_info);
1143 if (r) return r;
1144 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1147 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1148 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1149 if (r) return r;
1150 r = compile_tree(qn->target, reg);
1152 else if (!infinite && qn->greedy &&
1153 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1154 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1155 int n = qn->upper - qn->lower;
1157 r = compile_tree_n_times(qn->target, qn->lower, reg);
1158 if (r) return r;
1160 for (i = 0; i < n; i++) {
1161 r = add_opcode_rel_addr(reg, OP_PUSH,
1162 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1163 if (r) return r;
1164 r = compile_tree(qn->target, reg);
1165 if (r) return r;
1168 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1169 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1170 if (r) return r;
1171 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1172 if (r) return r;
1173 r = compile_tree(qn->target, reg);
1175 else {
1176 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1178 return r;
1180 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1182 static int
1183 compile_length_option_node(EncloseNode* node, regex_t* reg)
1185 int tlen;
1186 OnigOptionType prev = reg->options;
1188 reg->options = node->option;
1189 tlen = compile_length_tree(node->target, reg);
1190 reg->options = prev;
1192 if (tlen < 0) return tlen;
1194 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1195 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1196 + tlen + SIZE_OP_SET_OPTION;
1198 else
1199 return tlen;
1202 static int
1203 compile_option_node(EncloseNode* node, regex_t* reg)
1205 int r;
1206 OnigOptionType prev = reg->options;
1208 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1209 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1210 if (r) return r;
1211 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1212 if (r) return r;
1213 r = add_opcode(reg, OP_FAIL);
1214 if (r) return r;
1217 reg->options = node->option;
1218 r = compile_tree(node->target, reg);
1219 reg->options = prev;
1221 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1222 if (r) return r;
1223 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1225 return r;
1228 static int
1229 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1231 int len;
1232 int tlen;
1234 if (node->type == ENCLOSE_OPTION)
1235 return compile_length_option_node(node, reg);
1237 if (node->target) {
1238 tlen = compile_length_tree(node->target, reg);
1239 if (tlen < 0) return tlen;
1241 else
1242 tlen = 0;
1244 switch (node->type) {
1245 case ENCLOSE_MEMORY:
1246 #ifdef USE_SUBEXP_CALL
1247 if (IS_ENCLOSE_CALLED(node)) {
1248 len = SIZE_OP_MEMORY_START_PUSH + tlen
1249 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1250 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1251 len += (IS_ENCLOSE_RECURSION(node)
1252 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1253 else
1254 len += (IS_ENCLOSE_RECURSION(node)
1255 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1257 else if (IS_ENCLOSE_RECURSION(node)) {
1258 len = SIZE_OP_MEMORY_START_PUSH;
1259 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1260 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1262 else
1263 #endif
1265 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1266 len = SIZE_OP_MEMORY_START_PUSH;
1267 else
1268 len = SIZE_OP_MEMORY_START;
1270 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1271 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1273 break;
1275 case ENCLOSE_STOP_BACKTRACK:
1276 /* Disable POP_STOP_BT optimization for simple repeat under the match cache */
1277 /* optimization because the match cache optimization pushes an extra item to */
1278 /* the stack and it breaks the assumption for this optimization. */
1279 #ifndef USE_MATCH_CACHE
1280 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1281 QtfrNode* qn = NQTFR(node->target);
1282 tlen = compile_length_tree(qn->target, reg);
1283 if (tlen < 0) return tlen;
1285 len = tlen * qn->lower
1286 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1288 else {
1289 #endif
1290 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1291 #ifndef USE_MATCH_CACHE
1293 #endif
1294 break;
1296 case ENCLOSE_CONDITION:
1297 len = SIZE_OP_CONDITION;
1298 if (NTYPE(node->target) == NT_ALT) {
1299 Node* x = node->target;
1301 tlen = compile_length_tree(NCAR(x), reg); /* yes-node */
1302 if (tlen < 0) return tlen;
1303 len += tlen + SIZE_OP_JUMP;
1304 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1305 x = NCDR(x);
1306 tlen = compile_length_tree(NCAR(x), reg); /* no-node */
1307 if (tlen < 0) return tlen;
1308 len += tlen;
1309 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1311 else {
1312 return ONIGERR_PARSER_BUG;
1314 break;
1316 case ENCLOSE_ABSENT:
1317 len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END;
1318 break;
1320 default:
1321 return ONIGERR_TYPE_BUG;
1322 break;
1325 return len;
1328 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1330 static int
1331 compile_enclose_node(EncloseNode* node, regex_t* reg)
1333 int r, len;
1335 if (node->type == ENCLOSE_OPTION)
1336 return compile_option_node(node, reg);
1338 switch (node->type) {
1339 case ENCLOSE_MEMORY:
1340 #ifdef USE_SUBEXP_CALL
1341 if (IS_ENCLOSE_CALLED(node)) {
1342 r = add_opcode(reg, OP_CALL);
1343 if (r) return r;
1344 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1345 node->state |= NST_ADDR_FIXED;
1346 r = add_abs_addr(reg, (int )node->call_addr);
1347 if (r) return r;
1348 len = compile_length_tree(node->target, reg);
1349 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1350 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1351 len += (IS_ENCLOSE_RECURSION(node)
1352 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1353 else
1354 len += (IS_ENCLOSE_RECURSION(node)
1355 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1357 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1358 if (r) return r;
1360 #endif
1361 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1362 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1363 else
1364 r = add_opcode(reg, OP_MEMORY_START);
1365 if (r) return r;
1366 r = add_mem_num(reg, node->regnum);
1367 if (r) return r;
1368 r = compile_tree(node->target, reg);
1369 if (r) return r;
1370 #ifdef USE_SUBEXP_CALL
1371 if (IS_ENCLOSE_CALLED(node)) {
1372 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1373 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1374 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1375 else
1376 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1377 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1379 if (r) return r;
1380 r = add_mem_num(reg, node->regnum);
1381 if (r) return r;
1382 r = add_opcode(reg, OP_RETURN);
1384 else if (IS_ENCLOSE_RECURSION(node)) {
1385 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1386 r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
1387 else
1388 r = add_opcode(reg, OP_MEMORY_END_REC);
1389 if (r) return r;
1390 r = add_mem_num(reg, node->regnum);
1392 else
1393 #endif
1395 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1396 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1397 else
1398 r = add_opcode(reg, OP_MEMORY_END);
1399 if (r) return r;
1400 r = add_mem_num(reg, node->regnum);
1402 break;
1404 case ENCLOSE_STOP_BACKTRACK:
1405 /* Disable POP_STOP_BT optimization for simple repeat under the match cache */
1406 /* optimization because the match cache optimization pushes an extra item to */
1407 /* the stack and it breaks the assumption for this optimization. */
1408 #ifndef USE_MATCH_CACHE
1409 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1410 QtfrNode* qn = NQTFR(node->target);
1411 r = compile_tree_n_times(qn->target, qn->lower, reg);
1412 if (r) return r;
1414 len = compile_length_tree(qn->target, reg);
1415 if (len < 0) return len;
1417 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1418 if (r) return r;
1419 r = compile_tree(qn->target, reg);
1420 if (r) return r;
1421 r = add_opcode(reg, OP_POP);
1422 if (r) return r;
1423 r = add_opcode_rel_addr(reg, OP_JUMP,
1424 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1426 else {
1427 #endif
1428 r = add_opcode(reg, OP_PUSH_STOP_BT);
1429 if (r) return r;
1430 r = compile_tree(node->target, reg);
1431 if (r) return r;
1432 r = add_opcode(reg, OP_POP_STOP_BT);
1433 #ifndef USE_MATCH_CACHE
1435 #endif
1436 break;
1438 case ENCLOSE_CONDITION:
1439 r = add_opcode(reg, OP_CONDITION);
1440 if (r) return r;
1441 r = add_mem_num(reg, node->regnum);
1442 if (r) return r;
1444 if (NTYPE(node->target) == NT_ALT) {
1445 Node* x = node->target;
1446 int len2;
1448 len = compile_length_tree(NCAR(x), reg); /* yes-node */
1449 if (len < 0) return len;
1450 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1451 x = NCDR(x);
1452 len2 = compile_length_tree(NCAR(x), reg); /* no-node */
1453 if (len2 < 0) return len2;
1454 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1456 x = node->target;
1457 r = add_rel_addr(reg, len + SIZE_OP_JUMP);
1458 if (r) return r;
1459 r = compile_tree(NCAR(x), reg); /* yes-node */
1460 if (r) return r;
1461 r = add_opcode_rel_addr(reg, OP_JUMP, len2);
1462 if (r) return r;
1463 x = NCDR(x);
1464 r = compile_tree(NCAR(x), reg); /* no-node */
1466 else {
1467 return ONIGERR_PARSER_BUG;
1469 break;
1471 case ENCLOSE_ABSENT:
1472 len = compile_length_tree(node->target, reg);
1473 if (len < 0) return len;
1475 r = add_opcode(reg, OP_PUSH_ABSENT_POS);
1476 if (r) return r;
1477 r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END);
1478 if (r) return r;
1479 r = compile_tree(node->target, reg);
1480 if (r) return r;
1481 r = add_opcode(reg, OP_ABSENT_END);
1482 break;
1484 default:
1485 return ONIGERR_TYPE_BUG;
1486 break;
1489 return r;
1492 static int
1493 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1495 int len;
1496 int tlen = 0;
1498 if (node->target) {
1499 tlen = compile_length_tree(node->target, reg);
1500 if (tlen < 0) return tlen;
1503 switch (node->type) {
1504 case ANCHOR_PREC_READ:
1505 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1506 break;
1507 case ANCHOR_PREC_READ_NOT:
1508 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1509 break;
1510 case ANCHOR_LOOK_BEHIND:
1511 len = SIZE_OP_LOOK_BEHIND + tlen;
1512 break;
1513 case ANCHOR_LOOK_BEHIND_NOT:
1514 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1515 break;
1517 default:
1518 len = SIZE_OPCODE;
1519 break;
1522 return len;
1525 static int
1526 compile_anchor_node(AnchorNode* node, regex_t* reg)
1528 int r, len;
1530 switch (node->type) {
1531 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1532 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1533 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1534 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1535 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1536 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1538 case ANCHOR_WORD_BOUND:
1539 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND);
1540 else r = add_opcode(reg, OP_WORD_BOUND);
1541 break;
1542 case ANCHOR_NOT_WORD_BOUND:
1543 if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND);
1544 else r = add_opcode(reg, OP_NOT_WORD_BOUND);
1545 break;
1546 #ifdef USE_WORD_BEGIN_END
1547 case ANCHOR_WORD_BEGIN:
1548 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN);
1549 else r = add_opcode(reg, OP_WORD_BEGIN);
1550 break;
1551 case ANCHOR_WORD_END:
1552 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END);
1553 else r = add_opcode(reg, OP_WORD_END);
1554 break;
1555 #endif
1556 case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break;
1558 case ANCHOR_PREC_READ:
1559 r = add_opcode(reg, OP_PUSH_POS);
1560 if (r) return r;
1561 r = compile_tree(node->target, reg);
1562 if (r) return r;
1563 r = add_opcode(reg, OP_POP_POS);
1564 break;
1566 case ANCHOR_PREC_READ_NOT:
1567 len = compile_length_tree(node->target, reg);
1568 if (len < 0) return len;
1569 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1570 if (r) return r;
1571 r = compile_tree(node->target, reg);
1572 if (r) return r;
1573 r = add_opcode(reg, OP_FAIL_POS);
1574 break;
1576 case ANCHOR_LOOK_BEHIND:
1578 int n;
1579 r = add_opcode(reg, OP_LOOK_BEHIND);
1580 if (r) return r;
1581 if (node->char_len < 0) {
1582 r = get_char_length_tree(node->target, reg, &n);
1583 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1585 else
1586 n = node->char_len;
1587 r = add_length(reg, n);
1588 if (r) return r;
1589 r = compile_tree(node->target, reg);
1591 break;
1593 case ANCHOR_LOOK_BEHIND_NOT:
1595 int n;
1596 len = compile_length_tree(node->target, reg);
1597 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1598 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1599 if (r) return r;
1600 if (node->char_len < 0) {
1601 r = get_char_length_tree(node->target, reg, &n);
1602 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1604 else
1605 n = node->char_len;
1606 r = add_length(reg, n);
1607 if (r) return r;
1608 r = compile_tree(node->target, reg);
1609 if (r) return r;
1610 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1612 break;
1614 default:
1615 return ONIGERR_TYPE_BUG;
1616 break;
1619 return r;
1622 static int
1623 compile_length_tree(Node* node, regex_t* reg)
1625 int len, type, r;
1627 type = NTYPE(node);
1628 switch (type) {
1629 case NT_LIST:
1630 len = 0;
1631 do {
1632 r = compile_length_tree(NCAR(node), reg);
1633 if (r < 0) return r;
1634 len += r;
1635 } while (IS_NOT_NULL(node = NCDR(node)));
1636 r = len;
1637 break;
1639 case NT_ALT:
1641 int n = 0;
1642 len = 0;
1643 do {
1644 r = compile_length_tree(NCAR(node), reg);
1645 if (r < 0) return r;
1646 len += r;
1647 n++;
1648 } while (IS_NOT_NULL(node = NCDR(node)));
1649 r = len;
1650 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1652 break;
1654 case NT_STR:
1655 if (NSTRING_IS_RAW(node))
1656 r = compile_length_string_raw_node(NSTR(node), reg);
1657 else
1658 r = compile_length_string_node(node, reg);
1659 break;
1661 case NT_CCLASS:
1662 r = compile_length_cclass_node(NCCLASS(node), reg);
1663 break;
1665 case NT_CTYPE:
1666 case NT_CANY:
1667 r = SIZE_OPCODE;
1668 break;
1670 case NT_BREF:
1672 BRefNode* br = NBREF(node);
1674 #ifdef USE_BACKREF_WITH_LEVEL
1675 if (IS_BACKREF_NEST_LEVEL(br)) {
1676 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1677 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1679 else
1680 #endif
1681 if (br->back_num == 1) {
1682 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1683 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1685 else {
1686 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1689 break;
1691 #ifdef USE_SUBEXP_CALL
1692 case NT_CALL:
1693 r = SIZE_OP_CALL;
1694 break;
1695 #endif
1697 case NT_QTFR:
1698 r = compile_length_quantifier_node(NQTFR(node), reg);
1699 break;
1701 case NT_ENCLOSE:
1702 r = compile_length_enclose_node(NENCLOSE(node), reg);
1703 break;
1705 case NT_ANCHOR:
1706 r = compile_length_anchor_node(NANCHOR(node), reg);
1707 break;
1709 default:
1710 return ONIGERR_TYPE_BUG;
1711 break;
1714 return r;
1717 static int
1718 compile_tree(Node* node, regex_t* reg)
1720 int n, type, len, pos, r = 0;
1722 type = NTYPE(node);
1723 switch (type) {
1724 case NT_LIST:
1725 do {
1726 r = compile_tree(NCAR(node), reg);
1727 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1728 break;
1730 case NT_ALT:
1732 Node* x = node;
1733 len = 0;
1734 do {
1735 len += compile_length_tree(NCAR(x), reg);
1736 if (NCDR(x) != NULL) {
1737 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1739 } while (IS_NOT_NULL(x = NCDR(x)));
1740 pos = reg->used + len; /* goal position */
1742 do {
1743 len = compile_length_tree(NCAR(node), reg);
1744 if (IS_NOT_NULL(NCDR(node))) {
1745 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1746 if (r) break;
1748 r = compile_tree(NCAR(node), reg);
1749 if (r) break;
1750 if (IS_NOT_NULL(NCDR(node))) {
1751 len = pos - (reg->used + SIZE_OP_JUMP);
1752 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1753 if (r) break;
1755 } while (IS_NOT_NULL(node = NCDR(node)));
1757 break;
1759 case NT_STR:
1760 if (NSTRING_IS_RAW(node))
1761 r = compile_string_raw_node(NSTR(node), reg);
1762 else
1763 r = compile_string_node(node, reg);
1764 break;
1766 case NT_CCLASS:
1767 r = compile_cclass_node(NCCLASS(node), reg);
1768 break;
1770 case NT_CTYPE:
1772 int op;
1774 switch (NCTYPE(node)->ctype) {
1775 case ONIGENC_CTYPE_WORD:
1776 if (NCTYPE(node)->ascii_range != 0) {
1777 if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD;
1778 else op = OP_ASCII_WORD;
1780 else {
1781 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1782 else op = OP_WORD;
1784 break;
1785 default:
1786 return ONIGERR_TYPE_BUG;
1787 break;
1789 r = add_opcode(reg, op);
1791 break;
1793 case NT_CANY:
1794 if (IS_MULTILINE(reg->options))
1795 r = add_opcode(reg, OP_ANYCHAR_ML);
1796 else
1797 r = add_opcode(reg, OP_ANYCHAR);
1798 break;
1800 case NT_BREF:
1802 BRefNode* br = NBREF(node);
1804 #ifdef USE_BACKREF_WITH_LEVEL
1805 if (IS_BACKREF_NEST_LEVEL(br)) {
1806 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1807 if (r) return r;
1808 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1809 if (r) return r;
1810 r = add_length(reg, br->nest_level);
1811 if (r) return r;
1813 goto add_bacref_mems;
1815 else
1816 #endif
1817 if (br->back_num == 1) {
1818 n = br->back_static[0];
1819 if (IS_IGNORECASE(reg->options)) {
1820 r = add_opcode(reg, OP_BACKREFN_IC);
1821 if (r) return r;
1822 r = add_mem_num(reg, n);
1824 else {
1825 switch (n) {
1826 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1827 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1828 default:
1829 r = add_opcode(reg, OP_BACKREFN);
1830 if (r) return r;
1831 r = add_mem_num(reg, n);
1832 break;
1836 else {
1837 int i;
1838 int* p;
1840 if (IS_IGNORECASE(reg->options)) {
1841 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1843 else {
1844 r = add_opcode(reg, OP_BACKREF_MULTI);
1846 if (r) return r;
1848 #ifdef USE_BACKREF_WITH_LEVEL
1849 add_bacref_mems:
1850 #endif
1851 r = add_length(reg, br->back_num);
1852 if (r) return r;
1853 p = BACKREFS_P(br);
1854 for (i = br->back_num - 1; i >= 0; i--) {
1855 r = add_mem_num(reg, p[i]);
1856 if (r) return r;
1860 break;
1862 #ifdef USE_SUBEXP_CALL
1863 case NT_CALL:
1864 r = compile_call(NCALL(node), reg);
1865 break;
1866 #endif
1868 case NT_QTFR:
1869 r = compile_quantifier_node(NQTFR(node), reg);
1870 break;
1872 case NT_ENCLOSE:
1873 r = compile_enclose_node(NENCLOSE(node), reg);
1874 break;
1876 case NT_ANCHOR:
1877 r = compile_anchor_node(NANCHOR(node), reg);
1878 break;
1880 default:
1881 #ifdef ONIG_DEBUG
1882 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1883 #endif
1884 break;
1887 return r;
1890 #ifdef USE_NAMED_GROUP
1892 static int
1893 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1895 int r = 0;
1896 Node* node = *plink;
1898 switch (NTYPE(node)) {
1899 case NT_LIST:
1900 case NT_ALT:
1901 do {
1902 r = noname_disable_map(&(NCAR(node)), map, counter);
1903 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1904 break;
1906 case NT_QTFR:
1908 Node** ptarget = &(NQTFR(node)->target);
1909 Node* old = *ptarget;
1910 r = noname_disable_map(ptarget, map, counter);
1911 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1912 onig_reduce_nested_quantifier(node, *ptarget);
1915 break;
1917 case NT_ENCLOSE:
1919 EncloseNode* en = NENCLOSE(node);
1920 if (en->type == ENCLOSE_MEMORY) {
1921 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1922 (*counter)++;
1923 map[en->regnum].new_val = *counter;
1924 en->regnum = *counter;
1926 else if (en->regnum != 0) {
1927 *plink = en->target;
1928 en->target = NULL_NODE;
1929 onig_node_free(node);
1930 r = noname_disable_map(plink, map, counter);
1931 break;
1934 r = noname_disable_map(&(en->target), map, counter);
1936 break;
1938 case NT_ANCHOR:
1939 if (NANCHOR(node)->target)
1940 r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
1941 break;
1943 default:
1944 break;
1947 return r;
1950 static int
1951 renumber_node_backref(Node* node, GroupNumRemap* map, const int num_mem)
1953 int i, pos, n, old_num;
1954 int *backs;
1955 BRefNode* bn = NBREF(node);
1957 if (! IS_BACKREF_NAME_REF(bn))
1958 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1960 old_num = bn->back_num;
1961 if (IS_NULL(bn->back_dynamic))
1962 backs = bn->back_static;
1963 else
1964 backs = bn->back_dynamic;
1966 for (i = 0, pos = 0; i < old_num; i++) {
1967 if (backs[i] > num_mem) return ONIGERR_INVALID_BACKREF;
1968 n = map[backs[i]].new_val;
1969 if (n > 0) {
1970 backs[pos] = n;
1971 pos++;
1975 bn->back_num = pos;
1976 return 0;
1979 static int
1980 renumber_by_map(Node* node, GroupNumRemap* map, const int num_mem)
1982 int r = 0;
1984 switch (NTYPE(node)) {
1985 case NT_LIST:
1986 case NT_ALT:
1987 do {
1988 r = renumber_by_map(NCAR(node), map, num_mem);
1989 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1990 break;
1991 case NT_QTFR:
1992 r = renumber_by_map(NQTFR(node)->target, map, num_mem);
1993 break;
1994 case NT_ENCLOSE:
1996 EncloseNode* en = NENCLOSE(node);
1997 if (en->type == ENCLOSE_CONDITION) {
1998 if (en->regnum > num_mem) return ONIGERR_INVALID_BACKREF;
1999 en->regnum = map[en->regnum].new_val;
2001 r = renumber_by_map(en->target, map, num_mem);
2003 break;
2005 case NT_BREF:
2006 r = renumber_node_backref(node, map, num_mem);
2007 break;
2009 case NT_ANCHOR:
2010 if (NANCHOR(node)->target)
2011 r = renumber_by_map(NANCHOR(node)->target, map, num_mem);
2012 break;
2014 default:
2015 break;
2018 return r;
2021 static int
2022 numbered_ref_check(Node* node)
2024 int r = 0;
2026 switch (NTYPE(node)) {
2027 case NT_LIST:
2028 case NT_ALT:
2029 do {
2030 r = numbered_ref_check(NCAR(node));
2031 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2032 break;
2033 case NT_QTFR:
2034 r = numbered_ref_check(NQTFR(node)->target);
2035 break;
2036 case NT_ENCLOSE:
2037 r = numbered_ref_check(NENCLOSE(node)->target);
2038 break;
2040 case NT_BREF:
2041 if (! IS_BACKREF_NAME_REF(NBREF(node)))
2042 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2043 break;
2045 case NT_ANCHOR:
2046 if (NANCHOR(node)->target)
2047 r = numbered_ref_check(NANCHOR(node)->target);
2048 break;
2050 default:
2051 break;
2054 return r;
2057 static int
2058 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2060 int r, i, pos, counter;
2061 BitStatusType loc;
2062 GroupNumRemap* map;
2064 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
2065 CHECK_NULL_RETURN_MEMERR(map);
2066 for (i = 1; i <= env->num_mem; i++) {
2067 map[i].new_val = 0;
2069 counter = 0;
2070 r = noname_disable_map(root, map, &counter);
2071 if (r != 0) return r;
2073 r = renumber_by_map(*root, map, env->num_mem);
2074 if (r != 0) return r;
2076 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2077 if (map[i].new_val > 0) {
2078 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
2079 pos++;
2083 loc = env->capture_history;
2084 BIT_STATUS_CLEAR(env->capture_history);
2085 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2086 if (BIT_STATUS_AT(loc, i)) {
2087 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
2091 env->num_mem = env->num_named;
2092 reg->num_mem = env->num_named;
2094 return onig_renumber_name_table(reg, map);
2096 #endif /* USE_NAMED_GROUP */
2098 #ifdef USE_SUBEXP_CALL
2099 static int
2100 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
2102 int i, offset;
2103 EncloseNode* en;
2104 AbsAddrType addr;
2106 for (i = 0; i < uslist->num; i++) {
2107 en = NENCLOSE(uslist->us[i].target);
2108 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
2109 addr = en->call_addr;
2110 offset = uslist->us[i].offset;
2112 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
2114 return 0;
2116 #endif
2118 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
2119 static int
2120 quantifiers_memory_node_info(Node* node)
2122 int r = 0;
2124 switch (NTYPE(node)) {
2125 case NT_LIST:
2126 case NT_ALT:
2128 int v;
2129 do {
2130 v = quantifiers_memory_node_info(NCAR(node));
2131 if (v > r) r = v;
2132 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
2134 break;
2136 # ifdef USE_SUBEXP_CALL
2137 case NT_CALL:
2138 if (IS_CALL_RECURSION(NCALL(node))) {
2139 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2141 else
2142 r = quantifiers_memory_node_info(NCALL(node)->target);
2143 break;
2144 # endif
2146 case NT_QTFR:
2148 QtfrNode* qn = NQTFR(node);
2149 if (qn->upper != 0) {
2150 r = quantifiers_memory_node_info(qn->target);
2153 break;
2155 case NT_ENCLOSE:
2157 EncloseNode* en = NENCLOSE(node);
2158 switch (en->type) {
2159 case ENCLOSE_MEMORY:
2160 return NQ_TARGET_IS_EMPTY_MEM;
2161 break;
2163 case ENCLOSE_OPTION:
2164 case ENCLOSE_STOP_BACKTRACK:
2165 case ENCLOSE_CONDITION:
2166 case ENCLOSE_ABSENT:
2167 r = quantifiers_memory_node_info(en->target);
2168 break;
2169 default:
2170 break;
2173 break;
2175 case NT_BREF:
2176 case NT_STR:
2177 case NT_CTYPE:
2178 case NT_CCLASS:
2179 case NT_CANY:
2180 case NT_ANCHOR:
2181 default:
2182 break;
2185 return r;
2187 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2189 static int
2190 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2192 OnigDistance tmin;
2193 int r = 0;
2195 *min = 0;
2196 switch (NTYPE(node)) {
2197 case NT_BREF:
2199 int i;
2200 int* backs;
2201 Node** nodes = SCANENV_MEM_NODES(env);
2202 BRefNode* br = NBREF(node);
2203 if (br->state & NST_RECURSION) break;
2205 backs = BACKREFS_P(br);
2206 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2207 r = get_min_match_length(nodes[backs[0]], min, env);
2208 if (r != 0) break;
2209 for (i = 1; i < br->back_num; i++) {
2210 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2211 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2212 if (r != 0) break;
2213 if (*min > tmin) *min = tmin;
2216 break;
2218 #ifdef USE_SUBEXP_CALL
2219 case NT_CALL:
2220 if (IS_CALL_RECURSION(NCALL(node))) {
2221 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2222 if (IS_ENCLOSE_MIN_FIXED(en))
2223 *min = en->min_len;
2225 else
2226 r = get_min_match_length(NCALL(node)->target, min, env);
2227 break;
2228 #endif
2230 case NT_LIST:
2231 do {
2232 r = get_min_match_length(NCAR(node), &tmin, env);
2233 if (r == 0) *min += tmin;
2234 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2235 break;
2237 case NT_ALT:
2239 Node *x, *y;
2240 y = node;
2241 do {
2242 x = NCAR(y);
2243 r = get_min_match_length(x, &tmin, env);
2244 if (r != 0) break;
2245 if (y == node) *min = tmin;
2246 else if (*min > tmin) *min = tmin;
2247 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2249 break;
2251 case NT_STR:
2253 StrNode* sn = NSTR(node);
2254 *min = sn->end - sn->s;
2256 break;
2258 case NT_CTYPE:
2259 *min = 1;
2260 break;
2262 case NT_CCLASS:
2263 case NT_CANY:
2264 *min = 1;
2265 break;
2267 case NT_QTFR:
2269 QtfrNode* qn = NQTFR(node);
2271 if (qn->lower > 0) {
2272 r = get_min_match_length(qn->target, min, env);
2273 if (r == 0)
2274 *min = distance_multiply(*min, qn->lower);
2277 break;
2279 case NT_ENCLOSE:
2281 EncloseNode* en = NENCLOSE(node);
2282 switch (en->type) {
2283 case ENCLOSE_MEMORY:
2284 if (IS_ENCLOSE_MIN_FIXED(en))
2285 *min = en->min_len;
2286 else {
2287 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2288 *min = 0; /* recursive */
2289 else {
2290 SET_ENCLOSE_STATUS(node, NST_MARK1);
2291 r = get_min_match_length(en->target, min, env);
2292 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2293 if (r == 0) {
2294 en->min_len = *min;
2295 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2299 break;
2301 case ENCLOSE_OPTION:
2302 case ENCLOSE_STOP_BACKTRACK:
2303 case ENCLOSE_CONDITION:
2304 r = get_min_match_length(en->target, min, env);
2305 break;
2307 case ENCLOSE_ABSENT:
2308 break;
2311 break;
2313 case NT_ANCHOR:
2314 default:
2315 break;
2318 return r;
2321 static int
2322 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2324 OnigDistance tmax;
2325 int r = 0;
2327 *max = 0;
2328 switch (NTYPE(node)) {
2329 case NT_LIST:
2330 do {
2331 r = get_max_match_length(NCAR(node), &tmax, env);
2332 if (r == 0)
2333 *max = distance_add(*max, tmax);
2334 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2335 break;
2337 case NT_ALT:
2338 do {
2339 r = get_max_match_length(NCAR(node), &tmax, env);
2340 if (r == 0 && *max < tmax) *max = tmax;
2341 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2342 break;
2344 case NT_STR:
2346 StrNode* sn = NSTR(node);
2347 *max = sn->end - sn->s;
2349 break;
2351 case NT_CTYPE:
2352 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2353 break;
2355 case NT_CCLASS:
2356 case NT_CANY:
2357 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2358 break;
2360 case NT_BREF:
2362 int i;
2363 int* backs;
2364 Node** nodes = SCANENV_MEM_NODES(env);
2365 BRefNode* br = NBREF(node);
2366 if (br->state & NST_RECURSION) {
2367 *max = ONIG_INFINITE_DISTANCE;
2368 break;
2370 backs = BACKREFS_P(br);
2371 for (i = 0; i < br->back_num; i++) {
2372 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2373 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2374 if (r != 0) break;
2375 if (*max < tmax) *max = tmax;
2378 break;
2380 #ifdef USE_SUBEXP_CALL
2381 case NT_CALL:
2382 if (! IS_CALL_RECURSION(NCALL(node)))
2383 r = get_max_match_length(NCALL(node)->target, max, env);
2384 else
2385 *max = ONIG_INFINITE_DISTANCE;
2386 break;
2387 #endif
2389 case NT_QTFR:
2391 QtfrNode* qn = NQTFR(node);
2393 if (qn->upper != 0) {
2394 r = get_max_match_length(qn->target, max, env);
2395 if (r == 0 && *max != 0) {
2396 if (! IS_REPEAT_INFINITE(qn->upper))
2397 *max = distance_multiply(*max, qn->upper);
2398 else
2399 *max = ONIG_INFINITE_DISTANCE;
2403 break;
2405 case NT_ENCLOSE:
2407 EncloseNode* en = NENCLOSE(node);
2408 switch (en->type) {
2409 case ENCLOSE_MEMORY:
2410 if (IS_ENCLOSE_MAX_FIXED(en))
2411 *max = en->max_len;
2412 else {
2413 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2414 *max = ONIG_INFINITE_DISTANCE;
2415 else {
2416 SET_ENCLOSE_STATUS(node, NST_MARK1);
2417 r = get_max_match_length(en->target, max, env);
2418 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2419 if (r == 0) {
2420 en->max_len = *max;
2421 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2425 break;
2427 case ENCLOSE_OPTION:
2428 case ENCLOSE_STOP_BACKTRACK:
2429 case ENCLOSE_CONDITION:
2430 r = get_max_match_length(en->target, max, env);
2431 break;
2433 case ENCLOSE_ABSENT:
2434 break;
2437 break;
2439 case NT_ANCHOR:
2440 default:
2441 break;
2444 return r;
2447 #define GET_CHAR_LEN_VARLEN -1
2448 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2450 /* fixed size pattern node only */
2451 static int
2452 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2454 int tlen;
2455 int r = 0;
2457 level++;
2458 *len = 0;
2459 switch (NTYPE(node)) {
2460 case NT_LIST:
2461 do {
2462 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2463 if (r == 0)
2464 *len = (int )distance_add(*len, tlen);
2465 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2466 break;
2468 case NT_ALT:
2470 int tlen2;
2471 int varlen = 0;
2473 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2474 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2475 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2476 if (r == 0) {
2477 if (tlen != tlen2)
2478 varlen = 1;
2481 if (r == 0) {
2482 if (varlen != 0) {
2483 if (level == 1)
2484 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2485 else
2486 r = GET_CHAR_LEN_VARLEN;
2488 else
2489 *len = tlen;
2492 break;
2494 case NT_STR:
2496 StrNode* sn = NSTR(node);
2497 UChar *s = sn->s;
2498 while (s < sn->end) {
2499 s += enclen(reg->enc, s, sn->end);
2500 (*len)++;
2503 break;
2505 case NT_QTFR:
2507 QtfrNode* qn = NQTFR(node);
2508 if (qn->lower == qn->upper) {
2509 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2510 if (r == 0)
2511 *len = (int )distance_multiply(tlen, qn->lower);
2513 else
2514 r = GET_CHAR_LEN_VARLEN;
2516 break;
2518 #ifdef USE_SUBEXP_CALL
2519 case NT_CALL:
2520 if (! IS_CALL_RECURSION(NCALL(node)))
2521 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2522 else
2523 r = GET_CHAR_LEN_VARLEN;
2524 break;
2525 #endif
2527 case NT_CTYPE:
2528 *len = 1;
2529 break;
2531 case NT_CCLASS:
2532 case NT_CANY:
2533 *len = 1;
2534 break;
2536 case NT_ENCLOSE:
2538 EncloseNode* en = NENCLOSE(node);
2539 switch (en->type) {
2540 case ENCLOSE_MEMORY:
2541 #ifdef USE_SUBEXP_CALL
2542 if (IS_ENCLOSE_CLEN_FIXED(en))
2543 *len = en->char_len;
2544 else {
2545 r = get_char_length_tree1(en->target, reg, len, level);
2546 if (r == 0) {
2547 en->char_len = *len;
2548 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2551 break;
2552 #endif
2553 case ENCLOSE_OPTION:
2554 case ENCLOSE_STOP_BACKTRACK:
2555 case ENCLOSE_CONDITION:
2556 r = get_char_length_tree1(en->target, reg, len, level);
2557 break;
2558 case ENCLOSE_ABSENT:
2559 default:
2560 break;
2563 break;
2565 case NT_ANCHOR:
2566 break;
2568 default:
2569 r = GET_CHAR_LEN_VARLEN;
2570 break;
2573 return r;
2576 static int
2577 get_char_length_tree(Node* node, regex_t* reg, int* len)
2579 return get_char_length_tree1(node, reg, len, 0);
2582 /* x is not included y ==> 1 : 0 */
2583 static int
2584 is_not_included(Node* x, Node* y, regex_t* reg)
2586 int i;
2587 OnigDistance len;
2588 OnigCodePoint code;
2589 UChar *p;
2590 int ytype;
2592 retry:
2593 ytype = NTYPE(y);
2594 switch (NTYPE(x)) {
2595 case NT_CTYPE:
2597 switch (ytype) {
2598 case NT_CTYPE:
2599 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2600 NCTYPE(y)->not != NCTYPE(x)->not &&
2601 NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range)
2602 return 1;
2603 else
2604 return 0;
2605 break;
2607 case NT_CCLASS:
2608 swap:
2610 Node* tmp;
2611 tmp = x; x = y; y = tmp;
2612 goto retry;
2614 break;
2616 case NT_STR:
2617 goto swap;
2618 break;
2620 default:
2621 break;
2624 break;
2626 case NT_CCLASS:
2628 CClassNode* xc = NCCLASS(x);
2629 switch (ytype) {
2630 case NT_CTYPE:
2631 switch (NCTYPE(y)->ctype) {
2632 case ONIGENC_CTYPE_WORD:
2633 if (NCTYPE(y)->not == 0) {
2634 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2635 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2636 if (BITSET_AT(xc->bs, i)) {
2637 if (NCTYPE(y)->ascii_range) {
2638 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2640 else {
2641 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2645 return 1;
2647 return 0;
2649 else {
2650 if (IS_NOT_NULL(xc->mbuf)) return 0;
2651 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2652 int is_word;
2653 if (NCTYPE(y)->ascii_range)
2654 is_word = IS_CODE_SB_WORD(reg->enc, i);
2655 else
2656 is_word = ONIGENC_IS_CODE_WORD(reg->enc, i);
2657 if (! is_word) {
2658 if (!IS_NCCLASS_NOT(xc)) {
2659 if (BITSET_AT(xc->bs, i))
2660 return 0;
2662 else {
2663 if (! BITSET_AT(xc->bs, i))
2664 return 0;
2668 return 1;
2670 break;
2672 default:
2673 break;
2675 break;
2677 case NT_CCLASS:
2679 int v;
2680 CClassNode* yc = NCCLASS(y);
2682 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2683 v = BITSET_AT(xc->bs, i);
2684 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2685 (v == 0 && IS_NCCLASS_NOT(xc))) {
2686 v = BITSET_AT(yc->bs, i);
2687 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2688 (v == 0 && IS_NCCLASS_NOT(yc)))
2689 return 0;
2692 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2693 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2694 return 1;
2695 return 0;
2697 break;
2699 case NT_STR:
2700 goto swap;
2701 break;
2703 default:
2704 break;
2707 break;
2709 case NT_STR:
2711 StrNode* xs = NSTR(x);
2712 if (NSTRING_LEN(x) == 0)
2713 break;
2715 switch (ytype) {
2716 case NT_CTYPE:
2717 switch (NCTYPE(y)->ctype) {
2718 case ONIGENC_CTYPE_WORD:
2719 if (NCTYPE(y)->ascii_range) {
2720 if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end))
2721 return NCTYPE(y)->not;
2722 else
2723 return !(NCTYPE(y)->not);
2725 else {
2726 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2727 return NCTYPE(y)->not;
2728 else
2729 return !(NCTYPE(y)->not);
2731 break;
2732 default:
2733 break;
2735 break;
2737 case NT_CCLASS:
2739 CClassNode* cc = NCCLASS(y);
2741 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2742 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2743 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2745 break;
2747 case NT_STR:
2749 UChar *q;
2750 StrNode* ys = NSTR(y);
2751 len = NSTRING_LEN(x);
2752 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2753 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2754 /* tiny version */
2755 return 0;
2757 else {
2758 for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) {
2759 if (*p != *q) return 1;
2763 break;
2765 default:
2766 break;
2769 break;
2771 default:
2772 break;
2775 return 0;
2778 static Node*
2779 get_head_value_node(Node* node, int exact, regex_t* reg)
2781 Node* n = NULL_NODE;
2783 switch (NTYPE(node)) {
2784 case NT_BREF:
2785 case NT_ALT:
2786 case NT_CANY:
2787 #ifdef USE_SUBEXP_CALL
2788 case NT_CALL:
2789 #endif
2790 break;
2792 case NT_CTYPE:
2793 case NT_CCLASS:
2794 if (exact == 0) {
2795 n = node;
2797 break;
2799 case NT_LIST:
2800 n = get_head_value_node(NCAR(node), exact, reg);
2801 break;
2803 case NT_STR:
2805 StrNode* sn = NSTR(node);
2807 if (sn->end <= sn->s)
2808 break;
2810 if (exact != 0 &&
2811 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2813 else {
2814 n = node;
2817 break;
2819 case NT_QTFR:
2821 QtfrNode* qn = NQTFR(node);
2822 if (qn->lower > 0) {
2823 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
2824 if (IS_NOT_NULL(qn->head_exact))
2825 n = qn->head_exact;
2826 else
2827 #endif
2828 n = get_head_value_node(qn->target, exact, reg);
2831 break;
2833 case NT_ENCLOSE:
2835 EncloseNode* en = NENCLOSE(node);
2836 switch (en->type) {
2837 case ENCLOSE_OPTION:
2839 OnigOptionType options = reg->options;
2841 reg->options = NENCLOSE(node)->option;
2842 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2843 reg->options = options;
2845 break;
2847 case ENCLOSE_MEMORY:
2848 case ENCLOSE_STOP_BACKTRACK:
2849 case ENCLOSE_CONDITION:
2850 n = get_head_value_node(en->target, exact, reg);
2851 break;
2853 case ENCLOSE_ABSENT:
2854 break;
2857 break;
2859 case NT_ANCHOR:
2860 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2861 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2862 break;
2864 default:
2865 break;
2868 return n;
2871 static int
2872 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2874 int type, r = 0;
2876 type = NTYPE(node);
2877 if ((NTYPE2BIT(type) & type_mask) == 0)
2878 return 1;
2880 switch (type) {
2881 case NT_LIST:
2882 case NT_ALT:
2883 do {
2884 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2885 anchor_mask);
2886 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2887 break;
2889 case NT_QTFR:
2890 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2891 anchor_mask);
2892 break;
2894 case NT_ENCLOSE:
2896 EncloseNode* en = NENCLOSE(node);
2897 if ((en->type & enclose_mask) == 0)
2898 return 1;
2900 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2902 break;
2904 case NT_ANCHOR:
2905 type = NANCHOR(node)->type;
2906 if ((type & anchor_mask) == 0)
2907 return 1;
2909 if (NANCHOR(node)->target)
2910 r = check_type_tree(NANCHOR(node)->target,
2911 type_mask, enclose_mask, anchor_mask);
2912 break;
2914 default:
2915 break;
2917 return r;
2920 #ifdef USE_SUBEXP_CALL
2922 # define RECURSION_EXIST 1
2923 # define RECURSION_INFINITE 2
2925 static int
2926 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2928 int type;
2929 int r = 0;
2931 type = NTYPE(node);
2932 switch (type) {
2933 case NT_LIST:
2935 Node *x;
2936 OnigDistance min;
2937 int ret;
2939 x = node;
2940 do {
2941 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2942 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2943 r |= ret;
2944 if (head) {
2945 ret = get_min_match_length(NCAR(x), &min, env);
2946 if (ret != 0) return ret;
2947 if (min != 0) head = 0;
2949 } while (IS_NOT_NULL(x = NCDR(x)));
2951 break;
2953 case NT_ALT:
2955 int ret;
2956 r = RECURSION_EXIST;
2957 do {
2958 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2959 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2960 r &= ret;
2961 } while (IS_NOT_NULL(node = NCDR(node)));
2963 break;
2965 case NT_QTFR:
2966 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2967 if (r == RECURSION_EXIST) {
2968 if (NQTFR(node)->lower == 0) r = 0;
2970 break;
2972 case NT_ANCHOR:
2974 AnchorNode* an = NANCHOR(node);
2975 switch (an->type) {
2976 case ANCHOR_PREC_READ:
2977 case ANCHOR_PREC_READ_NOT:
2978 case ANCHOR_LOOK_BEHIND:
2979 case ANCHOR_LOOK_BEHIND_NOT:
2980 r = subexp_inf_recursive_check(an->target, env, head);
2981 break;
2984 break;
2986 case NT_CALL:
2987 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2988 break;
2990 case NT_ENCLOSE:
2991 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2992 return 0;
2993 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2994 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2995 else {
2996 SET_ENCLOSE_STATUS(node, NST_MARK2);
2997 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2998 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
3000 break;
3002 default:
3003 break;
3006 return r;
3009 static int
3010 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
3012 int type;
3013 int r = 0;
3015 type = NTYPE(node);
3016 switch (type) {
3017 case NT_LIST:
3018 case NT_ALT:
3019 do {
3020 r = subexp_inf_recursive_check_trav(NCAR(node), env);
3021 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3022 break;
3024 case NT_QTFR:
3025 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
3026 break;
3028 case NT_ANCHOR:
3030 AnchorNode* an = NANCHOR(node);
3031 switch (an->type) {
3032 case ANCHOR_PREC_READ:
3033 case ANCHOR_PREC_READ_NOT:
3034 case ANCHOR_LOOK_BEHIND:
3035 case ANCHOR_LOOK_BEHIND_NOT:
3036 r = subexp_inf_recursive_check_trav(an->target, env);
3037 break;
3040 break;
3042 case NT_ENCLOSE:
3044 EncloseNode* en = NENCLOSE(node);
3046 if (IS_ENCLOSE_RECURSION(en)) {
3047 SET_ENCLOSE_STATUS(node, NST_MARK1);
3048 r = subexp_inf_recursive_check(en->target, env, 1);
3049 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
3050 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3052 r = subexp_inf_recursive_check_trav(en->target, env);
3055 break;
3057 default:
3058 break;
3061 return r;
3064 static int
3065 subexp_recursive_check(Node* node)
3067 int r = 0;
3069 switch (NTYPE(node)) {
3070 case NT_LIST:
3071 case NT_ALT:
3072 do {
3073 r |= subexp_recursive_check(NCAR(node));
3074 } while (IS_NOT_NULL(node = NCDR(node)));
3075 break;
3077 case NT_QTFR:
3078 r = subexp_recursive_check(NQTFR(node)->target);
3079 break;
3081 case NT_ANCHOR:
3083 AnchorNode* an = NANCHOR(node);
3084 switch (an->type) {
3085 case ANCHOR_PREC_READ:
3086 case ANCHOR_PREC_READ_NOT:
3087 case ANCHOR_LOOK_BEHIND:
3088 case ANCHOR_LOOK_BEHIND_NOT:
3089 r = subexp_recursive_check(an->target);
3090 break;
3093 break;
3095 case NT_CALL:
3096 r = subexp_recursive_check(NCALL(node)->target);
3097 if (r != 0) SET_CALL_RECURSION(node);
3098 break;
3100 case NT_ENCLOSE:
3101 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
3102 return 0;
3103 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
3104 return 1; /* recursion */
3105 else {
3106 SET_ENCLOSE_STATUS(node, NST_MARK2);
3107 r = subexp_recursive_check(NENCLOSE(node)->target);
3108 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
3110 break;
3112 default:
3113 break;
3116 return r;
3120 static int
3121 subexp_recursive_check_trav(Node* node, ScanEnv* env)
3123 # define FOUND_CALLED_NODE 1
3125 int type;
3126 int r = 0;
3128 type = NTYPE(node);
3129 switch (type) {
3130 case NT_LIST:
3131 case NT_ALT:
3133 int ret;
3134 do {
3135 ret = subexp_recursive_check_trav(NCAR(node), env);
3136 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3137 else if (ret < 0) return ret;
3138 } while (IS_NOT_NULL(node = NCDR(node)));
3140 break;
3142 case NT_QTFR:
3143 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
3144 if (NQTFR(node)->upper == 0) {
3145 if (r == FOUND_CALLED_NODE)
3146 NQTFR(node)->is_referred = 1;
3148 break;
3150 case NT_ANCHOR:
3152 AnchorNode* an = NANCHOR(node);
3153 switch (an->type) {
3154 case ANCHOR_PREC_READ:
3155 case ANCHOR_PREC_READ_NOT:
3156 case ANCHOR_LOOK_BEHIND:
3157 case ANCHOR_LOOK_BEHIND_NOT:
3158 r = subexp_recursive_check_trav(an->target, env);
3159 break;
3162 break;
3164 case NT_ENCLOSE:
3166 EncloseNode* en = NENCLOSE(node);
3168 if (! IS_ENCLOSE_RECURSION(en)) {
3169 if (IS_ENCLOSE_CALLED(en)) {
3170 SET_ENCLOSE_STATUS(node, NST_MARK1);
3171 r = subexp_recursive_check(en->target);
3172 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
3173 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3176 r = subexp_recursive_check_trav(en->target, env);
3177 if (IS_ENCLOSE_CALLED(en))
3178 r |= FOUND_CALLED_NODE;
3180 break;
3182 default:
3183 break;
3186 return r;
3189 static int
3190 setup_subexp_call(Node* node, ScanEnv* env)
3192 int type;
3193 int r = 0;
3195 type = NTYPE(node);
3196 switch (type) {
3197 case NT_LIST:
3198 do {
3199 r = setup_subexp_call(NCAR(node), env);
3200 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3201 break;
3203 case NT_ALT:
3204 do {
3205 r = setup_subexp_call(NCAR(node), env);
3206 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3207 break;
3209 case NT_QTFR:
3210 r = setup_subexp_call(NQTFR(node)->target, env);
3211 break;
3212 case NT_ENCLOSE:
3213 r = setup_subexp_call(NENCLOSE(node)->target, env);
3214 break;
3216 case NT_CALL:
3218 CallNode* cn = NCALL(node);
3219 Node** nodes = SCANENV_MEM_NODES(env);
3221 if (cn->group_num != 0) {
3222 int gnum = cn->group_num;
3224 # ifdef USE_NAMED_GROUP
3225 if (env->num_named > 0 &&
3226 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3227 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3228 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3230 # endif
3231 if (gnum > env->num_mem) {
3232 onig_scan_env_set_error_string(env,
3233 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3234 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3237 # ifdef USE_NAMED_GROUP
3238 set_call_attr:
3239 # endif
3240 cn->target = nodes[cn->group_num];
3241 if (IS_NULL(cn->target)) {
3242 onig_scan_env_set_error_string(env,
3243 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3244 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3246 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3247 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3248 cn->unset_addr_list = env->unset_addr_list;
3250 # ifdef USE_NAMED_GROUP
3251 # ifdef USE_PERL_SUBEXP_CALL
3252 else if (cn->name == cn->name_end) {
3253 goto set_call_attr;
3255 # endif
3256 else {
3257 int *refs;
3259 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3260 &refs);
3261 if (n <= 0) {
3262 onig_scan_env_set_error_string(env,
3263 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3264 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3266 else if (n > 1 &&
3267 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) {
3268 onig_scan_env_set_error_string(env,
3269 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3270 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3272 else {
3273 cn->group_num = refs[0];
3274 goto set_call_attr;
3277 # endif
3279 break;
3281 case NT_ANCHOR:
3283 AnchorNode* an = NANCHOR(node);
3285 switch (an->type) {
3286 case ANCHOR_PREC_READ:
3287 case ANCHOR_PREC_READ_NOT:
3288 case ANCHOR_LOOK_BEHIND:
3289 case ANCHOR_LOOK_BEHIND_NOT:
3290 r = setup_subexp_call(an->target, env);
3291 break;
3294 break;
3296 default:
3297 break;
3300 return r;
3302 #endif
3304 /* divide different length alternatives in look-behind.
3305 (?<=A|B) ==> (?<=A)|(?<=B)
3306 (?<!A|B) ==> (?<!A)(?<!B)
3308 static int
3309 divide_look_behind_alternatives(Node* node)
3311 Node *head, *np, *insert_node;
3312 AnchorNode* an = NANCHOR(node);
3313 int anc_type = an->type;
3315 head = an->target;
3316 np = NCAR(head);
3317 swap_node(node, head);
3318 NCAR(node) = head;
3319 NANCHOR(head)->target = np;
3321 np = node;
3322 while ((np = NCDR(np)) != NULL_NODE) {
3323 insert_node = onig_node_new_anchor(anc_type);
3324 CHECK_NULL_RETURN_MEMERR(insert_node);
3325 NANCHOR(insert_node)->target = NCAR(np);
3326 NCAR(np) = insert_node;
3329 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3330 np = node;
3331 do {
3332 SET_NTYPE(np, NT_LIST); /* alt -> list */
3333 } while ((np = NCDR(np)) != NULL_NODE);
3335 return 0;
3338 static int
3339 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3341 int r, len;
3342 AnchorNode* an = NANCHOR(node);
3344 r = get_char_length_tree(an->target, reg, &len);
3345 if (r == 0)
3346 an->char_len = len;
3347 else if (r == GET_CHAR_LEN_VARLEN)
3348 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3349 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3350 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3351 r = divide_look_behind_alternatives(node);
3352 else
3353 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3356 return r;
3359 static int
3360 next_setup(Node* node, Node* next_node, regex_t* reg)
3362 int type;
3364 retry:
3365 type = NTYPE(node);
3366 if (type == NT_QTFR) {
3367 QtfrNode* qn = NQTFR(node);
3368 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3369 #ifdef USE_QTFR_PEEK_NEXT
3370 Node* n = get_head_value_node(next_node, 1, reg);
3371 /* '\0': for UTF-16BE etc... */
3372 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3373 qn->next_head_exact = n;
3375 #endif
3376 /* automatic possessification a*b ==> (?>a*)b */
3377 if (qn->lower <= 1) {
3378 int ttype = NTYPE(qn->target);
3379 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3380 Node *x, *y;
3381 x = get_head_value_node(qn->target, 0, reg);
3382 if (IS_NOT_NULL(x)) {
3383 y = get_head_value_node(next_node, 0, reg);
3384 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3385 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3386 CHECK_NULL_RETURN_MEMERR(en);
3387 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3388 swap_node(node, en);
3389 NENCLOSE(node)->target = en;
3396 else if (type == NT_ENCLOSE) {
3397 EncloseNode* en = NENCLOSE(node);
3398 if (en->type == ENCLOSE_MEMORY && !IS_ENCLOSE_CALLED(en)) {
3399 node = en->target;
3400 goto retry;
3403 return 0;
3407 static int
3408 update_string_node_case_fold(regex_t* reg, Node *node)
3410 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3411 UChar *sbuf, *ebuf, *sp;
3412 int r, i, len;
3413 OnigDistance sbuf_size;
3414 StrNode* sn = NSTR(node);
3416 end = sn->end;
3417 sbuf_size = (end - sn->s) * 2;
3418 sbuf = (UChar* )xmalloc(sbuf_size);
3419 CHECK_NULL_RETURN_MEMERR(sbuf);
3420 ebuf = sbuf + sbuf_size;
3422 sp = sbuf;
3423 p = sn->s;
3424 while (p < end) {
3425 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3426 for (i = 0; i < len; i++) {
3427 if (sp >= ebuf) {
3428 UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3429 if (IS_NULL(p)) {
3430 xfree(sbuf);
3431 return ONIGERR_MEMORY;
3433 sbuf = p;
3434 sp = sbuf + sbuf_size;
3435 sbuf_size *= 2;
3436 ebuf = sbuf + sbuf_size;
3439 *sp++ = buf[i];
3443 r = onig_node_str_set(node, sbuf, sp);
3445 xfree(sbuf);
3446 return r;
3449 static int
3450 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3451 regex_t* reg)
3453 int r;
3454 Node *node;
3456 node = onig_node_new_str(s, end);
3457 if (IS_NULL(node)) return ONIGERR_MEMORY;
3459 r = update_string_node_case_fold(reg, node);
3460 if (r != 0) {
3461 onig_node_free(node);
3462 return r;
3465 NSTRING_SET_AMBIG(node);
3466 NSTRING_SET_DONT_GET_OPT_INFO(node);
3467 *rnode = node;
3468 return 0;
3471 static int
3472 is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[],
3473 int slen)
3475 int i;
3477 for (i = 0; i < item_num; i++) {
3478 if (items[i].byte_len != slen) {
3479 return 1;
3481 if (items[i].code_len != 1) {
3482 return 1;
3485 return 0;
3488 static int
3489 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3490 UChar *p, int slen, UChar *end,
3491 regex_t* reg, Node **rnode)
3493 int r, i, j, len, varlen;
3494 Node *anode, *var_anode, *snode, *xnode, *an;
3495 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3497 *rnode = var_anode = NULL_NODE;
3499 varlen = 0;
3500 for (i = 0; i < item_num; i++) {
3501 if (items[i].byte_len != slen) {
3502 varlen = 1;
3503 break;
3507 if (varlen != 0) {
3508 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3509 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3511 xnode = onig_node_new_list(NULL, NULL);
3512 if (IS_NULL(xnode)) goto mem_err;
3513 NCAR(var_anode) = xnode;
3515 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3516 if (IS_NULL(anode)) goto mem_err;
3517 NCAR(xnode) = anode;
3519 else {
3520 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3521 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3524 snode = onig_node_new_str(p, p + slen);
3525 if (IS_NULL(snode)) goto mem_err;
3527 NCAR(anode) = snode;
3529 for (i = 0; i < item_num; i++) {
3530 snode = onig_node_new_str(NULL, NULL);
3531 if (IS_NULL(snode)) goto mem_err;
3533 for (j = 0; j < items[i].code_len; j++) {
3534 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3535 if (len < 0) {
3536 r = len;
3537 goto mem_err2;
3540 r = onig_node_str_cat(snode, buf, buf + len);
3541 if (r != 0) goto mem_err2;
3544 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3545 if (IS_NULL(an)) {
3546 goto mem_err2;
3549 if (items[i].byte_len != slen) {
3550 Node *rem;
3551 UChar *q = p + items[i].byte_len;
3553 if (q < end) {
3554 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3555 if (r != 0) {
3556 onig_node_free(an);
3557 goto mem_err2;
3560 xnode = onig_node_list_add(NULL_NODE, snode);
3561 if (IS_NULL(xnode)) {
3562 onig_node_free(an);
3563 onig_node_free(rem);
3564 goto mem_err2;
3566 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3567 onig_node_free(an);
3568 onig_node_free(xnode);
3569 onig_node_free(rem);
3570 goto mem_err;
3573 NCAR(an) = xnode;
3575 else {
3576 NCAR(an) = snode;
3579 NCDR(var_anode) = an;
3580 var_anode = an;
3582 else {
3583 NCAR(an) = snode;
3584 NCDR(anode) = an;
3585 anode = an;
3589 return varlen;
3591 mem_err2:
3592 onig_node_free(snode);
3594 mem_err:
3595 onig_node_free(*rnode);
3597 return ONIGERR_MEMORY;
3600 static int
3601 expand_case_fold_string(Node* node, regex_t* reg)
3603 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3605 int r, n, len, alt_num;
3606 int varlen = 0;
3607 UChar *start, *end, *p;
3608 Node *top_root, *root, *snode, *prev_node;
3609 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3610 StrNode* sn = NSTR(node);
3612 if (NSTRING_IS_AMBIG(node)) return 0;
3614 start = sn->s;
3615 end = sn->end;
3616 if (start >= end) return 0;
3618 r = 0;
3619 top_root = root = prev_node = snode = NULL_NODE;
3620 alt_num = 1;
3621 p = start;
3622 while (p < end) {
3623 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3624 p, end, items);
3625 if (n < 0) {
3626 r = n;
3627 goto err;
3630 len = enclen(reg->enc, p, end);
3632 varlen = is_case_fold_variable_len(n, items, len);
3633 if (n == 0 || varlen == 0) {
3634 if (IS_NULL(snode)) {
3635 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3636 onig_node_free(top_root);
3637 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3638 if (IS_NULL(root)) {
3639 onig_node_free(prev_node);
3640 goto mem_err;
3644 prev_node = snode = onig_node_new_str(NULL, NULL);
3645 if (IS_NULL(snode)) goto mem_err;
3646 if (IS_NOT_NULL(root)) {
3647 if (IS_NULL(onig_node_list_add(root, snode))) {
3648 onig_node_free(snode);
3649 goto mem_err;
3654 r = onig_node_str_cat(snode, p, p + len);
3655 if (r != 0) goto err;
3657 else {
3658 alt_num *= (n + 1);
3659 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3661 if (IS_NOT_NULL(snode)) {
3662 r = update_string_node_case_fold(reg, snode);
3663 if (r == 0) {
3664 NSTRING_SET_AMBIG(snode);
3667 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3668 onig_node_free(top_root);
3669 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3670 if (IS_NULL(root)) {
3671 onig_node_free(prev_node);
3672 goto mem_err;
3676 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3677 if (r < 0) goto mem_err;
3678 if (r == 1) {
3679 if (IS_NULL(root)) {
3680 top_root = prev_node;
3682 else {
3683 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3684 onig_node_free(prev_node);
3685 goto mem_err;
3689 root = NCAR(prev_node);
3691 else { /* r == 0 */
3692 if (IS_NOT_NULL(root)) {
3693 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3694 onig_node_free(prev_node);
3695 goto mem_err;
3700 snode = NULL_NODE;
3703 p += len;
3705 if (IS_NOT_NULL(snode)) {
3706 r = update_string_node_case_fold(reg, snode);
3707 if (r == 0) {
3708 NSTRING_SET_AMBIG(snode);
3712 if (p < end) {
3713 Node *srem;
3715 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3716 if (r != 0) goto mem_err;
3718 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3719 onig_node_free(top_root);
3720 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3721 if (IS_NULL(root)) {
3722 onig_node_free(srem);
3723 onig_node_free(prev_node);
3724 goto mem_err;
3728 if (IS_NULL(root)) {
3729 prev_node = srem;
3731 else {
3732 if (IS_NULL(onig_node_list_add(root, srem))) {
3733 onig_node_free(srem);
3734 goto mem_err;
3739 /* ending */
3740 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3741 swap_node(node, top_root);
3742 onig_node_free(top_root);
3743 return 0;
3745 mem_err:
3746 r = ONIGERR_MEMORY;
3748 err:
3749 onig_node_free(top_root);
3750 return r;
3754 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3756 # define CEC_THRES_NUM_BIG_REPEAT 512
3757 # define CEC_INFINITE_NUM 0x7fffffff
3759 # define CEC_IN_INFINITE_REPEAT (1<<0)
3760 # define CEC_IN_FINITE_REPEAT (1<<1)
3761 # define CEC_CONT_BIG_REPEAT (1<<2)
3763 static int
3764 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3766 int type;
3767 int r = state;
3769 type = NTYPE(node);
3770 switch (type) {
3771 case NT_LIST:
3773 do {
3774 r = setup_comb_exp_check(NCAR(node), r, env);
3775 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3777 break;
3779 case NT_ALT:
3781 int ret;
3782 do {
3783 ret = setup_comb_exp_check(NCAR(node), state, env);
3784 r |= ret;
3785 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3787 break;
3789 case NT_QTFR:
3791 int child_state = state;
3792 int add_state = 0;
3793 QtfrNode* qn = NQTFR(node);
3794 Node* target = qn->target;
3795 int var_num;
3797 if (! IS_REPEAT_INFINITE(qn->upper)) {
3798 if (qn->upper > 1) {
3799 /* {0,1}, {1,1} are allowed */
3800 child_state |= CEC_IN_FINITE_REPEAT;
3802 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3803 if (env->backrefed_mem == 0) {
3804 if (NTYPE(qn->target) == NT_ENCLOSE) {
3805 EncloseNode* en = NENCLOSE(qn->target);
3806 if (en->type == ENCLOSE_MEMORY) {
3807 if (NTYPE(en->target) == NT_QTFR) {
3808 QtfrNode* q = NQTFR(en->target);
3809 if (IS_REPEAT_INFINITE(q->upper)
3810 && q->greedy == qn->greedy) {
3811 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3812 if (qn->upper == 1)
3813 child_state = state;
3822 if (state & CEC_IN_FINITE_REPEAT) {
3823 qn->comb_exp_check_num = -1;
3825 else {
3826 if (IS_REPEAT_INFINITE(qn->upper)) {
3827 var_num = CEC_INFINITE_NUM;
3828 child_state |= CEC_IN_INFINITE_REPEAT;
3830 else {
3831 var_num = qn->upper - qn->lower;
3834 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3835 add_state |= CEC_CONT_BIG_REPEAT;
3837 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3838 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3839 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3840 if (qn->comb_exp_check_num == 0) {
3841 env->num_comb_exp_check++;
3842 qn->comb_exp_check_num = env->num_comb_exp_check;
3843 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3844 env->comb_exp_max_regnum = env->curr_max_regnum;
3849 r = setup_comb_exp_check(target, child_state, env);
3850 r |= add_state;
3852 break;
3854 case NT_ENCLOSE:
3856 EncloseNode* en = NENCLOSE(node);
3858 switch (en->type) {
3859 case ENCLOSE_MEMORY:
3861 if (env->curr_max_regnum < en->regnum)
3862 env->curr_max_regnum = en->regnum;
3864 r = setup_comb_exp_check(en->target, state, env);
3866 break;
3868 default:
3869 r = setup_comb_exp_check(en->target, state, env);
3870 break;
3873 break;
3875 # ifdef USE_SUBEXP_CALL
3876 case NT_CALL:
3877 if (IS_CALL_RECURSION(NCALL(node)))
3878 env->has_recursion = 1;
3879 else
3880 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3881 break;
3882 # endif
3884 default:
3885 break;
3888 return r;
3890 #endif
3892 #define IN_ALT (1<<0)
3893 #define IN_NOT (1<<1)
3894 #define IN_REPEAT (1<<2)
3895 #define IN_VAR_REPEAT (1<<3)
3896 #define IN_CALL (1<<4)
3897 #define IN_RECCALL (1<<5)
3899 /* setup_tree does the following work.
3900 1. check empty loop. (set qn->target_empty_info)
3901 2. expand ignore-case in char class.
3902 3. set memory status bit flags. (reg->mem_stats)
3903 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3904 5. find invalid patterns in look-behind.
3905 6. expand repeated string.
3907 static int
3908 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3910 int type;
3911 int r = 0;
3913 restart:
3914 type = NTYPE(node);
3915 switch (type) {
3916 case NT_LIST:
3918 Node* prev = NULL_NODE;
3919 do {
3920 r = setup_tree(NCAR(node), reg, state, env);
3921 if (IS_NOT_NULL(prev) && r == 0) {
3922 r = next_setup(prev, NCAR(node), reg);
3924 prev = NCAR(node);
3925 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3927 break;
3929 case NT_ALT:
3930 do {
3931 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3932 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3933 break;
3935 case NT_CCLASS:
3936 break;
3938 case NT_STR:
3939 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3940 r = expand_case_fold_string(node, reg);
3942 break;
3944 case NT_CTYPE:
3945 case NT_CANY:
3946 break;
3948 #ifdef USE_SUBEXP_CALL
3949 case NT_CALL:
3950 break;
3951 #endif
3953 case NT_BREF:
3955 int i;
3956 int* p;
3957 Node** nodes = SCANENV_MEM_NODES(env);
3958 BRefNode* br = NBREF(node);
3959 p = BACKREFS_P(br);
3960 for (i = 0; i < br->back_num; i++) {
3961 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3962 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3963 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3964 #ifdef USE_BACKREF_WITH_LEVEL
3965 if (IS_BACKREF_NEST_LEVEL(br)) {
3966 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3968 #endif
3969 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3972 break;
3974 case NT_QTFR:
3976 OnigDistance d;
3977 QtfrNode* qn = NQTFR(node);
3978 Node* target = qn->target;
3980 if ((state & IN_REPEAT) != 0) {
3981 qn->state |= NST_IN_REPEAT;
3984 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3985 r = get_min_match_length(target, &d, env);
3986 if (r) break;
3987 if (d == 0) {
3988 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3989 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3990 r = quantifiers_memory_node_info(target);
3991 if (r < 0) break;
3992 if (r > 0) {
3993 qn->target_empty_info = r;
3995 #endif
3996 #if 0
3997 r = get_max_match_length(target, &d, env);
3998 if (r == 0 && d == 0) {
3999 /* ()* ==> ()?, ()+ ==> () */
4000 qn->upper = 1;
4001 if (qn->lower > 1) qn->lower = 1;
4002 if (NTYPE(target) == NT_STR) {
4003 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
4006 #endif
4010 state |= IN_REPEAT;
4011 if (qn->lower != qn->upper)
4012 state |= IN_VAR_REPEAT;
4013 r = setup_tree(target, reg, state, env);
4014 if (r) break;
4016 /* expand string */
4017 #define EXPAND_STRING_MAX_LENGTH 100
4018 if (NTYPE(target) == NT_STR) {
4019 if (qn->lower > 1) {
4020 int i, n = qn->lower;
4021 OnigDistance len = NSTRING_LEN(target);
4022 StrNode* sn = NSTR(target);
4023 Node* np;
4025 np = onig_node_new_str(sn->s, sn->end);
4026 if (IS_NULL(np)) return ONIGERR_MEMORY;
4027 NSTR(np)->flag = sn->flag;
4029 for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) {
4030 r = onig_node_str_cat(np, sn->s, sn->end);
4031 if (r) {
4032 onig_node_free(np);
4033 return r;
4036 if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) {
4037 Node *np1, *np2;
4039 qn->lower -= i;
4040 if (! IS_REPEAT_INFINITE(qn->upper))
4041 qn->upper -= i;
4043 np1 = onig_node_new_list(np, NULL);
4044 if (IS_NULL(np1)) {
4045 onig_node_free(np);
4046 return ONIGERR_MEMORY;
4048 swap_node(np1, node);
4049 np2 = onig_node_list_add(node, np1);
4050 if (IS_NULL(np2)) {
4051 onig_node_free(np1);
4052 return ONIGERR_MEMORY;
4055 else {
4056 swap_node(np, node);
4057 onig_node_free(np);
4059 break; /* break case NT_QTFR: */
4063 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
4064 if (qn->greedy && (qn->target_empty_info != 0)) {
4065 if (NTYPE(target) == NT_QTFR) {
4066 QtfrNode* tqn = NQTFR(target);
4067 if (IS_NOT_NULL(tqn->head_exact)) {
4068 qn->head_exact = tqn->head_exact;
4069 tqn->head_exact = NULL;
4072 else {
4073 qn->head_exact = get_head_value_node(qn->target, 1, reg);
4076 #endif
4078 break;
4080 case NT_ENCLOSE:
4082 EncloseNode* en = NENCLOSE(node);
4084 switch (en->type) {
4085 case ENCLOSE_OPTION:
4087 OnigOptionType options = reg->options;
4088 reg->options = NENCLOSE(node)->option;
4089 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4090 reg->options = options;
4092 break;
4094 case ENCLOSE_MEMORY:
4095 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
4096 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
4097 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
4099 if (IS_ENCLOSE_CALLED(en))
4100 state |= IN_CALL;
4101 if (IS_ENCLOSE_RECURSION(en))
4102 state |= IN_RECCALL;
4103 else if ((state & IN_RECCALL) != 0)
4104 SET_CALL_RECURSION(node);
4105 r = setup_tree(en->target, reg, state, env);
4106 break;
4108 case ENCLOSE_STOP_BACKTRACK:
4110 Node* target = en->target;
4111 r = setup_tree(target, reg, state, env);
4112 if (NTYPE(target) == NT_QTFR) {
4113 QtfrNode* tqn = NQTFR(target);
4114 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
4115 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
4116 int qtype = NTYPE(tqn->target);
4117 if (IS_NODE_TYPE_SIMPLE(qtype))
4118 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
4122 break;
4124 case ENCLOSE_CONDITION:
4125 #ifdef USE_NAMED_GROUP
4126 if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) &&
4127 env->num_named > 0 &&
4128 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4129 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
4130 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4132 #endif
4133 if (NENCLOSE(node)->regnum > env->num_mem)
4134 return ONIGERR_INVALID_BACKREF;
4135 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4136 break;
4138 case ENCLOSE_ABSENT:
4139 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4140 break;
4143 break;
4145 case NT_ANCHOR:
4147 AnchorNode* an = NANCHOR(node);
4149 switch (an->type) {
4150 case ANCHOR_PREC_READ:
4151 r = setup_tree(an->target, reg, state, env);
4152 break;
4153 case ANCHOR_PREC_READ_NOT:
4154 r = setup_tree(an->target, reg, (state | IN_NOT), env);
4155 break;
4157 /* allowed node types in look-behind */
4158 #define ALLOWED_TYPE_IN_LB \
4159 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
4160 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
4162 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
4163 #define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
4165 #define ALLOWED_ANCHOR_IN_LB \
4166 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4167 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4168 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4169 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4170 #define ALLOWED_ANCHOR_IN_LB_NOT \
4171 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4172 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4173 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4174 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4176 case ANCHOR_LOOK_BEHIND:
4178 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4179 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
4180 if (r < 0) return r;
4181 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4182 if (NTYPE(node) != NT_ANCHOR) goto restart;
4183 r = setup_tree(an->target, reg, state, env);
4184 if (r != 0) return r;
4185 r = setup_look_behind(node, reg, env);
4187 break;
4189 case ANCHOR_LOOK_BEHIND_NOT:
4191 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4192 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4193 if (r < 0) return r;
4194 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4195 if (NTYPE(node) != NT_ANCHOR) goto restart;
4196 r = setup_tree(an->target, reg, (state | IN_NOT), env);
4197 if (r != 0) return r;
4198 r = setup_look_behind(node, reg, env);
4200 break;
4203 break;
4205 default:
4206 break;
4209 return r;
4212 #ifndef USE_SUNDAY_QUICK_SEARCH
4213 /* set skip map for Boyer-Moore search */
4214 static int
4215 set_bm_skip(UChar* s, UChar* end, regex_t* reg,
4216 UChar skip[], int** int_skip, int ignore_case)
4218 OnigDistance i, len;
4219 int clen, flen, n, j, k;
4220 UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
4221 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4222 OnigEncoding enc = reg->enc;
4224 len = end - s;
4225 if (len < ONIG_CHAR_TABLE_SIZE) {
4226 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len;
4228 n = 0;
4229 for (i = 0; i < len - 1; i += clen) {
4230 p = s + i;
4231 if (ignore_case)
4232 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4233 p, end, items);
4234 clen = enclen(enc, p, end);
4235 if (p + clen > end)
4236 clen = (int )(end - p);
4238 for (j = 0; j < n; j++) {
4239 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4240 return 1; /* different length isn't supported. */
4241 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4242 if (flen != clen)
4243 return 1; /* different length isn't supported. */
4245 for (j = 0; j < clen; j++) {
4246 skip[s[i + j]] = (UChar )(len - 1 - i - j);
4247 for (k = 0; k < n; k++) {
4248 skip[buf[k][j]] = (UChar )(len - 1 - i - j);
4253 else {
4254 # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
4255 /* This should not happen. */
4256 return ONIGERR_TYPE_BUG;
4257 # else
4258 if (IS_NULL(*int_skip)) {
4259 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
4260 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
4262 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len;
4264 n = 0;
4265 for (i = 0; i < len - 1; i += clen) {
4266 p = s + i;
4267 if (ignore_case)
4268 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4269 p, end, items);
4270 clen = enclen(enc, p, end);
4271 if (p + clen > end)
4272 clen = (int )(end - p);
4274 for (j = 0; j < n; j++) {
4275 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4276 return 1; /* different length isn't supported. */
4277 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4278 if (flen != clen)
4279 return 1; /* different length isn't supported. */
4281 for (j = 0; j < clen; j++) {
4282 (*int_skip)[s[i + j]] = (int )(len - 1 - i - j);
4283 for (k = 0; k < n; k++) {
4284 (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j);
4288 # endif
4290 return 0;
4293 #else /* USE_SUNDAY_QUICK_SEARCH */
4295 /* set skip map for Sunday's quick search */
4296 static int
4297 set_bm_skip(UChar* s, UChar* end, regex_t* reg,
4298 UChar skip[], int** int_skip, int ignore_case)
4300 OnigDistance i, len;
4301 int clen, flen, n, j, k;
4302 UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
4303 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4304 OnigEncoding enc = reg->enc;
4306 len = end - s;
4307 if (len < ONIG_CHAR_TABLE_SIZE) {
4308 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1);
4310 n = 0;
4311 for (i = 0; i < len; i += clen) {
4312 p = s + i;
4313 if (ignore_case)
4314 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4315 p, end, items);
4316 clen = enclen(enc, p, end);
4317 if (p + clen > end)
4318 clen = (int )(end - p);
4320 for (j = 0; j < n; j++) {
4321 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4322 return 1; /* different length isn't supported. */
4323 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4324 if (flen != clen)
4325 return 1; /* different length isn't supported. */
4327 for (j = 0; j < clen; j++) {
4328 skip[s[i + j]] = (UChar )(len - i - j);
4329 for (k = 0; k < n; k++) {
4330 skip[buf[k][j]] = (UChar )(len - i - j);
4335 else {
4336 # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
4337 /* This should not happen. */
4338 return ONIGERR_TYPE_BUG;
4339 # else
4340 if (IS_NULL(*int_skip)) {
4341 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
4342 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
4344 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1);
4346 n = 0;
4347 for (i = 0; i < len; i += clen) {
4348 p = s + i;
4349 if (ignore_case)
4350 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4351 p, end, items);
4352 clen = enclen(enc, p, end);
4353 if (p + clen > end)
4354 clen = (int )(end - p);
4356 for (j = 0; j < n; j++) {
4357 if ((items[j].code_len != 1) || (items[j].byte_len != clen))
4358 return 1; /* different length isn't supported. */
4359 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
4360 if (flen != clen)
4361 return 1; /* different length isn't supported. */
4363 for (j = 0; j < clen; j++) {
4364 (*int_skip)[s[i + j]] = (int )(len - i - j);
4365 for (k = 0; k < n; k++) {
4366 (*int_skip)[buf[k][j]] = (int )(len - i - j);
4370 # endif
4372 return 0;
4374 #endif /* USE_SUNDAY_QUICK_SEARCH */
4376 typedef struct {
4377 OnigDistance min; /* min byte length */
4378 OnigDistance max; /* max byte length */
4379 } MinMaxLen;
4381 typedef struct {
4382 MinMaxLen mmd;
4383 OnigEncoding enc;
4384 OnigOptionType options;
4385 OnigCaseFoldType case_fold_flag;
4386 ScanEnv* scan_env;
4387 } OptEnv;
4389 typedef struct {
4390 int left_anchor;
4391 int right_anchor;
4392 } OptAncInfo;
4394 typedef struct {
4395 MinMaxLen mmd; /* info position */
4396 OptAncInfo anc;
4398 int reach_end;
4399 int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */
4400 int len;
4401 UChar s[OPT_EXACT_MAXLEN];
4402 } OptExactInfo;
4404 typedef struct {
4405 MinMaxLen mmd; /* info position */
4406 OptAncInfo anc;
4408 int value; /* weighted value */
4409 UChar map[ONIG_CHAR_TABLE_SIZE];
4410 } OptMapInfo;
4412 typedef struct {
4413 MinMaxLen len;
4415 OptAncInfo anc;
4416 OptExactInfo exb; /* boundary */
4417 OptExactInfo exm; /* middle */
4418 OptExactInfo expr; /* prec read (?=...) */
4420 OptMapInfo map; /* boundary */
4421 } NodeOptInfo;
4424 static int
4425 map_position_value(OnigEncoding enc, int i)
4427 static const short int ByteValTable[] = {
4428 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4429 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4430 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4431 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4432 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4433 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4434 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4435 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4438 if (i < numberof(ByteValTable)) {
4439 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4440 return 20;
4441 else
4442 return (int )ByteValTable[i];
4444 else
4445 return 4; /* Take it easy. */
4448 static int
4449 distance_value(MinMaxLen* mm)
4451 /* 1000 / (min-max-dist + 1) */
4452 static const short int dist_vals[] = {
4453 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4454 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4455 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4456 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4457 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4458 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4459 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4460 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4461 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4462 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4465 OnigDistance d;
4467 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4469 d = mm->max - mm->min;
4470 if (d < numberof(dist_vals))
4471 /* return dist_vals[d] * 16 / (mm->min + 12); */
4472 return (int )dist_vals[d];
4473 else
4474 return 1;
4477 static int
4478 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4480 if (v2 <= 0) return -1;
4481 if (v1 <= 0) return 1;
4483 v1 *= distance_value(d1);
4484 v2 *= distance_value(d2);
4486 if (v2 > v1) return 1;
4487 if (v2 < v1) return -1;
4489 if (d2->min < d1->min) return 1;
4490 if (d2->min > d1->min) return -1;
4491 return 0;
4494 static int
4495 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4497 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4501 static void
4502 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4504 mml->min = min;
4505 mml->max = max;
4508 static void
4509 clear_mml(MinMaxLen* mml)
4511 mml->min = mml->max = 0;
4514 static void
4515 copy_mml(MinMaxLen* to, MinMaxLen* from)
4517 to->min = from->min;
4518 to->max = from->max;
4521 static void
4522 add_mml(MinMaxLen* to, MinMaxLen* from)
4524 to->min = distance_add(to->min, from->min);
4525 to->max = distance_add(to->max, from->max);
4528 #if 0
4529 static void
4530 add_len_mml(MinMaxLen* to, OnigDistance len)
4532 to->min = distance_add(to->min, len);
4533 to->max = distance_add(to->max, len);
4535 #endif
4537 static void
4538 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4540 if (to->min > from->min) to->min = from->min;
4541 if (to->max < from->max) to->max = from->max;
4544 static void
4545 copy_opt_env(OptEnv* to, OptEnv* from)
4547 *to = *from;
4550 static void
4551 clear_opt_anc_info(OptAncInfo* anc)
4553 anc->left_anchor = 0;
4554 anc->right_anchor = 0;
4557 static void
4558 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4560 *to = *from;
4563 static void
4564 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4565 OnigDistance left_len, OnigDistance right_len)
4567 clear_opt_anc_info(to);
4569 to->left_anchor = left->left_anchor;
4570 if (left_len == 0) {
4571 to->left_anchor |= right->left_anchor;
4574 to->right_anchor = right->right_anchor;
4575 if (right_len == 0) {
4576 to->right_anchor |= left->right_anchor;
4578 else {
4579 to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
4583 static int
4584 is_left_anchor(int anc)
4586 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4587 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4588 anc == ANCHOR_PREC_READ_NOT)
4589 return 0;
4591 return 1;
4594 static int
4595 is_set_opt_anc_info(OptAncInfo* to, int anc)
4597 if ((to->left_anchor & anc) != 0) return 1;
4599 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4602 static void
4603 add_opt_anc_info(OptAncInfo* to, int anc)
4605 if (is_left_anchor(anc))
4606 to->left_anchor |= anc;
4607 else
4608 to->right_anchor |= anc;
4611 static void
4612 remove_opt_anc_info(OptAncInfo* to, int anc)
4614 if (is_left_anchor(anc))
4615 to->left_anchor &= ~anc;
4616 else
4617 to->right_anchor &= ~anc;
4620 static void
4621 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4623 to->left_anchor &= add->left_anchor;
4624 to->right_anchor &= add->right_anchor;
4627 static int
4628 is_full_opt_exact_info(OptExactInfo* ex)
4630 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4633 static void
4634 clear_opt_exact_info(OptExactInfo* ex)
4636 clear_mml(&ex->mmd);
4637 clear_opt_anc_info(&ex->anc);
4638 ex->reach_end = 0;
4639 ex->ignore_case = -1; /* unset */
4640 ex->len = 0;
4641 ex->s[0] = '\0';
4644 static void
4645 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4647 *to = *from;
4650 static void
4651 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4653 int i, j, len;
4654 UChar *p, *end;
4655 OptAncInfo tanc;
4657 if (to->ignore_case < 0)
4658 to->ignore_case = add->ignore_case;
4659 else if (to->ignore_case != add->ignore_case)
4660 return ; /* avoid */
4662 p = add->s;
4663 end = p + add->len;
4664 for (i = to->len; p < end; ) {
4665 len = enclen(enc, p, end);
4666 if (i + len > OPT_EXACT_MAXLEN) break;
4667 for (j = 0; j < len && p < end; j++)
4668 to->s[i++] = *p++;
4671 to->len = i;
4672 to->reach_end = (p == end ? add->reach_end : 0);
4674 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4675 if (! to->reach_end) tanc.right_anchor = 0;
4676 copy_opt_anc_info(&to->anc, &tanc);
4679 static void
4680 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4681 int raw ARG_UNUSED, OnigEncoding enc)
4683 int i, j, len;
4684 UChar *p;
4686 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4687 len = enclen(enc, p, end);
4688 if (i + len > OPT_EXACT_MAXLEN) break;
4689 for (j = 0; j < len && p < end; j++)
4690 to->s[i++] = *p++;
4693 to->len = i;
4696 static void
4697 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4699 int i, j, len;
4701 if (add->len == 0 || to->len == 0) {
4702 clear_opt_exact_info(to);
4703 return ;
4706 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4707 clear_opt_exact_info(to);
4708 return ;
4711 for (i = 0; i < to->len && i < add->len; ) {
4712 if (to->s[i] != add->s[i]) break;
4713 len = enclen(env->enc, to->s + i, to->s + to->len);
4715 for (j = 1; j < len; j++) {
4716 if (to->s[i+j] != add->s[i+j]) break;
4718 if (j < len) break;
4719 i += len;
4722 if (! add->reach_end || i < add->len || i < to->len) {
4723 to->reach_end = 0;
4725 to->len = i;
4726 if (to->ignore_case < 0)
4727 to->ignore_case = add->ignore_case;
4728 else if (add->ignore_case >= 0)
4729 to->ignore_case |= add->ignore_case;
4731 alt_merge_opt_anc_info(&to->anc, &add->anc);
4732 if (! to->reach_end) to->anc.right_anchor = 0;
4735 static void
4736 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4738 int v1, v2;
4740 v1 = now->len;
4741 v2 = alt->len;
4743 if (v2 == 0) {
4744 return ;
4746 else if (v1 == 0) {
4747 copy_opt_exact_info(now, alt);
4748 return ;
4750 else if (v1 <= 2 && v2 <= 2) {
4751 /* ByteValTable[x] is big value --> low price */
4752 v2 = map_position_value(enc, now->s[0]);
4753 v1 = map_position_value(enc, alt->s[0]);
4755 if (now->len > 1) v1 += 5;
4756 if (alt->len > 1) v2 += 5;
4759 if (now->ignore_case <= 0) v1 *= 2;
4760 if (alt->ignore_case <= 0) v2 *= 2;
4762 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4763 copy_opt_exact_info(now, alt);
4766 static void
4767 clear_opt_map_info(OptMapInfo* map)
4769 static const OptMapInfo clean_info = {
4770 {0, 0}, {0, 0}, 0,
4772 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4773 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4774 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4775 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4776 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4777 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4778 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4779 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4780 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4781 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4782 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4783 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4784 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4785 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4786 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4787 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4791 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4794 static void
4795 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4797 *to = *from;
4800 static void
4801 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4803 if (map->map[c] == 0) {
4804 map->map[c] = 1;
4805 map->value += map_position_value(enc, c);
4809 static int
4810 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4811 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4813 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4814 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4815 int i, n;
4817 add_char_opt_map_info(map, p[0], enc);
4819 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4820 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4821 if (n < 0) return n;
4823 for (i = 0; i < n; i++) {
4824 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4825 add_char_opt_map_info(map, buf[0], enc);
4828 return 0;
4831 static void
4832 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4834 const int z = 1<<15; /* 32768: something big value */
4836 int v1, v2;
4838 if (alt->value == 0) return ;
4839 if (now->value == 0) {
4840 copy_opt_map_info(now, alt);
4841 return ;
4844 v1 = z / now->value;
4845 v2 = z / alt->value;
4846 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4847 copy_opt_map_info(now, alt);
4850 static int
4851 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4853 #define COMP_EM_BASE 20
4854 int ve, vm;
4856 if (m->value <= 0) return -1;
4858 ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2);
4859 vm = COMP_EM_BASE * 5 * 2 / m->value;
4860 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4863 static void
4864 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4866 int i, val;
4868 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4869 if (to->value == 0) return ;
4870 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4871 clear_opt_map_info(to);
4872 return ;
4875 alt_merge_mml(&to->mmd, &add->mmd);
4877 val = 0;
4878 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4879 if (add->map[i])
4880 to->map[i] = 1;
4882 if (to->map[i])
4883 val += map_position_value(enc, i);
4885 to->value = val;
4887 alt_merge_opt_anc_info(&to->anc, &add->anc);
4890 static void
4891 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4893 copy_mml(&(opt->exb.mmd), mmd);
4894 copy_mml(&(opt->expr.mmd), mmd);
4895 copy_mml(&(opt->map.mmd), mmd);
4898 static void
4899 clear_node_opt_info(NodeOptInfo* opt)
4901 clear_mml(&opt->len);
4902 clear_opt_anc_info(&opt->anc);
4903 clear_opt_exact_info(&opt->exb);
4904 clear_opt_exact_info(&opt->exm);
4905 clear_opt_exact_info(&opt->expr);
4906 clear_opt_map_info(&opt->map);
4909 static void
4910 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4912 *to = *from;
4915 static void
4916 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4918 int exb_reach, exm_reach;
4919 OptAncInfo tanc;
4921 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4922 copy_opt_anc_info(&to->anc, &tanc);
4924 if (add->exb.len > 0 && to->len.max == 0) {
4925 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4926 to->len.max, add->len.max);
4927 copy_opt_anc_info(&add->exb.anc, &tanc);
4930 if (add->map.value > 0 && to->len.max == 0) {
4931 if (add->map.mmd.max == 0)
4932 add->map.anc.left_anchor |= to->anc.left_anchor;
4935 exb_reach = to->exb.reach_end;
4936 exm_reach = to->exm.reach_end;
4938 if (add->len.max != 0)
4939 to->exb.reach_end = to->exm.reach_end = 0;
4941 if (add->exb.len > 0) {
4942 if (exb_reach) {
4943 concat_opt_exact_info(&to->exb, &add->exb, enc);
4944 clear_opt_exact_info(&add->exb);
4946 else if (exm_reach) {
4947 concat_opt_exact_info(&to->exm, &add->exb, enc);
4948 clear_opt_exact_info(&add->exb);
4951 select_opt_exact_info(enc, &to->exm, &add->exb);
4952 select_opt_exact_info(enc, &to->exm, &add->exm);
4954 if (to->expr.len > 0) {
4955 if (add->len.max > 0) {
4956 if (to->expr.len > (int )add->len.max)
4957 to->expr.len = (int )add->len.max;
4959 if (to->expr.mmd.max == 0)
4960 select_opt_exact_info(enc, &to->exb, &to->expr);
4961 else
4962 select_opt_exact_info(enc, &to->exm, &to->expr);
4965 else if (add->expr.len > 0) {
4966 copy_opt_exact_info(&to->expr, &add->expr);
4969 select_opt_map_info(&to->map, &add->map);
4971 add_mml(&to->len, &add->len);
4974 static void
4975 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4977 alt_merge_opt_anc_info (&to->anc, &add->anc);
4978 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4979 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4980 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4981 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4983 alt_merge_mml(&to->len, &add->len);
4987 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4989 static int
4990 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4992 int type;
4993 int r = 0;
4995 clear_node_opt_info(opt);
4996 set_bound_node_opt_info(opt, &env->mmd);
4998 type = NTYPE(node);
4999 switch (type) {
5000 case NT_LIST:
5002 OptEnv nenv;
5003 NodeOptInfo nopt;
5004 Node* nd = node;
5006 copy_opt_env(&nenv, env);
5007 do {
5008 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
5009 if (r == 0) {
5010 add_mml(&nenv.mmd, &nopt.len);
5011 concat_left_node_opt_info(env->enc, opt, &nopt);
5013 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
5015 break;
5017 case NT_ALT:
5019 NodeOptInfo nopt;
5020 Node* nd = node;
5022 do {
5023 r = optimize_node_left(NCAR(nd), &nopt, env);
5024 if (r == 0) {
5025 if (nd == node) copy_node_opt_info(opt, &nopt);
5026 else alt_merge_node_opt_info(opt, &nopt, env);
5028 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
5030 break;
5032 case NT_STR:
5034 StrNode* sn = NSTR(node);
5035 OnigDistance slen = sn->end - sn->s;
5036 int is_raw = NSTRING_IS_RAW(node);
5038 if (! NSTRING_IS_AMBIG(node)) {
5039 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
5040 is_raw, env->enc);
5041 opt->exb.ignore_case = 0;
5042 if (slen > 0) {
5043 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
5045 set_mml(&opt->len, slen, slen);
5047 else {
5048 OnigDistance max;
5050 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
5051 int n = onigenc_strlen(env->enc, sn->s, sn->end);
5052 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n;
5054 else {
5055 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
5056 is_raw, env->enc);
5057 opt->exb.ignore_case = 1;
5059 if (slen > 0) {
5060 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
5061 env->enc, env->case_fold_flag);
5062 if (r != 0) break;
5065 max = slen;
5068 set_mml(&opt->len, slen, max);
5071 if ((OnigDistance )opt->exb.len == slen)
5072 opt->exb.reach_end = 1;
5074 break;
5076 case NT_CCLASS:
5078 int i, z;
5079 CClassNode* cc = NCCLASS(node);
5081 /* no need to check ignore case. (set in setup_tree()) */
5083 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
5084 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
5085 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5087 set_mml(&opt->len, min, max);
5089 else {
5090 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5091 z = BITSET_AT(cc->bs, i);
5092 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
5093 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5096 set_mml(&opt->len, 1, 1);
5099 break;
5101 case NT_CTYPE:
5103 int i, min, max;
5104 int maxcode;
5106 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5108 if (max == 1) {
5109 min = 1;
5111 maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
5112 switch (NCTYPE(node)->ctype) {
5113 case ONIGENC_CTYPE_WORD:
5114 if (NCTYPE(node)->not != 0) {
5115 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5116 if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) {
5117 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5121 else {
5122 for (i = 0; i < maxcode; i++) {
5123 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
5124 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5128 break;
5131 else {
5132 min = ONIGENC_MBC_MINLEN(env->enc);
5134 set_mml(&opt->len, min, max);
5136 break;
5138 case NT_CANY:
5140 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
5141 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5142 set_mml(&opt->len, min, max);
5144 break;
5146 case NT_ANCHOR:
5147 switch (NANCHOR(node)->type) {
5148 case ANCHOR_BEGIN_BUF:
5149 case ANCHOR_BEGIN_POSITION:
5150 case ANCHOR_BEGIN_LINE:
5151 case ANCHOR_END_BUF:
5152 case ANCHOR_SEMI_END_BUF:
5153 case ANCHOR_END_LINE:
5154 case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */
5155 case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */
5156 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
5157 break;
5159 case ANCHOR_PREC_READ:
5161 NodeOptInfo nopt;
5163 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
5164 if (r == 0) {
5165 if (nopt.exb.len > 0)
5166 copy_opt_exact_info(&opt->expr, &nopt.exb);
5167 else if (nopt.exm.len > 0)
5168 copy_opt_exact_info(&opt->expr, &nopt.exm);
5170 opt->expr.reach_end = 0;
5172 if (nopt.map.value > 0)
5173 copy_opt_map_info(&opt->map, &nopt.map);
5176 break;
5178 case ANCHOR_LOOK_BEHIND_NOT:
5179 break;
5181 break;
5183 case NT_BREF:
5185 int i;
5186 int* backs;
5187 OnigDistance min, max, tmin, tmax;
5188 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
5189 BRefNode* br = NBREF(node);
5191 if (br->state & NST_RECURSION) {
5192 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5193 break;
5195 backs = BACKREFS_P(br);
5196 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
5197 if (r != 0) break;
5198 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
5199 if (r != 0) break;
5200 for (i = 1; i < br->back_num; i++) {
5201 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
5202 if (r != 0) break;
5203 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
5204 if (r != 0) break;
5205 if (min > tmin) min = tmin;
5206 if (max < tmax) max = tmax;
5208 if (r == 0) set_mml(&opt->len, min, max);
5210 break;
5212 #ifdef USE_SUBEXP_CALL
5213 case NT_CALL:
5214 if (IS_CALL_RECURSION(NCALL(node)))
5215 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5216 else {
5217 OnigOptionType save = env->options;
5218 env->options = NENCLOSE(NCALL(node)->target)->option;
5219 r = optimize_node_left(NCALL(node)->target, opt, env);
5220 env->options = save;
5222 break;
5223 #endif
5225 case NT_QTFR:
5227 int i;
5228 OnigDistance min, max;
5229 NodeOptInfo nopt;
5230 QtfrNode* qn = NQTFR(node);
5232 r = optimize_node_left(qn->target, &nopt, env);
5233 if (r) break;
5235 if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) {
5236 if (env->mmd.max == 0 &&
5237 NTYPE(qn->target) == NT_CANY && qn->greedy) {
5238 if (IS_MULTILINE(env->options))
5239 /* implicit anchor: /.*a/ ==> /\A.*a/ */
5240 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
5241 else
5242 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
5245 else {
5246 if (qn->lower > 0) {
5247 copy_node_opt_info(opt, &nopt);
5248 if (nopt.exb.len > 0) {
5249 if (nopt.exb.reach_end) {
5250 for (i = 2; i <= qn->lower &&
5251 ! is_full_opt_exact_info(&opt->exb); i++) {
5252 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
5254 if (i < qn->lower) {
5255 opt->exb.reach_end = 0;
5260 if (qn->lower != qn->upper) {
5261 opt->exb.reach_end = 0;
5262 opt->exm.reach_end = 0;
5264 if (qn->lower > 1)
5265 opt->exm.reach_end = 0;
5269 min = distance_multiply(nopt.len.min, qn->lower);
5270 if (IS_REPEAT_INFINITE(qn->upper))
5271 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
5272 else
5273 max = distance_multiply(nopt.len.max, qn->upper);
5275 set_mml(&opt->len, min, max);
5277 break;
5279 case NT_ENCLOSE:
5281 EncloseNode* en = NENCLOSE(node);
5283 switch (en->type) {
5284 case ENCLOSE_OPTION:
5286 OnigOptionType save = env->options;
5288 env->options = en->option;
5289 r = optimize_node_left(en->target, opt, env);
5290 env->options = save;
5292 break;
5294 case ENCLOSE_MEMORY:
5295 #ifdef USE_SUBEXP_CALL
5296 en->opt_count++;
5297 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
5298 OnigDistance min, max;
5300 min = 0;
5301 max = ONIG_INFINITE_DISTANCE;
5302 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
5303 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
5304 set_mml(&opt->len, min, max);
5306 else
5307 #endif
5309 r = optimize_node_left(en->target, opt, env);
5311 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
5312 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
5313 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
5316 break;
5318 case ENCLOSE_STOP_BACKTRACK:
5319 case ENCLOSE_CONDITION:
5320 r = optimize_node_left(en->target, opt, env);
5321 break;
5323 case ENCLOSE_ABSENT:
5324 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5325 break;
5328 break;
5330 default:
5331 #ifdef ONIG_DEBUG
5332 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
5333 NTYPE(node));
5334 #endif
5335 r = ONIGERR_TYPE_BUG;
5336 break;
5339 return r;
5342 static int
5343 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
5345 int r;
5346 int allow_reverse;
5348 if (e->len == 0) return 0;
5350 reg->exact = (UChar* )xmalloc(e->len);
5351 CHECK_NULL_RETURN_MEMERR(reg->exact);
5352 xmemcpy(reg->exact, e->s, e->len);
5353 reg->exact_end = reg->exact + e->len;
5355 allow_reverse =
5356 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
5358 if (e->ignore_case > 0) {
5359 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5360 r = set_bm_skip(reg->exact, reg->exact_end, reg,
5361 reg->map, &(reg->int_map), 1);
5362 if (r == 0) {
5363 reg->optimize = (allow_reverse != 0
5364 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
5366 else {
5367 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5370 else {
5371 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5374 else {
5375 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5376 r = set_bm_skip(reg->exact, reg->exact_end, reg,
5377 reg->map, &(reg->int_map), 0);
5378 if (r == 0) {
5379 reg->optimize = (allow_reverse != 0
5380 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
5382 else {
5383 reg->optimize = ONIG_OPTIMIZE_EXACT;
5386 else {
5387 reg->optimize = ONIG_OPTIMIZE_EXACT;
5391 reg->dmin = e->mmd.min;
5392 reg->dmax = e->mmd.max;
5394 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5395 reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact));
5398 return 0;
5401 static void
5402 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
5404 int i;
5406 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5407 reg->map[i] = m->map[i];
5409 reg->optimize = ONIG_OPTIMIZE_MAP;
5410 reg->dmin = m->mmd.min;
5411 reg->dmax = m->mmd.max;
5413 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5414 reg->threshold_len = (int )(reg->dmin + 1);
5418 static void
5419 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
5421 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
5422 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
5425 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5426 static void print_optimize_info(FILE* f, regex_t* reg);
5427 #endif
5429 static int
5430 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5433 int r;
5434 NodeOptInfo opt;
5435 OptEnv env;
5437 env.enc = reg->enc;
5438 env.options = reg->options;
5439 env.case_fold_flag = reg->case_fold_flag;
5440 env.scan_env = scan_env;
5441 clear_mml(&env.mmd);
5443 r = optimize_node_left(node, &opt, &env);
5444 if (r) return r;
5446 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5447 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
5448 ANCHOR_LOOK_BEHIND);
5450 if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
5451 reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
5453 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
5454 ANCHOR_PREC_READ_NOT);
5456 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5457 reg->anchor_dmin = opt.len.min;
5458 reg->anchor_dmax = opt.len.max;
5461 if (opt.exb.len > 0 || opt.exm.len > 0) {
5462 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5463 if (opt.map.value > 0 &&
5464 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5465 goto set_map;
5467 else {
5468 r = set_optimize_exact_info(reg, &opt.exb);
5469 set_sub_anchor(reg, &opt.exb.anc);
5472 else if (opt.map.value > 0) {
5473 set_map:
5474 set_optimize_map_info(reg, &opt.map);
5475 set_sub_anchor(reg, &opt.map.anc);
5477 else {
5478 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5479 if (opt.len.max == 0)
5480 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5483 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5484 print_optimize_info(stderr, reg);
5485 #endif
5486 return r;
5489 static void
5490 clear_optimize_info(regex_t* reg)
5492 reg->optimize = ONIG_OPTIMIZE_NONE;
5493 reg->anchor = 0;
5494 reg->anchor_dmin = 0;
5495 reg->anchor_dmax = 0;
5496 reg->sub_anchor = 0;
5497 reg->exact_end = (UChar* )NULL;
5498 reg->threshold_len = 0;
5499 xfree(reg->exact);
5500 reg->exact = (UChar* )NULL;
5503 #ifdef ONIG_DEBUG
5505 static void print_enc_string(FILE* fp, OnigEncoding enc,
5506 const UChar *s, const UChar *end)
5508 fprintf(fp, "\nPATTERN: /");
5510 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5511 const UChar *p;
5512 OnigCodePoint code;
5514 p = s;
5515 while (p < end) {
5516 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5517 if (code >= 0x80) {
5518 fprintf(fp, " 0x%04x ", (int )code);
5520 else {
5521 fputc((int )code, fp);
5524 p += enclen(enc, p, end);
5527 else {
5528 while (s < end) {
5529 fputc((int )*s, fp);
5530 s++;
5534 fprintf(fp, "/ (%s)\n", enc->name);
5536 #endif /* ONIG_DEBUG */
5538 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5539 static void
5540 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5542 if (a == ONIG_INFINITE_DISTANCE)
5543 fputs("inf", f);
5544 else
5545 fprintf(f, "(%"PRIuPTR")", a);
5547 fputs("-", f);
5549 if (b == ONIG_INFINITE_DISTANCE)
5550 fputs("inf", f);
5551 else
5552 fprintf(f, "(%"PRIuPTR")", b);
5555 static void
5556 print_anchor(FILE* f, int anchor)
5558 int q = 0;
5560 fprintf(f, "[");
5562 if (anchor & ANCHOR_BEGIN_BUF) {
5563 fprintf(f, "begin-buf");
5564 q = 1;
5566 if (anchor & ANCHOR_BEGIN_LINE) {
5567 if (q) fprintf(f, ", ");
5568 q = 1;
5569 fprintf(f, "begin-line");
5571 if (anchor & ANCHOR_BEGIN_POSITION) {
5572 if (q) fprintf(f, ", ");
5573 q = 1;
5574 fprintf(f, "begin-pos");
5576 if (anchor & ANCHOR_END_BUF) {
5577 if (q) fprintf(f, ", ");
5578 q = 1;
5579 fprintf(f, "end-buf");
5581 if (anchor & ANCHOR_SEMI_END_BUF) {
5582 if (q) fprintf(f, ", ");
5583 q = 1;
5584 fprintf(f, "semi-end-buf");
5586 if (anchor & ANCHOR_END_LINE) {
5587 if (q) fprintf(f, ", ");
5588 q = 1;
5589 fprintf(f, "end-line");
5591 if (anchor & ANCHOR_ANYCHAR_STAR) {
5592 if (q) fprintf(f, ", ");
5593 q = 1;
5594 fprintf(f, "anychar-star");
5596 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5597 if (q) fprintf(f, ", ");
5598 fprintf(f, "anychar-star-ml");
5601 fprintf(f, "]");
5604 static void
5605 print_optimize_info(FILE* f, regex_t* reg)
5607 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5608 "EXACT_IC", "MAP",
5609 "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" };
5611 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5612 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5613 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5614 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5615 fprintf(f, "\n");
5617 if (reg->optimize) {
5618 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5619 fprintf(f, "\n");
5621 fprintf(f, "\n");
5623 if (reg->exact) {
5624 UChar *p;
5625 fprintf(f, "exact: [");
5626 for (p = reg->exact; p < reg->exact_end; p++) {
5627 fputc(*p, f);
5629 fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact));
5631 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5632 int c, i, n = 0;
5634 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5635 if (reg->map[i]) n++;
5637 fprintf(f, "map: n=%d\n", n);
5638 if (n > 0) {
5639 c = 0;
5640 fputc('[', f);
5641 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5642 if (reg->map[i] != 0) {
5643 if (c > 0) fputs(", ", f);
5644 c++;
5645 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5646 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5647 fputc(i, f);
5648 else
5649 fprintf(f, "%d", i);
5652 fprintf(f, "]\n");
5656 #endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */
5659 extern void
5660 onig_free_body(regex_t* reg)
5662 if (IS_NOT_NULL(reg)) {
5663 xfree(reg->p);
5664 xfree(reg->exact);
5665 xfree(reg->int_map);
5666 xfree(reg->int_map_backward);
5667 xfree(reg->repeat_range);
5668 onig_free(reg->chain);
5670 #ifdef USE_NAMED_GROUP
5671 onig_names_free(reg);
5672 #endif
5676 extern void
5677 onig_free(regex_t* reg)
5679 if (IS_NOT_NULL(reg)) {
5680 onig_free_body(reg);
5681 xfree(reg);
5685 static void*
5686 dup_copy(const void *ptr, size_t size)
5688 void *newptr = xmalloc(size);
5689 if (IS_NOT_NULL(newptr)) {
5690 memcpy(newptr, ptr, size);
5692 return newptr;
5695 extern int
5696 onig_reg_copy(regex_t** nreg, regex_t* oreg)
5698 if (IS_NOT_NULL(oreg)) {
5699 regex_t *reg = *nreg = (regex_t* )xmalloc(sizeof(regex_t));
5700 if (IS_NULL(reg)) return ONIGERR_MEMORY;
5702 *reg = *oreg;
5704 # define COPY_FAILED(mem, size) IS_NULL(reg->mem = dup_copy(reg->mem, size))
5706 if (IS_NOT_NULL(reg->exact)) {
5707 size_t exact_size = reg->exact_end - reg->exact;
5708 if (COPY_FAILED(exact, exact_size))
5709 goto err;
5710 (reg)->exact_end = (reg)->exact + exact_size;
5713 if (IS_NOT_NULL(reg->int_map)) {
5714 if (COPY_FAILED(int_map, sizeof(int) * ONIG_CHAR_TABLE_SIZE))
5715 goto err_int_map;
5717 if (IS_NOT_NULL(reg->int_map_backward)) {
5718 if (COPY_FAILED(int_map_backward, sizeof(int) * ONIG_CHAR_TABLE_SIZE))
5719 goto err_int_map_backward;
5721 if (IS_NOT_NULL(reg->p)) {
5722 if (COPY_FAILED(p, reg->alloc))
5723 goto err_p;
5725 if (IS_NOT_NULL(reg->repeat_range)) {
5726 if (COPY_FAILED(repeat_range, reg->repeat_range_alloc * sizeof(OnigRepeatRange)))
5727 goto err_repeat_range;
5729 if (IS_NOT_NULL(reg->name_table)) {
5730 if (onig_names_copy(reg, oreg))
5731 goto err_name_table;
5733 if (IS_NOT_NULL(reg->chain)) {
5734 if (onig_reg_copy(&reg->chain, reg->chain))
5735 goto err_chain;
5737 return 0;
5738 # undef COPY_FAILED
5740 err_chain:
5741 onig_names_free(reg);
5742 err_name_table:
5743 xfree(reg->repeat_range);
5744 err_repeat_range:
5745 xfree(reg->p);
5746 err_p:
5747 xfree(reg->int_map_backward);
5748 err_int_map_backward:
5749 xfree(reg->int_map);
5750 err_int_map:
5751 xfree(reg->exact);
5752 err:
5753 xfree(reg);
5754 return ONIGERR_MEMORY;
5756 return 0;
5759 #ifdef RUBY
5760 size_t
5761 onig_memsize(const regex_t *reg)
5763 size_t size = sizeof(regex_t);
5764 if (IS_NULL(reg)) return 0;
5765 if (IS_NOT_NULL(reg->p)) size += reg->alloc;
5766 if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
5767 if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5768 if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
5769 if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
5770 if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
5772 return size;
5775 size_t
5776 onig_region_memsize(const OnigRegion *regs)
5778 size_t size = sizeof(*regs);
5779 if (IS_NULL(regs)) return 0;
5780 size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end));
5781 return size;
5783 #endif
5785 #define REGEX_TRANSFER(to,from) do {\
5786 onig_free_body(to);\
5787 xmemcpy(to, from, sizeof(regex_t));\
5788 xfree(from);\
5789 } while (0)
5791 #if 0
5792 extern void
5793 onig_transfer(regex_t* to, regex_t* from)
5795 REGEX_TRANSFER(to, from);
5797 #endif
5799 #ifdef ONIG_DEBUG_COMPILE
5800 static void print_compiled_byte_code_list(FILE* f, regex_t* reg);
5801 #endif
5802 #ifdef ONIG_DEBUG_PARSE_TREE
5803 static void print_tree(FILE* f, Node* node);
5804 #endif
5806 #ifdef RUBY
5807 extern int
5808 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5809 OnigErrorInfo* einfo)
5811 return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0);
5813 #endif
5815 #ifdef RUBY
5816 extern int
5817 onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5818 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
5819 #else
5820 extern int
5821 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5822 OnigErrorInfo* einfo)
5823 #endif
5825 #define COMPILE_INIT_SIZE 20
5827 int r;
5828 OnigDistance init_size;
5829 Node* root;
5830 ScanEnv scan_env = {0};
5831 #ifdef USE_SUBEXP_CALL
5832 UnsetAddrList uslist;
5833 #endif
5835 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5837 #ifdef RUBY
5838 scan_env.sourcefile = sourcefile;
5839 scan_env.sourceline = sourceline;
5840 #endif
5842 #ifdef ONIG_DEBUG
5843 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5844 #endif
5846 if (reg->alloc == 0) {
5847 init_size = (pattern_end - pattern) * 2;
5848 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5849 r = BBUF_INIT(reg, init_size);
5850 if (r != 0) goto end;
5852 else
5853 reg->used = 0;
5855 reg->num_mem = 0;
5856 reg->num_repeat = 0;
5857 reg->num_null_check = 0;
5858 reg->repeat_range_alloc = 0;
5859 reg->repeat_range = (OnigRepeatRange* )NULL;
5860 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5861 reg->num_comb_exp_check = 0;
5862 #endif
5864 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5865 if (r != 0) goto err;
5867 #ifdef ONIG_DEBUG_PARSE_TREE
5868 # if 0
5869 fprintf(stderr, "ORIGINAL PARSE TREE:\n");
5870 print_tree(stderr, root);
5871 # endif
5872 #endif
5874 #ifdef USE_NAMED_GROUP
5875 /* mixed use named group and no-named group */
5876 if (scan_env.num_named > 0 &&
5877 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5878 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5879 if (scan_env.num_named != scan_env.num_mem)
5880 r = disable_noname_group_capture(&root, reg, &scan_env);
5881 else
5882 r = numbered_ref_check(root);
5884 if (r != 0) goto err;
5886 #endif
5888 #ifdef USE_SUBEXP_CALL
5889 if (scan_env.num_call > 0) {
5890 r = unset_addr_list_init(&uslist, scan_env.num_call);
5891 if (r != 0) goto err;
5892 scan_env.unset_addr_list = &uslist;
5893 r = setup_subexp_call(root, &scan_env);
5894 if (r != 0) goto err_unset;
5895 r = subexp_recursive_check_trav(root, &scan_env);
5896 if (r < 0) goto err_unset;
5897 r = subexp_inf_recursive_check_trav(root, &scan_env);
5898 if (r != 0) goto err_unset;
5900 reg->num_call = scan_env.num_call;
5902 else
5903 reg->num_call = 0;
5904 #endif
5906 r = setup_tree(root, reg, 0, &scan_env);
5907 if (r != 0) goto err_unset;
5909 #ifdef ONIG_DEBUG_PARSE_TREE
5910 print_tree(stderr, root);
5911 #endif
5913 reg->capture_history = scan_env.capture_history;
5914 reg->bt_mem_start = scan_env.bt_mem_start;
5915 reg->bt_mem_start |= reg->capture_history;
5916 if (IS_FIND_CONDITION(reg->options))
5917 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5918 else {
5919 reg->bt_mem_end = scan_env.bt_mem_end;
5920 reg->bt_mem_end |= reg->capture_history;
5923 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5924 if (scan_env.backrefed_mem == 0
5925 # ifdef USE_SUBEXP_CALL
5926 || scan_env.num_call == 0
5927 # endif
5929 setup_comb_exp_check(root, 0, &scan_env);
5930 # ifdef USE_SUBEXP_CALL
5931 if (scan_env.has_recursion != 0) {
5932 scan_env.num_comb_exp_check = 0;
5934 else
5935 # endif
5936 if (scan_env.comb_exp_max_regnum > 0) {
5937 int i;
5938 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5939 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5940 scan_env.num_comb_exp_check = 0;
5941 break;
5947 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5948 #endif
5950 clear_optimize_info(reg);
5951 #ifndef ONIG_DONT_OPTIMIZE
5952 r = set_optimize_info_from_tree(root, reg, &scan_env);
5953 if (r != 0) goto err_unset;
5954 #endif
5956 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5957 xfree(scan_env.mem_nodes_dynamic);
5958 scan_env.mem_nodes_dynamic = (Node** )NULL;
5961 r = compile_tree(root, reg);
5962 if (r == 0) {
5963 r = add_opcode(reg, OP_END);
5964 #ifdef USE_SUBEXP_CALL
5965 if (scan_env.num_call > 0) {
5966 r = unset_addr_list_fix(&uslist, reg);
5967 unset_addr_list_end(&uslist);
5968 if (r) goto err;
5970 #endif
5972 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5973 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5974 else {
5975 if (reg->bt_mem_start != 0)
5976 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5977 else
5978 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5981 #ifdef USE_SUBEXP_CALL
5982 else if (scan_env.num_call > 0) {
5983 unset_addr_list_end(&uslist);
5985 #endif
5986 onig_node_free(root);
5988 #ifdef ONIG_DEBUG_COMPILE
5989 # ifdef USE_NAMED_GROUP
5990 onig_print_names(stderr, reg);
5991 # endif
5992 print_compiled_byte_code_list(stderr, reg);
5993 #endif
5995 end:
5996 onig_reg_resize(reg);
5997 return r;
5999 err_unset:
6000 #ifdef USE_SUBEXP_CALL
6001 if (scan_env.num_call > 0) {
6002 unset_addr_list_end(&uslist);
6004 #endif
6005 err:
6006 if (IS_NOT_NULL(scan_env.error)) {
6007 if (IS_NOT_NULL(einfo)) {
6008 einfo->enc = scan_env.enc;
6009 einfo->par = scan_env.error;
6010 einfo->par_end = scan_env.error_end;
6014 onig_node_free(root);
6015 xfree(scan_env.mem_nodes_dynamic);
6017 return r;
6020 static int onig_inited = 0;
6022 extern int
6023 onig_reg_init(regex_t* reg, OnigOptionType option,
6024 OnigCaseFoldType case_fold_flag,
6025 OnigEncoding enc, const OnigSyntaxType* syntax)
6027 if (! onig_inited)
6028 onig_init();
6030 if (IS_NULL(reg))
6031 return ONIGERR_INVALID_ARGUMENT;
6033 if (ONIGENC_IS_UNDEF(enc))
6034 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;
6036 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
6037 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
6038 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
6041 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
6042 option |= syntax->options;
6043 option &= ~ONIG_OPTION_SINGLELINE;
6045 else
6046 option |= syntax->options;
6048 (reg)->enc = enc;
6049 (reg)->options = option;
6050 (reg)->syntax = syntax;
6051 (reg)->optimize = 0;
6052 (reg)->exact = (UChar* )NULL;
6053 (reg)->int_map = (int* )NULL;
6054 (reg)->int_map_backward = (int* )NULL;
6055 (reg)->chain = (regex_t* )NULL;
6057 (reg)->p = (UChar* )NULL;
6058 (reg)->alloc = 0;
6059 (reg)->used = 0;
6060 (reg)->name_table = (void* )NULL;
6062 (reg)->case_fold_flag = case_fold_flag;
6064 (reg)->timelimit = 0;
6066 return 0;
6069 extern int
6070 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
6071 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
6072 const OnigSyntaxType* syntax, OnigErrorInfo* einfo)
6074 int r;
6076 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
6077 if (r) return r;
6079 r = onig_compile(reg, pattern, pattern_end, einfo);
6080 return r;
6083 extern int
6084 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
6085 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
6086 OnigErrorInfo* einfo)
6088 *reg = (regex_t* )xmalloc(sizeof(regex_t));
6089 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
6091 int r = onig_new_without_alloc(*reg, pattern, pattern_end, option, enc, syntax, einfo);
6092 if (r) {
6093 onig_free(*reg);
6094 *reg = NULL;
6097 return r;
6100 extern int
6101 onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED)
6103 return onig_init();
6106 extern int
6107 onig_init(void)
6109 if (onig_inited != 0)
6110 return 0;
6112 onig_inited = 1;
6114 #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6115 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
6116 #endif
6118 onigenc_init();
6119 /* onigenc_set_default_caseconv_table((UChar* )0); */
6121 #ifdef ONIG_DEBUG_STATISTICS
6122 onig_statistics_init();
6123 #endif
6125 return 0;
6129 static OnigEndCallListItemType* EndCallTop;
6131 extern void onig_add_end_call(void (*func)(void))
6133 OnigEndCallListItemType* item;
6135 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
6136 if (item == 0) return ;
6138 item->next = EndCallTop;
6139 item->func = func;
6141 EndCallTop = item;
6144 static void
6145 exec_end_call_list(void)
6147 OnigEndCallListItemType* prev;
6148 void (*func)(void);
6150 while (EndCallTop != 0) {
6151 func = EndCallTop->func;
6152 (*func)();
6154 prev = EndCallTop;
6155 EndCallTop = EndCallTop->next;
6156 xfree(prev);
6160 extern int
6161 onig_end(void)
6163 exec_end_call_list();
6165 #ifdef ONIG_DEBUG_STATISTICS
6166 onig_print_statistics(stderr);
6167 #endif
6169 #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6170 _CrtDumpMemoryLeaks();
6171 #endif
6173 onig_inited = 0;
6175 return 0;
6178 extern int
6179 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
6181 OnigCodePoint n, *data;
6182 OnigCodePoint low, high, x;
6184 GET_CODE_POINT(n, p);
6185 data = (OnigCodePoint* )p;
6186 data++;
6188 for (low = 0, high = n; low < high; ) {
6189 x = (low + high) >> 1;
6190 if (code > data[x * 2 + 1])
6191 low = x + 1;
6192 else
6193 high = x;
6196 return ((low < n && code >= data[low * 2]) ? 1 : 0);
6199 extern int
6200 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
6202 int found;
6204 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
6205 if (IS_NULL(cc->mbuf)) {
6206 found = 0;
6208 else {
6209 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
6212 else {
6213 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
6216 if (IS_NCCLASS_NOT(cc))
6217 return !found;
6218 else
6219 return found;
6222 extern int
6223 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
6225 int len;
6227 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6228 len = 2;
6230 else {
6231 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
6233 return onig_is_code_in_cc_len(len, code, cc);
6237 #ifdef ONIG_DEBUG
6239 /* arguments type */
6240 # define ARG_SPECIAL -1
6241 # define ARG_NON 0
6242 # define ARG_RELADDR 1
6243 # define ARG_ABSADDR 2
6244 # define ARG_LENGTH 3
6245 # define ARG_MEMNUM 4
6246 # define ARG_OPTION 5
6247 # define ARG_STATE_CHECK 6
6249 OnigOpInfoType OnigOpInfo[] = {
6250 { OP_FINISH, "finish", ARG_NON },
6251 { OP_END, "end", ARG_NON },
6252 { OP_EXACT1, "exact1", ARG_SPECIAL },
6253 { OP_EXACT2, "exact2", ARG_SPECIAL },
6254 { OP_EXACT3, "exact3", ARG_SPECIAL },
6255 { OP_EXACT4, "exact4", ARG_SPECIAL },
6256 { OP_EXACT5, "exact5", ARG_SPECIAL },
6257 { OP_EXACTN, "exactn", ARG_SPECIAL },
6258 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
6259 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
6260 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
6261 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
6262 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
6263 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
6264 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
6265 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
6266 { OP_CCLASS, "cclass", ARG_SPECIAL },
6267 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
6268 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
6269 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
6270 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
6271 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
6272 { OP_ANYCHAR, "anychar", ARG_NON },
6273 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
6274 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
6275 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
6276 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
6277 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
6278 { OP_WORD, "word", ARG_NON },
6279 { OP_NOT_WORD, "not-word", ARG_NON },
6280 { OP_WORD_BOUND, "word-bound", ARG_NON },
6281 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
6282 { OP_WORD_BEGIN, "word-begin", ARG_NON },
6283 { OP_WORD_END, "word-end", ARG_NON },
6284 { OP_ASCII_WORD, "ascii-word", ARG_NON },
6285 { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON },
6286 { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON },
6287 { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON },
6288 { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON },
6289 { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON },
6290 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
6291 { OP_END_BUF, "end-buf", ARG_NON },
6292 { OP_BEGIN_LINE, "begin-line", ARG_NON },
6293 { OP_END_LINE, "end-line", ARG_NON },
6294 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
6295 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
6296 { OP_BACKREF1, "backref1", ARG_NON },
6297 { OP_BACKREF2, "backref2", ARG_NON },
6298 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
6299 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
6300 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
6301 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
6302 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
6303 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
6304 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
6305 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
6306 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
6307 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
6308 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
6309 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
6310 { OP_SET_OPTION, "set-option", ARG_OPTION },
6311 { OP_KEEP, "keep", ARG_NON },
6312 { OP_FAIL, "fail", ARG_NON },
6313 { OP_JUMP, "jump", ARG_RELADDR },
6314 { OP_PUSH, "push", ARG_RELADDR },
6315 { OP_POP, "pop", ARG_NON },
6316 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
6317 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
6318 { OP_REPEAT, "repeat", ARG_SPECIAL },
6319 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
6320 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
6321 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
6322 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
6323 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
6324 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
6325 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
6326 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
6327 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
6328 { OP_PUSH_POS, "push-pos", ARG_NON },
6329 { OP_POP_POS, "pop-pos", ARG_NON },
6330 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
6331 { OP_FAIL_POS, "fail-pos", ARG_NON },
6332 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
6333 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
6334 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
6335 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
6336 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
6337 { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON },
6338 { OP_ABSENT, "absent", ARG_RELADDR },
6339 { OP_ABSENT_END, "absent-end", ARG_NON },
6340 { OP_CALL, "call", ARG_ABSADDR },
6341 { OP_RETURN, "return", ARG_NON },
6342 { OP_CONDITION, "condition", ARG_SPECIAL },
6343 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
6344 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
6345 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
6346 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
6347 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
6348 "state-check-anychar-ml*", ARG_STATE_CHECK },
6349 { -1, "", ARG_NON }
6352 static const char*
6353 op2name(int opcode)
6355 int i;
6357 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6358 if (opcode == OnigOpInfo[i].opcode)
6359 return OnigOpInfo[i].name;
6361 return "";
6364 static int
6365 op2arg_type(int opcode)
6367 int i;
6369 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6370 if (opcode == OnigOpInfo[i].opcode)
6371 return OnigOpInfo[i].arg_type;
6373 return ARG_SPECIAL;
6376 # ifdef ONIG_DEBUG_PARSE_TREE
6377 static void
6378 Indent(FILE* f, int indent)
6380 int i;
6381 for (i = 0; i < indent; i++) putc(' ', f);
6383 # endif /* ONIG_DEBUG_PARSE_TREE */
6385 static void
6386 p_string(FILE* f, ptrdiff_t len, UChar* s)
6388 fputs(":", f);
6389 while (len-- > 0) { fputc(*s++, f); }
6392 static void
6393 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
6395 int x = len * mb_len;
6397 fprintf(f, ":%d:", len);
6398 while (x-- > 0) { fputc(*s++, f); }
6401 extern void
6402 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
6403 OnigEncoding enc)
6405 int i, n, arg_type;
6406 RelAddrType addr;
6407 LengthType len;
6408 MemNumType mem;
6409 StateCheckNumType scn;
6410 OnigCodePoint code;
6411 UChar *q;
6413 fprintf(f, "[%s", op2name(*bp));
6414 arg_type = op2arg_type(*bp);
6415 if (arg_type != ARG_SPECIAL) {
6416 bp++;
6417 switch (arg_type) {
6418 case ARG_NON:
6419 break;
6420 case ARG_RELADDR:
6421 GET_RELADDR_INC(addr, bp);
6422 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6423 break;
6424 case ARG_ABSADDR:
6425 GET_ABSADDR_INC(addr, bp);
6426 fprintf(f, ":(%d)", addr);
6427 break;
6428 case ARG_LENGTH:
6429 GET_LENGTH_INC(len, bp);
6430 fprintf(f, ":%d", len);
6431 break;
6432 case ARG_MEMNUM:
6433 mem = *((MemNumType* )bp);
6434 bp += SIZE_MEMNUM;
6435 fprintf(f, ":%d", mem);
6436 break;
6437 case ARG_OPTION:
6439 OnigOptionType option = *((OnigOptionType* )bp);
6440 bp += SIZE_OPTION;
6441 fprintf(f, ":%d", option);
6443 break;
6445 case ARG_STATE_CHECK:
6446 scn = *((StateCheckNumType* )bp);
6447 bp += SIZE_STATE_CHECK_NUM;
6448 fprintf(f, ":%d", scn);
6449 break;
6452 else {
6453 switch (*bp++) {
6454 case OP_EXACT1:
6455 case OP_ANYCHAR_STAR_PEEK_NEXT:
6456 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
6457 p_string(f, 1, bp++); break;
6458 case OP_EXACT2:
6459 p_string(f, 2, bp); bp += 2; break;
6460 case OP_EXACT3:
6461 p_string(f, 3, bp); bp += 3; break;
6462 case OP_EXACT4:
6463 p_string(f, 4, bp); bp += 4; break;
6464 case OP_EXACT5:
6465 p_string(f, 5, bp); bp += 5; break;
6466 case OP_EXACTN:
6467 GET_LENGTH_INC(len, bp);
6468 p_len_string(f, len, 1, bp);
6469 bp += len;
6470 break;
6472 case OP_EXACTMB2N1:
6473 p_string(f, 2, bp); bp += 2; break;
6474 case OP_EXACTMB2N2:
6475 p_string(f, 4, bp); bp += 4; break;
6476 case OP_EXACTMB2N3:
6477 p_string(f, 6, bp); bp += 6; break;
6478 case OP_EXACTMB2N:
6479 GET_LENGTH_INC(len, bp);
6480 p_len_string(f, len, 2, bp);
6481 bp += len * 2;
6482 break;
6483 case OP_EXACTMB3N:
6484 GET_LENGTH_INC(len, bp);
6485 p_len_string(f, len, 3, bp);
6486 bp += len * 3;
6487 break;
6488 case OP_EXACTMBN:
6490 int mb_len;
6492 GET_LENGTH_INC(mb_len, bp);
6493 GET_LENGTH_INC(len, bp);
6494 fprintf(f, ":%d:%d:", mb_len, len);
6495 n = len * mb_len;
6496 while (n-- > 0) { fputc(*bp++, f); }
6498 break;
6500 case OP_EXACT1_IC:
6501 len = enclen(enc, bp, bpend);
6502 p_string(f, len, bp);
6503 bp += len;
6504 break;
6505 case OP_EXACTN_IC:
6506 GET_LENGTH_INC(len, bp);
6507 p_len_string(f, len, 1, bp);
6508 bp += len;
6509 break;
6511 case OP_CCLASS:
6512 n = bitset_on_num((BitSetRef )bp);
6513 bp += SIZE_BITSET;
6514 fprintf(f, ":%d", n);
6515 break;
6517 case OP_CCLASS_NOT:
6518 n = bitset_on_num((BitSetRef )bp);
6519 bp += SIZE_BITSET;
6520 fprintf(f, ":%d", n);
6521 break;
6523 case OP_CCLASS_MB:
6524 case OP_CCLASS_MB_NOT:
6525 GET_LENGTH_INC(len, bp);
6526 q = bp;
6527 # ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6528 ALIGNMENT_RIGHT(q);
6529 # endif
6530 GET_CODE_POINT(code, q);
6531 bp += len;
6532 fprintf(f, ":%d:%d", (int )code, len);
6533 break;
6535 case OP_CCLASS_MIX:
6536 case OP_CCLASS_MIX_NOT:
6537 n = bitset_on_num((BitSetRef )bp);
6538 bp += SIZE_BITSET;
6539 GET_LENGTH_INC(len, bp);
6540 q = bp;
6541 # ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6542 ALIGNMENT_RIGHT(q);
6543 # endif
6544 GET_CODE_POINT(code, q);
6545 bp += len;
6546 fprintf(f, ":%d:%d:%d", n, (int )code, len);
6547 break;
6549 case OP_BACKREFN_IC:
6550 mem = *((MemNumType* )bp);
6551 bp += SIZE_MEMNUM;
6552 fprintf(f, ":%d", mem);
6553 break;
6555 case OP_BACKREF_MULTI_IC:
6556 case OP_BACKREF_MULTI:
6557 fputs(" ", f);
6558 GET_LENGTH_INC(len, bp);
6559 for (i = 0; i < len; i++) {
6560 GET_MEMNUM_INC(mem, bp);
6561 if (i > 0) fputs(", ", f);
6562 fprintf(f, "%d", mem);
6564 break;
6566 case OP_BACKREF_WITH_LEVEL:
6568 OnigOptionType option;
6569 LengthType level;
6571 GET_OPTION_INC(option, bp);
6572 fprintf(f, ":%d", option);
6573 GET_LENGTH_INC(level, bp);
6574 fprintf(f, ":%d", level);
6576 fputs(" ", f);
6577 GET_LENGTH_INC(len, bp);
6578 for (i = 0; i < len; i++) {
6579 GET_MEMNUM_INC(mem, bp);
6580 if (i > 0) fputs(", ", f);
6581 fprintf(f, "%d", mem);
6584 break;
6586 case OP_REPEAT:
6587 case OP_REPEAT_NG:
6589 mem = *((MemNumType* )bp);
6590 bp += SIZE_MEMNUM;
6591 addr = *((RelAddrType* )bp);
6592 bp += SIZE_RELADDR;
6593 fprintf(f, ":%d:%d", mem, addr);
6595 break;
6597 case OP_PUSH_OR_JUMP_EXACT1:
6598 case OP_PUSH_IF_PEEK_NEXT:
6599 addr = *((RelAddrType* )bp);
6600 bp += SIZE_RELADDR;
6601 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6602 p_string(f, 1, bp);
6603 bp += 1;
6604 break;
6606 case OP_LOOK_BEHIND:
6607 GET_LENGTH_INC(len, bp);
6608 fprintf(f, ":%d", len);
6609 break;
6611 case OP_PUSH_LOOK_BEHIND_NOT:
6612 GET_RELADDR_INC(addr, bp);
6613 GET_LENGTH_INC(len, bp);
6614 fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr);
6615 break;
6617 case OP_STATE_CHECK_PUSH:
6618 case OP_STATE_CHECK_PUSH_OR_JUMP:
6619 scn = *((StateCheckNumType* )bp);
6620 bp += SIZE_STATE_CHECK_NUM;
6621 addr = *((RelAddrType* )bp);
6622 bp += SIZE_RELADDR;
6623 fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr);
6624 break;
6626 case OP_CONDITION:
6627 GET_MEMNUM_INC(mem, bp);
6628 GET_RELADDR_INC(addr, bp);
6629 fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr);
6630 break;
6632 default:
6633 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6634 bp[-1]);
6637 fputs("]", f);
6638 if (nextp) *nextp = bp;
6641 # ifdef ONIG_DEBUG_COMPILE
6642 static void
6643 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6645 int ncode;
6646 UChar* bp = reg->p;
6647 UChar* end = reg->p + reg->used;
6649 fprintf(f, "code length: %d", reg->used);
6651 ncode = -1;
6652 while (bp < end) {
6653 ncode++;
6654 if (ncode % 5 == 0)
6655 fprintf(f, "\n%ld:", bp - reg->p);
6656 else
6657 fprintf(f, " %ld:", bp - reg->p);
6658 onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
6661 fprintf(f, "\n");
6663 # endif /* ONIG_DEBUG_COMPILE */
6665 # ifdef ONIG_DEBUG_PARSE_TREE
6666 static void
6667 print_indent_tree(FILE* f, Node* node, int indent)
6669 int i, type, container_p = 0;
6670 int add = 3;
6671 UChar* p;
6673 Indent(f, indent);
6674 if (IS_NULL(node)) {
6675 fprintf(f, "ERROR: null node!!!\n");
6676 exit (0);
6679 type = NTYPE(node);
6680 switch (type) {
6681 case NT_LIST:
6682 case NT_ALT:
6683 if (NTYPE(node) == NT_LIST)
6684 fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node);
6685 else
6686 fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node);
6688 print_indent_tree(f, NCAR(node), indent + add);
6689 while (IS_NOT_NULL(node = NCDR(node))) {
6690 if (NTYPE(node) != type) {
6691 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6692 exit(0);
6694 print_indent_tree(f, NCAR(node), indent + add);
6696 break;
6698 case NT_STR:
6699 fprintf(f, "<string%s:%"PRIxPTR">",
6700 (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node);
6701 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6702 if (*p >= 0x20 && *p < 0x7f)
6703 fputc(*p, f);
6704 else {
6705 fprintf(f, " 0x%02x", *p);
6708 break;
6710 case NT_CCLASS:
6711 fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node);
6712 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f);
6713 if (NCCLASS(node)->mbuf) {
6714 BBuf* bbuf = NCCLASS(node)->mbuf;
6715 OnigCodePoint* data = (OnigCodePoint* )bbuf->p;
6716 OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used);
6717 fprintf(f, "%d", *data++);
6718 for (; data < end; data+=2) {
6719 fprintf(f, ",");
6720 fprintf(f, "%04x-%04x", data[0], data[1]);
6723 break;
6725 case NT_CTYPE:
6726 fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node);
6727 switch (NCTYPE(node)->ctype) {
6728 case ONIGENC_CTYPE_WORD:
6729 if (NCTYPE(node)->not != 0)
6730 fputs("not word", f);
6731 else
6732 fputs("word", f);
6733 break;
6735 default:
6736 fprintf(f, "ERROR: undefined ctype.\n");
6737 exit(0);
6739 break;
6741 case NT_CANY:
6742 fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node);
6743 break;
6745 case NT_ANCHOR:
6746 fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node);
6747 switch (NANCHOR(node)->type) {
6748 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6749 case ANCHOR_END_BUF: fputs("end buf", f); break;
6750 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6751 case ANCHOR_END_LINE: fputs("end line", f); break;
6752 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6753 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6755 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6756 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6757 # ifdef USE_WORD_BEGIN_END
6758 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6759 case ANCHOR_WORD_END: fputs("word end", f); break;
6760 # endif
6761 case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break;
6762 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break;
6763 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break;
6764 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break;
6765 case ANCHOR_KEEP: fputs("keep",f); break;
6767 default:
6768 fprintf(f, "ERROR: undefined anchor type.\n");
6769 break;
6771 break;
6773 case NT_BREF:
6775 int* p;
6776 BRefNode* br = NBREF(node);
6777 p = BACKREFS_P(br);
6778 fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node);
6779 for (i = 0; i < br->back_num; i++) {
6780 if (i > 0) fputs(", ", f);
6781 fprintf(f, "%d", p[i]);
6784 break;
6786 # ifdef USE_SUBEXP_CALL
6787 case NT_CALL:
6789 CallNode* cn = NCALL(node);
6790 fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node);
6791 p_string(f, cn->name_end - cn->name, cn->name);
6793 break;
6794 # endif
6796 case NT_QTFR:
6797 fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node,
6798 NQTFR(node)->lower, NQTFR(node)->upper,
6799 (NQTFR(node)->greedy ? "" : "?"));
6800 print_indent_tree(f, NQTFR(node)->target, indent + add);
6801 break;
6803 case NT_ENCLOSE:
6804 fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node);
6805 switch (NENCLOSE(node)->type) {
6806 case ENCLOSE_OPTION:
6807 fprintf(f, "option:%d", NENCLOSE(node)->option);
6808 break;
6809 case ENCLOSE_MEMORY:
6810 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6811 break;
6812 case ENCLOSE_STOP_BACKTRACK:
6813 fprintf(f, "stop-bt");
6814 break;
6815 case ENCLOSE_CONDITION:
6816 fprintf(f, "condition:%d", NENCLOSE(node)->regnum);
6817 break;
6818 case ENCLOSE_ABSENT:
6819 fprintf(f, "absent");
6820 break;
6822 default:
6823 break;
6825 fprintf(f, "\n");
6826 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6827 break;
6829 default:
6830 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6831 break;
6834 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6835 type != NT_ENCLOSE)
6836 fprintf(f, "\n");
6838 if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add);
6840 fflush(f);
6843 static void
6844 print_tree(FILE* f, Node* node)
6846 print_indent_tree(f, node, 0);
6848 # endif /* ONIG_DEBUG_PARSE_TREE */
6849 #endif /* ONIG_DEBUG */