3 * David Leonard. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of David Leonard nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
30 /* $Id: regex.c 1282 2007-09-13 22:26:35Z d $ */
47 * Regular expression engine.
49 * This module contains a parser to compile ECMA-262 regular expressions
50 * into 'p-code', and a matcher engine that runs the p-code against
53 * NOTE: ECMA-262 "regular" expressions are not actually regular in
54 * a technical sense, since the presence of non-greedy and zero-width
55 * lookahead patterns make the matching process an NP-complete algorithm
56 * rather than a linear process implied by context-free finite automata.
57 * (See the NP-complete regex discussion at <http://perl.plover.com/NPC/>)
59 * The regex parser generates a 'SEE_RegExpr' which contains the bytecode
60 * ('p-code'), that when executed will try to match the pattern against the
61 * given string, commencing at the given index into the string.
72 ////////////////////////////////////////////////////////////////////////////////
74 # define dprintf(...) fprintf(stderr, __VA_ARGS__)
75 static void dprints (const char *str
, int len
) {
77 for (; len
> 0; len
--, str
++) {
78 uint8_t cc
= (uint8_t)(*str
);
79 if (cc
< ' ') fputc('.', stderr
); else fputc(cc
, stderr
);
89 ////////////////////////////////////////////////////////////////////////////////
90 typedef unsigned char SEE_unicode_t
;
91 typedef int SEE_boolean_t
;
93 #define UNICODE_TOUPPER(ch) toupper(ch)
96 ////////////////////////////////////////////////////////////////////////////////
100 int pos
; // next char to be taken
105 static XXInput
*xxiNew (const char *source
, int srcLen
) {
106 XXInput
*res
= calloc(1, sizeof(XXInput
));
107 if (srcLen
< 0) srcLen
= strlen(source
);
109 res
->strLen
= srcLen
;
111 res
->eof
= srcLen
<=0;
116 static void SEE_INPUT_CLOSE (XXInput
*input
) {
121 static SEE_unicode_t
xxiAhead (XXInput
*input
) {
122 if (input
->pos
>= input
->strLen
) {
126 return input
->str
[input
->pos
];
130 static void SEE_INPUT_NEXT (XXInput
*input
) {
132 if (input
->pos
>= input
->strLen
) {
133 input
->pos
= input
->strLen
;
139 static int SEE_input_lookahead_copy (XXInput
*input
, SEE_unicode_t
*buf
, int len
) {
140 uint32_t opos
= input
->pos
;
142 if (len
<= 0) return 0;
144 for (f
= 0; f
< len
; f
++) {
145 if (input
->pos
>= input
->strLen
) break;
146 buf
[f
] = input
->str
[input
->pos
++];
153 ////////////////////////////////////////////////////////////////////////////////
154 typedef struct rxGrowable
{
161 unsigned int *usedPtr
;
165 static void SEE_grow_init (rxGrowable
*gr
, void **xptr
, int elSize
, unsigned int *usedPtr
) {
166 if (elSize
< 1) elSize
= 1;
167 //fprintf(stderr, "GROW %p: elSize=%d\n", gr, elSize);
170 gr
->used
= gr
->size
= 0;
173 if (gr
->xptr
) *(gr
->xptr
) = gr
->data
;
174 gr
->usedPtr
= usedPtr
;
175 if (gr
->usedPtr
) *(gr
->usedPtr
) = gr
->used
;
178 #define SEE_GROW_INIT(gr, xptr, uptr) SEE_grow_init(gr, (void **)(&(xptr)), sizeof(*xptr), &uptr)
181 static void SEE_grow_to (rxGrowable
*gr
, int minUsed
) {
182 if (minUsed
>= gr
->size
) {
183 int nsz
= minUsed
+128;
184 void *p
= realloc(gr
->data
, nsz
*gr
->elSize
);
188 if (gr
->xptr
) *(gr
->xptr
) = gr
->data
;
191 if (gr
->usedPtr
) *(gr
->usedPtr
) = gr
->used
;
195 static void SEE_grow_deinit (rxGrowable
*gr
) {
197 if (gr
->data
) free(gr
->data
);
202 ////////////////////////////////////////////////////////////////////////////////
204 int SEE_regex_debug
= 1;
209 OP_FAIL
, /* match failed */
210 OP_SUCCEED
, /* match succeeded */
211 OP_CHAR
, /* match a char class instance */
212 OP_ZERO
, /* reset counter */
213 OP_REACH
, /* test counter over */
214 OP_NREACH
, /* test counter under */
215 OP_START
, /* enter a group */
216 OP_END
, /* exit a group */
217 OP_UNDEF
, /* reset a group */
218 OP_MARK
, /* record a position */
219 OP_FDIST
, /* position test */
220 OP_RDIST
, /* position and counter test */
221 OP_MNEXT
, /* max-loop */
222 OP_RNEXT
, /* reach-loop */
223 OP_GOTO
, /* branch */
224 OP_GS
, /* greedy success */
225 OP_NS
, /* non-greedy success */
226 OP_GF
, /* greedy fail */
227 OP_NF
, /* non-greedy fail */
228 OP_AS
, /* assert success */
229 OP_AF
, /* assert fail */
230 OP_BOL
, /* test beginning of line */
231 OP_EOL
, /* test end of line */
232 OP_BRK
, /* test word-break */
233 OP_NBRK
, /* test non-word-break */
234 OP_BACKREF
, /* backreference match */
240 typedef struct CharClassRange CharClassRange
;
241 typedef struct CharClass CharClass
;
244 struct CharClassRange
{
245 CharClassRange
*next
;
246 uint32_t lo
; /* simple range of chars, eg [a-z] */
248 CharClassRange
*prev
; // all allocated
253 CharClassRange
*ranges
; /* linked list of character ranges */
254 CharClass
*prev
; // all allocated
259 int ncaptures
, ncounters
, nmarks
, maxref
;
262 unsigned int codelen
;
268 CharClassRange
*crList
; // all allocated
274 typedef struct REContext
{
281 ////////////////////////////////////////////////////////////////////////////////
282 #define SEE_NEW(eng, type) calloc(1, sizeof(type))
283 #define SEE_ASSERT(eng, cond) assert(cond)
286 #define NEW1(t) SEE_NEW(REContext->eng, t)
287 #define NEXT xxiAhead(REContext->input)
288 #define SKIP SEE_INPUT_NEXT(REContext->input)
289 #define ATEOF (REContext->input->eof)
291 #define LOOKAHEAD(buf, len) SEE_input_lookahead_copy(REContext->input, buf, len)
293 // SEE_error_throw_string(REContext->eng, REContext->eng->SyntaxError, STR(regex_syntax_error))
294 #define SYNTAX_ERROR \
296 /*fprintf(stderr, "REGEX: syntax error\n");*/ \
297 longjmp(REContext->errJP, 666); \
302 if (ATEOF || NEXT != (c)) SYNTAX_ERROR; \
306 #define RELADDR(base, addr) ((addr)-(base))
307 #define ASSERT(x) SEE_ASSERT(REContext->eng, x)
308 #define CODE_ADD(c) code_add(REContext, c)
309 #define CODE_INSERT(pos, n) code_insert(REContext, pos, n)
310 #define CODE_POS REContext->regex->codelen
311 #define CODE_PATCH(pos, c) REContext->regex->code[pos] = c
313 #define CODE_ADDI(i) \
315 unsigned int _i = (i); \
316 CODE_ADD((_i >> 8) & 0xff); \
317 CODE_ADD(_i & 0xff); \
320 #define CODE_ADDA(i) CODE_ADDI(RELADDR(CODE_POS, i))
322 #define CODE_PATCHI(pos, i) \
323 do { CODE_PATCH(pos, ((i) >> 8) & 0xff); \
324 CODE_PATCH((pos)+1, (i) & 0xff); } while (0)
325 #define CODE_PATCHA(addr, i) CODE_PATCHI(addr, RELADDR(addr, i))
329 #define CODE_MAKEI(code, addr) ((code[addr] << 8) | code[(addr)+1])
330 #define CODE_MAKEA(code, addr) ((CODE_MAKEI(code, addr) + (addr)) & 0xffff)
332 #define CC_NEW() cc_new(REContext)
333 #define CC_ADDRANGE(cc, l, h) cc_add_range(REContext, cc, l, (h)+1)
334 #define CC_ADDCHAR(cc, ch) CC_ADDRANGE(cc, ch, ch)
335 #define CC_INVERT(cc) cc_invert(REContext, cc)
336 #define CC_ADDCC(dst, src) cc_add_cc(REContext, dst, src)
337 #define CC_INTERN(cc) cc_intern(REContext, cc)
339 #define UNDEFINED (-1)
340 #undef INFINITY /* XXX fixme - rename INFINITY in this file maybe? */
341 #define INFINITY (-1)
344 ////////////////////////////////////////////////////////////////////////////////
347 static CharClass
*cc_new (REContext
*);
348 static void cc_add_range (REContext
*, CharClass
*, SEE_unicode_t
, SEE_unicode_t
);
349 static void cc_invert (REContext
*, CharClass
*);
350 static void cc_add_cc (REContext
*, CharClass
*, CharClass
*);
351 static int cc_issingle (CharClass
*);
352 static uint32_t cc_count (CharClass
*);
353 static int cc_cmp (CharClass
*, CharClass
*);
354 static int cc_intern (REContext
*, CharClass
*);
355 static int cc_contains (CharClass
*, SEE_unicode_t
);
356 static SEE_RegExpr
*regex_new (REContext
*);
357 static void code_add (REContext
*, int);
358 static void code_insert (REContext
*, int, int);
360 static void Disjunction_parse (REContext
*);
361 static void Alternative_parse (REContext
*);
362 static void Term_parse (REContext
*);
363 static int Quantifier_is_next (REContext
*);
364 static int Integer_parse (REContext
*);
365 static void Atom_parse (REContext
*);
366 static unsigned char HexDigit_parse (REContext
*);
367 static CharClass
*ClassEscape_parse (REContext
*);
368 static CharClass
*ClassAtom_parse (REContext
*);
369 static CharClass
*CanonicalizeClass (REContext
*, CharClass
*);
370 static CharClass
*CharacterClass_parse (REContext
*);
373 static void dprint_ch (SEE_unicode_t
);
374 static void dprint_cc (CharClass
*);
375 static int dprint_code (SEE_RegExpr
*, int);
376 static void dprint_regex (SEE_RegExpr
*);
379 static SEE_unicode_t
Canonicalize (SEE_RegExpr
*, SEE_unicode_t
);
380 static SEE_boolean_t
pcode_run (SEE_RegExpr
*regex
, unsigned int addr
, const char *text
, int textLen
, char *state
);
381 static void optimize_regex (SEE_RegExpr
*);
384 ////////////////////////////////////////////////////////////////////////////////
387 // Create a new, empty CharClass
388 static CharClass
*cc_new (REContext
*REContext
) {
390 c
= NEW1(struct CharClass
);
392 c
->prev
= REContext
->regex
->ccList
;
393 REContext
->regex
->ccList
= c
;
398 // Add a range to a CharClass
399 static void cc_add_range (REContext
*REContext
, CharClass
*c
, SEE_unicode_t lo
, SEE_unicode_t hi
) {
400 CharClassRange
*s
, *newr
= NULL
;
402 for (rp
= &c
->ranges
; *rp
; rp
= &(*rp
)->next
) if (lo
<= (*rp
)->hi
) break;
403 if (!*rp
|| hi
< (*rp
)->lo
) {
404 newr
= NEW1(struct CharClassRange
);
409 newr
->prev
= REContext
->regex
->crList
;
410 REContext
->regex
->crList
= newr
;
412 if (lo
< (*rp
)->lo
) (*rp
)->lo
= lo
;
413 if (hi
> (*rp
)->hi
) {
416 while (s
&& s
->hi
< hi
) (*rp
)->next
= s
= s
->next
;
417 if (s
&& s
->lo
<= hi
) {
419 (*rp
)->next
= s
->next
;
426 // Invert a CharClass
427 static void cc_invert (REContext
*REContext
, CharClass
*c
) {
428 CharClassRange
*l
, *newlist
, *r
;
430 if (r
&& r
->lo
== 0 && r
->hi
== ~0) {
434 l
= newlist
= NEW1(struct CharClassRange
);
435 l
->prev
= REContext
->regex
->crList
;
436 REContext
->regex
->crList
= l
;
437 if (r
&& r
->lo
== 0) {
441 for (; r
; r
= r
->next
) {
448 l
= (l
->next
= NEW1(struct CharClassRange
));
450 l
->prev
= REContext
->regex
->crList
;
451 REContext
->regex
->crList
= l
;
461 static void cc_add_cc (REContext
*REContext
, CharClass
*dst
, CharClass
*src
) {
463 //FIXME: very inefficient
464 for (r
= src
->ranges
; r
; r
= r
->next
) cc_add_range(REContext
, dst
, r
->lo
, r
->hi
);
468 static int cc_issingle (CharClass
*c
) {
469 CharClassRange
*r
= c
->ranges
;
476 // Return the number of characters in the class
477 static uint32_t cc_count (CharClass
*c
) {
480 for (r
= c
->ranges
; r
; r
= r
->next
) count
+= r
->hi
-r
->lo
;
485 // Return 0 if two CharClasses are identical
486 static int cc_cmp (CharClass
*c1
, CharClass
*c2
) {
487 CharClassRange
*r1
, *r2
;
491 if (r1
->lo
!= r2
->lo
) return r1
->lo
-r2
->lo
;
492 if (r1
->hi
!= r2
->hi
) return r1
->hi
-r2
->hi
;
502 // Insert CharClass into REContext, returning unique ID
503 static int cc_intern (REContext
*REContext
, CharClass
*c
) {
504 SEE_RegExpr
*regex
= REContext
->regex
;
506 for (i
= 0; i
< regex
->cclen
; i
++) if (cc_cmp(c
, regex
->cc
[i
]) == 0) return i
;
508 SEE_grow_to(®ex
->ccgrow
, regex
->cclen
+1);
514 // Return true if CharClass c contains character ch
515 static int cc_contains (CharClass
*c
, SEE_unicode_t ch
) {
517 for (r
= c
->ranges
; r
; r
= r
->next
) {
518 if (ch
>= r
->lo
&& ch
< r
->hi
) return 1;
519 if (ch
< r
->lo
) return 0;
525 ////////////////////////////////////////////////////////////////////////////////
527 // Print a character in a readable form
528 static void dprint_ch (SEE_unicode_t ch
) {
530 case '{': case '}': case '[': case ']':
531 case '(': case ')': case '-':
532 case '.': case '^': case '$': case '|':
533 case '?': case '*': case '+': case '\\':
534 dprintf("\\%c", ch
& 0x7f); break;
535 case 0x0000: dprintf("\\0"); break;
536 case 0x0009: dprintf("\\t"); break;
537 case 0x000a: dprintf("\\n"); break;
538 case 0x000b: dprintf("\\v"); break;
539 case 0x000c: dprintf("\\f"); break;
540 case 0x000d: dprintf("\\r"); break;
542 if (ch
>= ' ' && ch
<= '~') dprintf("%c", ch
& 0x7f);
543 else if (ch
< 0x100) dprintf("\\x%02x", ch
& 0xff);
544 else dprintf("\\u%04x", ch
);
549 /// Print a character class in a readable form
550 static void dprint_cc (CharClass
*c
) {
553 if (c
->ranges
&& c
->ranges
->lo
== 0) {
555 for (r
= c
->ranges
; r
; r
= r
->next
) {
558 if (r
->next
->lo
!= r
->hi
+ 1) {
560 dprint_ch(r
->next
->lo
- 1);
562 } else if (r
->hi
!= ~0) {
569 for (r
= c
->ranges
; r
; r
= r
->next
) {
571 if (r
->hi
!= r
->lo
+ 1) {
573 dprint_ch(r
->hi
- 1);
582 ////////////////////////////////////////////////////////////////////////////////
583 // regex and pcode construction
587 * Allocate a new regex structure. This will contain everything needed
588 * to run and match a string, independent of the original pattern text.
590 static SEE_RegExpr
*regex_new (REContext
*REContext
) {
592 regex
= NEW1(struct SEE_RegExpr
);
593 regex
->ncaptures
= 0;
595 regex
->ncounters
= 0;
598 SEE_GROW_INIT(®ex
->codegrow
, regex
->code
, regex
->codelen
);
599 regex
->codegrow
.is_string
= 1;
600 SEE_GROW_INIT(®ex
->ccgrow
, regex
->cc
, regex
->cclen
);
606 // add to the end of the p-code array, resizing as needed
607 static void code_add (REContext
*REContext
, int c
) {
608 SEE_RegExpr
*regex
= REContext
->regex
;
611 SEE_grow_to(®ex
->codegrow
, i
+1);
616 // insert some bytes into the middle of the p-code, resizing as needed
617 static void code_insert (REContext
*REContext
, int pos
, int n
) {
618 SEE_RegExpr
*regex
= REContext
->regex
;
620 for (i
= 0; i
< n
; i
++) code_add(REContext
, 0);
621 for (i
= regex
->codelen
- n
; i
> pos
; i
--) regex
->code
[i
-1+n
] = regex
->code
[i
-1];
625 ////////////////////////////////////////////////////////////////////////////////
628 * This recursive descent parser builds a p-code array as it runs.
629 * During recursion, the p-code array is sometimes 'back-patched'
630 * because branch distances weren't known in advance. In some
631 * cases, p-code segments are also shifted. This all means that we
632 * have to be very careful that our p-code is quite relocatable,
633 * and not dependendent on absolute addresses.
636 // parse a source pattern, and return a filled-in regex structure
637 SEE_RegExpr
*SEE_regex_parse (const char *source
, int patLen
, int flags
) {
638 REContext
*REContext
;
640 REContext
= SEE_NEW(eng
, struct REContext
);
641 //REContext->input = SEE_input_lookahead(SEE_input_string(eng, source), 24);
642 REContext
->input
= xxiNew(source
, patLen
);
643 REContext
->regex
= regex
= regex_new(REContext
);
644 regex
->flags
= flags
;
645 regex
->ncaptures
= 1;
646 if (setjmp(REContext
->errJP
)) {
648 SEE_regex_free(regex
);
651 Disjunction_parse(REContext
);
652 if (!ATEOF
) SYNTAX_ERROR
;
653 CODE_ADD(OP_SUCCEED
);
654 // Check that no backreferences were too big
655 if (regex
->maxref
>= regex
->ncaptures
) SYNTAX_ERROR
;
656 //FIXME: XXX - should this close be enclosed in a 'finally'?
657 SEE_INPUT_CLOSE(REContext
->input
);
658 // compute the size of a captures context
660 regex
->ncaptures
*sizeof(Capture
)+
661 regex
->ncounters
*sizeof(int)+
662 regex
->nmarks
*sizeof(int);
663 optimize_regex(regex
);
665 if (SEE_regex_debug
) {
676 // Returns the number of capture parentheses in the compiled regex
677 int SEE_regex_count_captures (const SEE_RegExpr
*regex
) {
678 return regex
->ncaptures
;
682 // Returns the flags of the regex object
683 int SEE_regex_get_flags (const SEE_RegExpr
*regex
) {
689 * Disjunction :: Alternative
690 * Alternative | Disjunction
692 static void Disjunction_parse (REContext
*REContext
) {
694 Alternative_parse(REContext
);
695 if (!ATEOF
&& NEXT
== '|') {
696 int p
= pos
, p1
, p2
, x1
, x2
;
697 int insert
= 1+CODE_SZA
;
699 CODE_INSERT(pos
, insert
);
700 CODE_PATCH(p
, OP_GF
); p
++; /* GF x1 */
701 p1
= p
; p
+= CODE_SZA
;
702 ASSERT(p
== pos
+ insert
); /* (a) */
703 CODE_ADD(OP_GOTO
); /* GOTO x2 */
704 p2
= CODE_POS
; CODE_ADDA(0);
705 x1
= CODE_POS
; /* x1: (b) */
706 Disjunction_parse(REContext
);
707 x2
= CODE_POS
; /* x2: */
715 * Alternative :: [empty] -- lookahead in ) |
719 static void Alternative_parse (REContext
*REContext
) {
720 while (!(ATEOF
|| NEXT
== /*(*/')' || NEXT
== '|')) Term_parse(REContext
);
724 /*------------------------------------------------------------
726 * Assertion la = ^ $ \b \B
731 /* We have 24 bytes of lookahead, which is sufficient to
732 * scan for {2147483647,2147483647}. Anything larger will
733 * overflow the signed int type on 32 bit systems.
735 static int Quantifier_is_next (REContext
*REContext
) {
737 SEE_unicode_t lookahead
[24];
738 if (NEXT
!= '{') return 0;
740 * Strict ECMA-262 says that '{' is NOT a Pattern character,
741 * but Mozilla allows it
744 //if (!SEE_COMPAT_JS(REContext->eng, >=, JS11)) return 1;
745 len
= LOOKAHEAD(lookahead
, 24);
747 while (pos
< len
&& lookahead
[pos
] >= '0' && lookahead
[pos
] <= '9') pos
++;
748 if (pos
< len
&& lookahead
[pos
] == ',') pos
++;
749 else if (pos
< len
&& lookahead
[pos
] == '}') return pos
> 1;
751 while (pos
< len
&& lookahead
[pos
] >= '0' && lookahead
[pos
] <= '9') pos
++;
752 if (pos
< len
&& lookahead
[pos
] == '}') pos
++; else return 0;
757 static void Term_parse (REContext
*REContext
) {
758 int min
, max
, greedy
, pos
;
761 SEE_unicode_t lookahead
[2];
763 * parse Assertion inline since it is a bit special
764 * in terms of its lookahead
768 lookahead_len
= LOOKAHEAD(lookahead
, 2);
769 if (lookahead_len
> 1 && lookahead
[1] == 'b') {
774 if (lookahead_len
> 1 && lookahead
[1] == 'B') {
779 // some other kind of escape
789 // Lookaheads forbidden by the Atom production
790 case '*': case '+': case '?': case ')':
791 case ']': case '{': case '}': case '|':
795 oparen
= REContext
->regex
->ncaptures
;
796 Atom_parse(REContext
);
797 cparen
= REContext
->regex
->ncaptures
;
799 * parse Quantifier inline to save my sanity
803 } else if (NEXT
== '*') {
804 SKIP
; min
= 0; max
= INFINITY
;
805 } else if (NEXT
== '+') {
806 SKIP
; min
= 1; max
= INFINITY
;
807 } else if (NEXT
== '?') {
808 SKIP
; min
= 0; max
= 1;
809 } else if (Quantifier_is_next(REContext
)) {
811 min
= Integer_parse(REContext
);
812 if (!ATEOF
&& NEXT
== ',') {
814 if (!ATEOF
&& NEXT
== '}') max
= INFINITY
;
815 else max
= Integer_parse(REContext
);
821 if (!ATEOF
&& NEXT
== '?') {
824 if (min
== max
&& !greedy
) {
826 * XXX should we warn that the greedy modifiers to
827 * 'a{n,n}?' and 'a{n}?' are technically meaningless?
828 * We speed up our code by using greedy mode anyway.
832 /* Don't allow stupid ranges, such as 'a{7,3}' */
833 if (max
!= INFINITY
&& min
> max
) SYNTAX_ERROR
;
834 if (min
== 1 && max
== 1) return; /* a */
841 if (oparen
!= cparen
) {
843 * If the atom introduces capture parentheses,
844 * then insert code to reset them before each
847 CODE_INSERT(pos
, 1 + 2*CODE_SZI
);
848 CODE_PATCH(pos
, OP_UNDEF
);
849 CODE_PATCHI(pos
+ 1, oparen
);
850 CODE_PATCHI(pos
+ 1 + CODE_SZI
, cparen
);
853 * The following code generators all generate looping
854 * matchers. While every corresponding pattern could
855 * be written as the generalised 'a{n,m}' (where m could
856 * be INFINITY), some efficiency is gained by selecting cases
857 * where general code that would be a no-op is not emitted.
859 if (min
== max
) { /* a{m} */
861 int insert
= 1 + CODE_SZI
;
862 int c
= REContext
->regex
->ncounters
++;
863 CODE_INSERT(pos
, insert
);
864 CODE_PATCH(p
, OP_ZERO
); p
++; /* ZERO c; */
865 CODE_PATCHI(p
, c
); p
+= CODE_SZI
;
867 ASSERT(p
== pos
+ insert
); /* (a) */
868 CODE_ADD(OP_RNEXT
); /* RNEXT c,m,x */
874 if (min
== 0 && max
== 1) {
877 int insert
= 1 + CODE_SZA
;
878 CODE_INSERT(pos
, insert
);
879 CODE_PATCH(p
, greedy
? OP_GF
: OP_NF
); /* GF x */
880 p
++; p1
= p
; p
+= CODE_SZA
;
881 ASSERT(p
== pos
+ insert
); /* (a) */
882 px
= CODE_POS
; /* x: */
887 if (min
== 0 && max
== INFINITY
) {
889 int p
= pos
, px
, py
, p1
;
890 int insert
= 2 + CODE_SZA
+ CODE_SZI
;
891 int m
= REContext
->regex
->nmarks
++;
892 CODE_INSERT(pos
, insert
);
893 px
= p
; /* x: GF y */
894 CODE_PATCH(p
, greedy
? OP_GF
: OP_NF
); p
++;
895 p1
= p
; p
+= CODE_SZA
;
896 CODE_PATCH(p
, OP_MARK
); p
++; /* MARK m */
897 CODE_PATCHI(p
, m
); p
+= CODE_SZI
;
898 ASSERT(p
== pos
+ insert
); /* (a) */
899 CODE_ADD(OP_FDIST
); /* FDIST m */
901 CODE_ADD(OP_GOTO
); /* GOTO x */
903 py
= CODE_POS
; /* y: */
909 int p
= pos
, px
, py
, p1
;
910 int insert
= 3 + CODE_SZI
* 2 + CODE_SZA
;
911 int c
= REContext
->regex
->ncounters
++;
912 int k
= REContext
->regex
->nmarks
++;
913 CODE_INSERT(pos
, insert
);
914 CODE_PATCH(p
, OP_ZERO
); p
++; /* ZERO c */
915 CODE_PATCHI(p
, c
); p
+= CODE_SZI
;
916 px
= p
; /* x: GF y */
917 CODE_PATCH(p
, greedy
? OP_GF
: OP_NF
); p
++;
918 p1
= p
; p
+= CODE_SZA
;
919 CODE_PATCH(p
, OP_MARK
); p
++; /* MARK k */
920 CODE_PATCHI(p
, k
); p
+= CODE_SZI
;
921 ASSERT(p
== pos
+ insert
); /* (a) */
923 CODE_ADD(OP_RDIST
); /* RDIST k,c,n */
928 CODE_ADD(OP_FDIST
); /* FDIST k */
931 if (max
!= INFINITY
) {
932 CODE_ADD(OP_RNEXT
); /* RNEXT c,m,x */
937 CODE_ADD(OP_MNEXT
); /* MNEXT c,n,x */
942 py
= CODE_POS
; /* y: */
944 CODE_ADD(OP_REACH
); /* REACH c,n */
954 // Parse a simple integer; used for repetition ranges
955 static int Integer_parse (REContext
*REContext
) {
959 while (!ATEOF
&& NEXT
>= '0' && NEXT
<= '9') {
960 val
= 10 * val
+ (NEXT
-'0');
964 if (!hasdig
) SYNTAX_ERROR
;
970 * Atom:: la != ^ $ * + ? ) ] { } |
971 * pattern character la != ^ $ \ . * + ? ( ) [ ] { } |
980 static void Atom_parse (REContext
*REContext
) {
985 if (!ATEOF
&& NEXT
== '?') { /* (?... */
987 if (!ATEOF
&& NEXT
== ':') { /* (?:... */
989 Disjunction_parse(REContext
);
990 } else if (!ATEOF
&& (NEXT
== '=' || NEXT
== '!')) {
991 int px
, p1
; /* (?=... */
992 int neg
= (NEXT
== '!'); /* (?!... */
994 CODE_ADD(neg
? OP_AF
: OP_AS
); /* AS x */
995 p1
= CODE_POS
; CODE_ADDI(0);
996 Disjunction_parse(REContext
); /* (a) */
997 CODE_ADD(OP_SUCCEED
); /* SUCCEED */
998 px
= CODE_POS
; /* x: */
1000 } else SYNTAX_ERROR
;
1001 } else { /* (...) */
1002 i
= REContext
->regex
->ncaptures
++;
1003 CODE_ADD(OP_START
); /* START i */
1005 Disjunction_parse(REContext
); /* (a) */
1006 CODE_ADD(OP_END
); /* END i */
1013 * All other atoms compile to simple character class matches
1014 * (or backreferences)
1019 if (ATEOF
) SYNTAX_ERROR
;
1020 if (NEXT
>= '1' && NEXT
<= '9') {
1022 while (!ATEOF
&& (NEXT
>= '0' && NEXT
<= '9')) {
1023 i
= 10 * i
+ NEXT
- '0';
1026 CODE_ADD(OP_BACKREF
);
1028 if (i
> REContext
->regex
->maxref
) REContext
->regex
->maxref
= i
;
1031 c
= ClassEscape_parse(REContext
);
1034 c
= CharacterClass_parse(REContext
);
1039 CC_ADDCHAR(c
, 0x000a);
1040 CC_ADDCHAR(c
, 0x000d);
1041 //CC_ADDCHAR(c, 0x2028);
1042 //CC_ADDCHAR(c, 0x2029);
1047 CC_ADDCHAR(c
, Canonicalize(REContext
->regex
, NEXT
));
1057 static unsigned char HexDigit_parse (REContext
*REContext
) {
1059 if (ATEOF
) SYNTAX_ERROR
;
1061 if (c
>= '0' && c
<= '9') return c
-'0';
1062 if (c
>= 'a' && c
<= 'f') return c
-'a'+10;
1063 if (c
>= 'A' && c
<= 'F') return c
-'A'+10;
1068 static CharClass
*ClassEscape_parse (REContext
*REContext
) {
1070 SEE_unicode_t ch
, lookahead
[3];
1072 // EXPECT('\\'); // backslash already skipped
1074 if (NEXT
>= '0' && NEXT
<= '9') {
1075 // \0oo - 3 digit octal escapes
1077 if (/*SEE_COMPAT_JS(REContext->eng, >=, JS11) &&*/ NEXT
== '0' && LOOKAHEAD(lookahead
, 3) >= 2 &&
1078 lookahead
[1] >= '0' && lookahead
[1] < '8' && lookahead
[2] >= '0' && lookahead
[2] < '8') {
1079 ch
= (lookahead
[1] - '0') * 8 + (lookahead
[2] - '0');
1087 while (!ATEOF
&& NEXT
>= '0' && NEXT
<= '9') {
1088 i
= 10 * i
+ NEXT
- '0';
1092 * 15.10.2.1.9: "Using a backreference inside a ClassAtom
1095 if (i
!= 0) SYNTAX_ERROR
;
1102 case 'b': CC_ADDCHAR(c
, 0x0008); break;
1103 case 't': CC_ADDCHAR(c
, 0x0009); break;
1104 case 'n': CC_ADDCHAR(c
, 0x000a); break;
1105 case 'v': CC_ADDCHAR(c
, 0x000b); break;
1106 case 'f': CC_ADDCHAR(c
, 0x000c); break;
1107 case 'r': CC_ADDCHAR(c
, 0x000d); break;
1110 CC_ADDRANGE(c
, '0', '9');
1111 if (ch
== 'D') CC_INVERT(c
);
1115 CC_ADDRANGE(c
, 'a', 'z');
1116 CC_ADDRANGE(c
, 'A', 'Z');
1117 CC_ADDRANGE(c
, '0', '9');
1119 if (ch
== 'W') CC_INVERT(c
);
1123 #if WITH_UNICODE_TABLES
1124 for (i
= 0; i
< SEE_unicode_Zscodeslen
; i
++) CC_ADDCHAR(c
, SEE_unicode_Zscodes
[i
]);
1126 CC_ADDCHAR(c
, 0x0020);
1128 if (ch
== 'S') CC_INVERT(c
);
1131 if (ATEOF
) SYNTAX_ERROR
;
1133 if (('a' <= ch
&& ch
<= 'z') ||
1134 ('A' <= ch
&& ch
<= 'Z')) CC_ADDCHAR(c
, ch
% 32);
1139 i
= (ch
== 'x' ? 2 : 4);
1141 while (i
--) ch
= (ch
<< 4) | HexDigit_parse(REContext
);
1157 static CharClass
*ClassAtom_parse (REContext
*REContext
) {
1159 if (ATEOF
) SYNTAX_ERROR
;
1162 return ClassEscape_parse(REContext
);
1165 CC_ADDCHAR(c
, NEXT
);
1172 * Convert the CharClass into a canonicalised version. (SLOW!)
1173 * !!!Every character in the class has to be individually canonicalized!!!
1174 * !!!OUCH!!!OUCH!!!I!HATE!YOU!ALL!!!
1176 static CharClass
*CanonicalizeClass (REContext
*REContext
, CharClass
*c
) {
1179 SEE_unicode_t ch
, uch
;
1180 if (cc_count(c
) > (uint32_t)~0 / 2) {
1182 ccanon
= CanonicalizeClass(REContext
, c
);
1187 * Evil hack: if the CharClass range includes ['A'-0xEFFFF], then
1188 * there is no need to canonicalize because every uppercase character
1191 for (r
= c
->ranges
; r
; r
= r
->next
) if (r
->lo
<= 'A' && r
->hi
> 0xf0000) return c
;
1193 for (r
= c
->ranges
; r
; r
= r
->next
) {
1194 for (ch
= r
->lo
; ch
< r
->hi
; ch
++) {
1195 uch
= UNICODE_TOUPPER(ch
);
1196 CC_ADDCHAR(ccanon
, uch
);
1205 * '[' [^] ( ClassAtom | ClassAtom '-' ClassAtom ) * ']'
1207 static CharClass
*CharacterClass_parse (REContext
*REContext
) {
1208 CharClass
*c
= CC_NEW(), *a
, *b
;
1211 if (!ATEOF
&& NEXT
== '^') {
1215 while (!ATEOF
&& NEXT
!= ']') {
1216 a
= ClassAtom_parse(REContext
);
1217 if (!ATEOF
&& NEXT
== '-') {
1219 // Treat '-' literally if at end of ClassRanges 15.10.2.16
1220 if (!ATEOF
&& NEXT
== ']') {
1224 if (!cc_issingle(a
)) SYNTAX_ERROR
;
1225 b
= ClassAtom_parse(REContext
);
1226 if (!cc_issingle(b
)) SYNTAX_ERROR
;
1227 if (b
->ranges
->lo
< a
->ranges
->lo
) SYNTAX_ERROR
;
1228 a
->ranges
->hi
= b
->ranges
->hi
;
1235 //if (c->ranges == NULL) SYNTAX_ERROR;
1237 if (REContext
->regex
->flags
& SEERX_FLAG_IGNORECASE
) {
1238 c
= CanonicalizeClass(REContext
, c
);
1239 // free(c); // i.e the old c
1241 if (invertflag
) CC_INVERT(c
);
1246 ////////////////////////////////////////////////////////////////////////////////
1250 static int dprint_code (SEE_RegExpr
*regex
, int addr
) {
1252 const char *op
="", *opc
;
1253 unsigned char *code
= regex
->code
;
1254 dprintf("0x%04x: ", addr
);
1255 switch (code
[addr
]) {
1256 case OP_FAIL
: dprintf("FAIL"); op
= ""; break;
1257 case OP_SUCCEED
: dprintf("SUCCEED"); op
= ""; break;
1258 case OP_CHAR
: dprintf("CHAR"); op
= "i"; break;
1259 case OP_ZERO
: dprintf("ZERO"); op
= "i"; break;
1260 case OP_REACH
: dprintf("REACH"); op
= "ii"; break;
1261 case OP_NREACH
: dprintf("NREACH"); op
= "ii"; break;
1262 case OP_START
: dprintf("START"); op
= "i"; break;
1263 case OP_END
: dprintf("END"); op
= "i"; break;
1264 case OP_UNDEF
: dprintf("UNDEF"); op
= "ii"; break;
1265 case OP_MARK
: dprintf("MARK"); op
= "i"; break;
1266 case OP_FDIST
: dprintf("FDIST"); op
= "i"; break;
1267 case OP_RDIST
: dprintf("RDIST"); op
= "iii"; break;
1268 case OP_MNEXT
: dprintf("MNEXT"); op
= "iia"; break;
1269 case OP_RNEXT
: dprintf("RNEXT"); op
= "iia"; break;
1270 case OP_GOTO
: dprintf("GOTO"); op
= "a"; break;
1271 case OP_GS
: dprintf("GS"); op
= "a"; break;
1272 case OP_NS
: dprintf("NS"); op
= "a"; break;
1273 case OP_GF
: dprintf("GF"); op
= "a"; break;
1274 case OP_NF
: dprintf("NF"); op
= "a"; break;
1275 case OP_AS
: dprintf("AS"); op
= "a"; break;
1276 case OP_AF
: dprintf("AF"); op
= "a"; break;
1277 case OP_BOL
: dprintf("BOL"); op
= ""; break;
1278 case OP_EOL
: dprintf("EOL"); op
= ""; break;
1279 case OP_BRK
: dprintf("BRK"); op
= ""; break;
1280 case OP_NBRK
: dprintf("NBRK"); op
= ""; break;
1281 case OP_BACKREF
: dprintf("BACKREF"); op
= "i"; break;
1282 default: dprintf("*** %d", code
[addr
]); break;
1285 for (opc
= op
; *opc
; opc
++) {
1286 if (opc
!= op
) dprintf(",");
1290 i
= CODE_MAKEA(code
, addr
);
1291 dprintf("0x%04x", i
);
1292 i
= CODE_MAKEI(code
, addr
);
1293 dprintf(" [0x%04x]", i
);
1297 i
= CODE_MAKEI(code
, addr
); addr
+= CODE_SZI
;
1301 i
= CODE_MAKEI(code
, addr
); addr
+= CODE_SZI
;
1303 if (i
> regex
->cclen
) dprintf("**BAD**");
1304 else dprint_cc(regex
->cc
[i
]);
1313 static void dprint_regex (SEE_RegExpr
*regex
) {
1316 dprintf("regex %p\n", regex
);
1317 dprintf("\tncaptures = %d\n", regex
->ncaptures
);
1318 dprintf("\tcodelen = %d\n", regex
->codelen
);
1319 dprintf("\tcclen = %d\n", regex
->cclen
);
1320 dprintf("\tflags = 0x%x\n", regex
->flags
);
1322 for (i
= 0; i
< regex
->cclen
; i
++) {
1323 dprintf("\t\t%d = ", i
);
1324 dprint_cc(regex
->cc
[i
]);
1325 dprintf("\n\t\t = { ");
1326 for (r
= regex
->cc
[i
]->ranges
; r
; r
= r
->next
) dprintf("%x:%x ", r
->lo
, r
->hi
);
1329 dprintf("\tcode:\n");
1331 while (addr
< regex
->codelen
) addr
= dprint_code(regex
, addr
);
1336 ////////////////////////////////////////////////////////////////////////////////
1340 static SEE_unicode_t
Canonicalize (SEE_RegExpr
*regex
, SEE_unicode_t ch
) {
1341 if (regex
->flags
& SEERX_FLAG_IGNORECASE
) return UNICODE_TOUPPER(ch
);
1346 static SEE_boolean_t
pcode_run (SEE_RegExpr
*regex
, unsigned int addr
, const char *text
, int textLen
, char *state
) {
1347 SEE_boolean_t result
;
1348 int i
= 0, i2
= 0, i3
= 0;
1353 int *counter
, *mark
, statesz
;
1354 unsigned int newaddr
;
1356 //if (SEE_system.periodic) (*SEE_system.periodic)(eng);
1357 // Compute the offsets into the state structure
1359 capture
= (Capture
*)state
;
1360 statesz
+= regex
->ncaptures
* sizeof (Capture
);
1361 counter
= (int *)(state
+ statesz
);
1362 statesz
+= regex
->ncounters
* sizeof (int);
1363 mark
= (int *)(state
+ statesz
);
1364 statesz
+= regex
->nmarks
* sizeof (int);
1365 SEE_ASSERT(eng
, statesz
== regex
->statesz
);
1366 newstate
= calloc(statesz
+1, sizeof(char));//SEE_STRING_ALLOCA(eng, char, statesz);
1367 #define index (capture[0].end)
1369 /* Catch bad branches */
1370 if (addr
>= regex
->codelen
) {
1372 longjmp(regex
->errJP
, 666);
1374 //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1376 /* Read the opcode and its arguments */
1377 op
= regex
->code
[addr
];
1379 if (SEE_regex_debug
) {
1382 //mys.stringclass = 0;
1385 dprintf("index=%d captures=[", index
);
1386 for (x
= 0; x
< regex
->ncaptures
; x
++) {
1387 if (x
) dprintf(",");
1388 if (capture
[x
].start
== -1)
1390 else if (capture
[x
].start
+capture
[x
].end
> textLen
) {
1391 dprintf("bad<%x:%x>",
1392 capture
[x
].start
, capture
[x
].end
);
1394 int end
= capture
[x
].end
;
1395 if (end
== -1) end
= index
;
1397 mys.length = end-capture[x].start;
1398 mys.data = text+capture[x].start;
1403 dprints(text
+capture
[x
].start
, end
-capture
[x
].start
);
1404 if (capture
[x
].end
== -1) dprintf("+");
1408 if (op
== OP_ZERO
|| op
== OP_REACH
|| op
== OP_NREACH
|| op
== OP_MNEXT
|| op
== OP_RNEXT
) {
1409 dprintf(" counter=[");
1410 for (x
= 0; x
< regex
->ncounters
; x
++) {
1411 if (x
) dprintf(",");
1412 dprintf("%d", counter
[x
]);
1416 if (op
== OP_MARK
|| op
== OP_FDIST
|| op
== OP_RDIST
) {
1418 for (x
= 0; x
< regex
->nmarks
; x
++) {
1419 if (x
) dprintf(",");
1420 dprintf("%d", mark
[x
]);
1424 if (regex
->code
[addr
] == OP_CHAR
&& index
< textLen
) dprintf(" ch='%c'", Canonicalize(regex
, text
[index
]));
1426 (void)dprint_code(regex
, addr
);
1431 case OP_FAIL
: case OP_SUCCEED
: case OP_BOL
:
1432 case OP_EOL
: case OP_BRK
: case OP_NBRK
:
1434 case OP_CHAR
: case OP_ZERO
: case OP_START
:
1435 case OP_END
: case OP_MARK
: case OP_FDIST
:
1437 i
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1440 case OP_REACH
: case OP_NREACH
: case OP_UNDEF
:
1441 i
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1443 i2
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1447 i
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1449 i2
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1451 i3
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1454 case OP_MNEXT
: case OP_RNEXT
:
1455 i
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1457 i2
= CODE_MAKEI(regex
->code
, addr
); /* integer arg */
1459 a
= CODE_MAKEA(regex
->code
, addr
); /* address arg */
1462 case OP_GOTO
: case OP_GS
: case OP_NS
:
1463 case OP_GF
: case OP_NF
: case OP_AS
:
1465 a
= CODE_MAKEA(regex
->code
, addr
); /* address arg */
1470 //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1473 longjmp(regex
->errJP
, 666);
1483 /* succeed if current character matches CharClass. index++ */
1485 if (index
< textLen
) {
1487 /* N.B. strings are UTF-16 encoded! */
1489 if ((ch & 0xfc00) == 0xd800 && index < textLen && (text[index] & 0xfc00) == 0xdc00)
1490 ch = (((ch & 0x3ff) << 10) | (text[index++] & 0x3ff))+0x10000;
1492 ch
= Canonicalize(regex
, ch
);
1493 if (!cc_contains(regex
->cc
[i
], ch
)) {
1502 /* reset an iteration counter */
1503 case OP_ZERO
: counter
[i
] = 0;
1505 /* fail if we havent reached a particular count */
1507 if (counter
[i
] < i2
) {
1512 /* fail if we reached a particular count */
1514 if (counter
[i
] >= i2
) {
1519 /* start a capture group at current index */
1521 capture
[i
].start
= index
;
1522 capture
[i
].end
= -1;
1524 /* finish a capture group at current index */
1526 capture
[i
].end
= index
;
1528 /* reset the given captures - usually done at a loop start */
1531 capture
[i
].start
= -1;
1532 capture
[i
].end
= -1;
1536 /* Set a mark to the current index */
1540 /* fail if we haven't advanced past the mark */
1542 if (mark
[i
] == index
) {
1547 /* fail if haven't advanced past mark AND counter has reached a limit */
1549 if (mark
[i
] == index
&& counter
[i2
] >= i3
) {
1554 /* increment counter if it is less than n. always branch */
1556 if (counter
[i
] < i2
) counter
[i
]++;
1559 /* increment counter. if it is less than n, then branch */
1562 if (counter
[i
] < i2
) addr
= a
;
1567 /* operations that push state and branch for backtracking */
1568 case OP_GS
: /* greedy success */
1569 case OP_NS
: /* non-greedy success */
1570 case OP_GF
: /* greedy fail */
1571 case OP_NF
: /* greedy fail */
1572 case OP_AS
: /* assert success */
1573 case OP_AF
: /* assert fail */
1574 newaddr
= (op
== OP_NS
|| op
== OP_NF
) ? a
: addr
;
1575 memcpy(newstate
, state
, statesz
);
1576 result
= pcode_run(regex
, newaddr
, text
, textLen
, newstate
);
1578 if (op
== OP_GF
|| op
== OP_NF
) {
1579 memcpy(state
, newstate
, statesz
);
1582 } else if (op
== OP_AF
) {
1585 } else if (op
== OP_AS
) {
1586 int index_save
= index
;
1587 memcpy(state
, newstate
, statesz
);
1590 } else if (op
== OP_GS
) {
1594 if (op
== OP_GS
|| op
== OP_NS
|| op
== OP_AS
) {
1597 } else if (op
== OP_GF
|| op
== OP_AF
) addr
= a
;
1600 /* succeed if we are at the beginning of a line */
1602 case OP_BOL
: /* ^ */
1603 if (index
== 0) ; /* succeed */
1604 else if ((regex
->flags
& SEERX_FLAG_MULTILINE
) == 0) { free(newstate
); return 0; }
1605 else if (text
[index
-1] == 0x000a || /*LF*/
1606 text
[index
-1] == 0x000d) // || /*CR*/
1607 //text[index-1] == 0x2028 || /*LS*/
1608 //text[index-1] == 0x2029) /*PS*/
1610 else { free(newstate
); return 0; }
1612 /* succeed if we are at the end of a line */
1613 case OP_EOL
: /* $ */
1614 if (index
== textLen
) ; /* succeed */
1615 else if ((regex
->flags
& SEERX_FLAG_MULTILINE
) == 0) { free(newstate
); return 0; }
1616 else if (text
[index
] == 0x000a || /*LF*/
1617 text
[index
] == 0x000d) // || /*CR*/
1618 //text[index] == 0x2028 || /*LS*/
1619 //text[index] == 0x2029) /*PS*/
1621 else { free(newstate
); return 0; }
1623 #define IsWordChar(e) ((e) >= 0 && (e) < textLen && ( \
1624 (text[e] >= 'a' && text[e] <= 'z') \
1625 || (text[e] >= 'A' && text[e] <= 'Z') \
1626 || (text[e] >= '0' && text[e] <= '9') \
1628 /* succeed if we are at a word break */
1632 a
= IsWordChar(index
- 1);
1633 b
= IsWordChar(index
);
1635 if (a
== b
) { free(newstate
); return 0; }
1637 if (a
!= b
) { free(newstate
); return 0; }
1640 /* succeed if we match a backreference */
1642 if (!SEERX_IS_CAPTURE_UNDEFINED(capture
[i
])) {
1644 br
= capture
[i
].start
;
1645 len
= capture
[i
].end
- br
;
1646 if (len
+index
> textLen
) { free(newstate
); return 0; }
1647 for (x
= 0; x
< len
; x
++)
1648 if (Canonicalize(regex
, text
[br
+x
]) != Canonicalize(regex
, text
[index
+x
])) {
1655 /* catch unexpected instructions */
1657 //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1660 longjmp(regex
->errJP
, 666);
1668 * Executes the regex on the text beginning at index.
1669 * Returns true of a match was successful.
1671 int SEE_regex_match (SEE_RegExpr
*regex
, const char *text
, int textLen
, int index
, Capture
*capture_ret
) {
1673 if (textLen
< 0) textLen
= text
?strlen(text
):0;
1674 char *state
= calloc(regex
->statesz
+1, sizeof(char));//SEE_STRING_ALLOCA(eng, char, regex->statesz);
1675 Capture
*capture
= (Capture
*)state
;
1677 memset(state
, 0xd0, regex
->statesz
); /* catch bugs */
1679 capture
[0].start
= index
;
1680 capture
[0].end
= index
;
1681 for (i
= 1; i
< regex
->ncaptures
; i
++) {
1682 capture
[i
].start
= -1;
1683 capture
[i
].end
= -1;
1685 if (setjmp(regex
->errJP
)) {
1689 success
= pcode_run(regex
, 0, text
, textLen
, state
);
1691 if (SEE_regex_debug
) dprintf(". %s\n", success
? "success" : "failure");
1693 if (success
&& capture_ret
) memcpy(capture_ret
, capture
, regex
->ncaptures
*sizeof(Capture
));
1699 /*------------------------------------------------------------
1702 static void optimize_regex (SEE_RegExpr
*regex
) {
1704 * (nothing here yet)
1706 * possible optimisations include branch short-cuts,
1707 * and compiling the p-code to native machine instructions.
1712 void SEE_regex_free (SEE_RegExpr
*regex
) {
1714 CharClassRange
*r
, *n
;
1715 for (n
= NULL
, r
= regex
->crList
; r
; r
= n
) {
1720 for (cn
= NULL
, cr
= regex
->ccList
; cr
; cr
= cn
) {
1724 SEE_grow_deinit(®ex
->codegrow
);
1725 SEE_grow_deinit(®ex
->ccgrow
);