fixed some bugs in new 'g'o system
[k8-i-v-a-n.git] / src / felib / regex.c
blobf66a4309c5791af3601dcd06d2e64a72d928ab9f
1 /*
2 * Copyright (c) 2003
3 * David Leonard. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of David Leonard nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
30 /* $Id: regex.c 1282 2007-09-13 22:26:35Z d $ */
31 #ifdef __cplusplus
32 extern "C" {
33 #endif
35 #include <assert.h>
36 #include <ctype.h>
37 #include <setjmp.h>
38 #include <stdint.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <unistd.h>
44 #include "regex.h"
47 * Regular expression engine.
49 * This module contains a parser to compile ECMA-262 regular expressions
50 * into 'p-code', and a matcher engine that runs the p-code against
51 * strings.
53 * NOTE: ECMA-262 "regular" expressions are not actually regular in
54 * a technical sense, since the presence of non-greedy and zero-width
55 * lookahead patterns make the matching process an NP-complete algorithm
56 * rather than a linear process implied by context-free finite automata.
57 * (See the NP-complete regex discussion at <http://perl.plover.com/NPC/>)
59 * The regex parser generates a 'SEE_RegExpr' which contains the bytecode
60 * ('p-code'), that when executed will try to match the pattern against the
61 * given string, commencing at the given index into the string.
63 #ifndef NDEBUG
64 # define NDEBUG
65 #endif
67 #ifdef TJS_RE_DUMP
68 # undef NDEBUG
69 #endif
72 ////////////////////////////////////////////////////////////////////////////////
73 #ifndef NDEBUG
74 # define dprintf(...) fprintf(stderr, __VA_ARGS__)
75 static void dprints (const char *str, int len) {
76 fputc('"', stderr);
77 for (; len > 0; len--, str++) {
78 uint8_t cc = (uint8_t)(*str);
79 if (cc < ' ') fputc('.', stderr); else fputc(cc, stderr);
81 fputc('"', stderr);
83 #else
84 # define dprintf(...)
85 # define dprints(...)
86 #endif
89 ////////////////////////////////////////////////////////////////////////////////
90 typedef unsigned char SEE_unicode_t;
91 typedef int SEE_boolean_t;
93 #define UNICODE_TOUPPER(ch) toupper(ch)
96 ////////////////////////////////////////////////////////////////////////////////
97 typedef struct {
98 const char *str;
99 int strLen;
100 int pos; // next char to be taken
101 int eof;
102 } XXInput;
105 static XXInput *xxiNew (const char *source, int srcLen) {
106 XXInput *res = calloc(1, sizeof(XXInput));
107 if (srcLen < 0) srcLen = strlen(source);
108 res->str = source;
109 res->strLen = srcLen;
110 res->pos = 0;
111 res->eof = srcLen<=0;
112 return res;
116 static void SEE_INPUT_CLOSE (XXInput *input) {
117 free(input);
121 static SEE_unicode_t xxiAhead (XXInput *input) {
122 if (input->pos >= input->strLen) {
123 input->eof = 1;
124 return 0;
126 return input->str[input->pos];
130 static void SEE_INPUT_NEXT (XXInput *input) {
131 input->pos++;
132 if (input->pos >= input->strLen) {
133 input->pos = input->strLen;
134 input->eof = 1;
139 static int SEE_input_lookahead_copy (XXInput *input, SEE_unicode_t *buf, int len) {
140 uint32_t opos = input->pos;
141 int f;
142 if (len <= 0) return 0;
143 memset(buf, 0, len);
144 for (f = 0; f < len; f++) {
145 if (input->pos >= input->strLen) break;
146 buf[f] = input->str[input->pos++];
148 input->pos = opos;
149 return f;
153 ////////////////////////////////////////////////////////////////////////////////
154 typedef struct rxGrowable {
155 uint8_t *data;
156 int used;
157 int size;
158 int is_string;
159 void **xptr;
160 int elSize;
161 unsigned int *usedPtr;
162 } rxGrowable;
165 static void SEE_grow_init (rxGrowable *gr, void **xptr, int elSize, unsigned int *usedPtr) {
166 if (elSize < 1) elSize = 1;
167 //fprintf(stderr, "GROW %p: elSize=%d\n", gr, elSize);
168 gr->elSize = elSize;
169 gr->data = NULL;
170 gr->used = gr->size = 0;
171 gr->is_string = 0;
172 gr->xptr = xptr;
173 if (gr->xptr) *(gr->xptr) = gr->data;
174 gr->usedPtr = usedPtr;
175 if (gr->usedPtr) *(gr->usedPtr) = gr->used;
178 #define SEE_GROW_INIT(gr, xptr, uptr) SEE_grow_init(gr, (void **)(&(xptr)), sizeof(*xptr), &uptr)
181 static void SEE_grow_to (rxGrowable *gr, int minUsed) {
182 if (minUsed >= gr->size) {
183 int nsz = minUsed+128;
184 void *p = realloc(gr->data, nsz*gr->elSize);
185 if (!p) abort();
186 gr->data = p;
187 gr->size = nsz;
188 if (gr->xptr) *(gr->xptr) = gr->data;
190 gr->used = minUsed;
191 if (gr->usedPtr) *(gr->usedPtr) = gr->used;
195 static void SEE_grow_deinit (rxGrowable *gr) {
196 if (gr) {
197 if (gr->data) free(gr->data);
202 ////////////////////////////////////////////////////////////////////////////////
203 #ifndef NDEBUG
204 int SEE_regex_debug = 1;
205 #endif
208 enum {
209 OP_FAIL, /* match failed */
210 OP_SUCCEED, /* match succeeded */
211 OP_CHAR, /* match a char class instance */
212 OP_ZERO, /* reset counter */
213 OP_REACH, /* test counter over */
214 OP_NREACH, /* test counter under */
215 OP_START, /* enter a group */
216 OP_END, /* exit a group */
217 OP_UNDEF, /* reset a group */
218 OP_MARK, /* record a position */
219 OP_FDIST, /* position test */
220 OP_RDIST, /* position and counter test */
221 OP_MNEXT, /* max-loop */
222 OP_RNEXT, /* reach-loop */
223 OP_GOTO, /* branch */
224 OP_GS, /* greedy success */
225 OP_NS, /* non-greedy success */
226 OP_GF, /* greedy fail */
227 OP_NF, /* non-greedy fail */
228 OP_AS, /* assert success */
229 OP_AF, /* assert fail */
230 OP_BOL, /* test beginning of line */
231 OP_EOL, /* test end of line */
232 OP_BRK, /* test word-break */
233 OP_NBRK, /* test non-word-break */
234 OP_BACKREF, /* backreference match */
236 OP_LAST
240 typedef struct CharClassRange CharClassRange;
241 typedef struct CharClass CharClass;
244 struct CharClassRange {
245 CharClassRange *next;
246 uint32_t lo; /* simple range of chars, eg [a-z] */
247 uint32_t hi;
248 CharClassRange *prev; // all allocated
252 struct CharClass {
253 CharClassRange *ranges; /* linked list of character ranges */
254 CharClass *prev; // all allocated
258 struct SEE_RegExpr {
259 int ncaptures, ncounters, nmarks, maxref;
260 int statesz;
261 unsigned char *code;
262 unsigned int codelen;
263 rxGrowable codegrow;
264 CharClass **cc;
265 unsigned int cclen;
266 rxGrowable ccgrow;
267 int flags;
268 CharClassRange *crList; // all allocated
269 CharClass *ccList;
270 jmp_buf errJP;
274 typedef struct REContext {
275 XXInput *input;
276 SEE_RegExpr *regex;
277 jmp_buf errJP;
278 } REContext;
281 ////////////////////////////////////////////////////////////////////////////////
282 #define SEE_NEW(eng, type) calloc(1, sizeof(type))
283 #define SEE_ASSERT(eng, cond) assert(cond)
286 #define NEW1(t) SEE_NEW(REContext->eng, t)
287 #define NEXT xxiAhead(REContext->input)
288 #define SKIP SEE_INPUT_NEXT(REContext->input)
289 #define ATEOF (REContext->input->eof)
291 #define LOOKAHEAD(buf, len) SEE_input_lookahead_copy(REContext->input, buf, len)
293 // SEE_error_throw_string(REContext->eng, REContext->eng->SyntaxError, STR(regex_syntax_error))
294 #define SYNTAX_ERROR \
296 /*fprintf(stderr, "REGEX: syntax error\n");*/ \
297 longjmp(REContext->errJP, 666); \
300 #define EXPECT(c) \
301 do { \
302 if (ATEOF || NEXT != (c)) SYNTAX_ERROR; \
303 SKIP; \
304 } while (0)
306 #define RELADDR(base, addr) ((addr)-(base))
307 #define ASSERT(x) SEE_ASSERT(REContext->eng, x)
308 #define CODE_ADD(c) code_add(REContext, c)
309 #define CODE_INSERT(pos, n) code_insert(REContext, pos, n)
310 #define CODE_POS REContext->regex->codelen
311 #define CODE_PATCH(pos, c) REContext->regex->code[pos] = c
313 #define CODE_ADDI(i) \
314 do { \
315 unsigned int _i = (i); \
316 CODE_ADD((_i >> 8) & 0xff); \
317 CODE_ADD(_i & 0xff); \
318 } while (0)
320 #define CODE_ADDA(i) CODE_ADDI(RELADDR(CODE_POS, i))
322 #define CODE_PATCHI(pos, i) \
323 do { CODE_PATCH(pos, ((i) >> 8) & 0xff); \
324 CODE_PATCH((pos)+1, (i) & 0xff); } while (0)
325 #define CODE_PATCHA(addr, i) CODE_PATCHI(addr, RELADDR(addr, i))
326 #define CODE_SZA 2
327 #define CODE_SZI 2
329 #define CODE_MAKEI(code, addr) ((code[addr] << 8) | code[(addr)+1])
330 #define CODE_MAKEA(code, addr) ((CODE_MAKEI(code, addr) + (addr)) & 0xffff)
332 #define CC_NEW() cc_new(REContext)
333 #define CC_ADDRANGE(cc, l, h) cc_add_range(REContext, cc, l, (h)+1)
334 #define CC_ADDCHAR(cc, ch) CC_ADDRANGE(cc, ch, ch)
335 #define CC_INVERT(cc) cc_invert(REContext, cc)
336 #define CC_ADDCC(dst, src) cc_add_cc(REContext, dst, src)
337 #define CC_INTERN(cc) cc_intern(REContext, cc)
339 #define UNDEFINED (-1)
340 #undef INFINITY /* XXX fixme - rename INFINITY in this file maybe? */
341 #define INFINITY (-1)
344 ////////////////////////////////////////////////////////////////////////////////
345 // Prototypes
347 static CharClass *cc_new (REContext *);
348 static void cc_add_range (REContext *, CharClass *, SEE_unicode_t, SEE_unicode_t);
349 static void cc_invert (REContext *, CharClass *);
350 static void cc_add_cc (REContext *, CharClass *, CharClass *);
351 static int cc_issingle (CharClass *);
352 static uint32_t cc_count (CharClass *);
353 static int cc_cmp (CharClass *, CharClass *);
354 static int cc_intern (REContext *, CharClass *);
355 static int cc_contains (CharClass *, SEE_unicode_t);
356 static SEE_RegExpr *regex_new (REContext *);
357 static void code_add (REContext *, int);
358 static void code_insert (REContext *, int, int);
360 static void Disjunction_parse (REContext *);
361 static void Alternative_parse (REContext *);
362 static void Term_parse (REContext *);
363 static int Quantifier_is_next (REContext *);
364 static int Integer_parse (REContext *);
365 static void Atom_parse (REContext *);
366 static unsigned char HexDigit_parse (REContext *);
367 static CharClass *ClassEscape_parse (REContext *);
368 static CharClass *ClassAtom_parse (REContext *);
369 static CharClass *CanonicalizeClass (REContext *, CharClass *);
370 static CharClass *CharacterClass_parse (REContext *);
372 #ifndef NDEBUG
373 static void dprint_ch (SEE_unicode_t);
374 static void dprint_cc (CharClass *);
375 static int dprint_code (SEE_RegExpr *, int);
376 static void dprint_regex (SEE_RegExpr *);
377 #endif
379 static SEE_unicode_t Canonicalize (SEE_RegExpr *, SEE_unicode_t);
380 static SEE_boolean_t pcode_run (SEE_RegExpr *regex, unsigned int addr, const char *text, int textLen, char *state);
381 static void optimize_regex (SEE_RegExpr *);
384 ////////////////////////////////////////////////////////////////////////////////
385 // CharClass
387 // Create a new, empty CharClass
388 static CharClass *cc_new (REContext *REContext) {
389 CharClass *c;
390 c = NEW1(struct CharClass);
391 c->ranges = NULL;
392 c->prev = REContext->regex->ccList;
393 REContext->regex->ccList = c;
394 return c;
398 // Add a range to a CharClass
399 static void cc_add_range (REContext *REContext, CharClass *c, SEE_unicode_t lo, SEE_unicode_t hi) {
400 CharClassRange *s, *newr = NULL;
401 CharClassRange **rp;
402 for (rp = &c->ranges; *rp; rp = &(*rp)->next) if (lo <= (*rp)->hi) break;
403 if (!*rp || hi < (*rp)->lo) {
404 newr = NEW1(struct CharClassRange);
405 newr->lo = lo;
406 newr->hi = hi;
407 newr->next = *rp;
408 *rp = newr;
409 newr->prev = REContext->regex->crList;
410 REContext->regex->crList = newr;
411 } else {
412 if (lo < (*rp)->lo) (*rp)->lo = lo;
413 if (hi > (*rp)->hi) {
414 (*rp)->hi = hi;
415 s = (*rp)->next;
416 while (s && s->hi < hi) (*rp)->next = s = s->next;
417 if (s && s->lo <= hi) {
418 (*rp)->hi = s->hi;
419 (*rp)->next = s->next;
426 // Invert a CharClass
427 static void cc_invert (REContext *REContext, CharClass *c) {
428 CharClassRange *l, *newlist, *r;
429 r = c->ranges;
430 if (r && r->lo == 0 && r->hi == ~0) {
431 c->ranges = NULL;
432 return;
434 l = newlist = NEW1(struct CharClassRange);
435 l->prev = REContext->regex->crList;
436 REContext->regex->crList = l;
437 if (r && r->lo == 0) {
438 l->lo = r->hi;
439 r = r->next;
440 } else l->lo = 0;
441 for (; r; r = r->next) {
442 l->hi = r->lo;
443 if (r->hi == ~0) {
444 l->next = NULL;
445 l = NULL;
446 break;
448 l = (l->next = NEW1(struct CharClassRange));
449 l->lo = r->hi;
450 l->prev = REContext->regex->crList;
451 REContext->regex->crList = l;
453 if (l) {
454 l->hi = ~0;
455 l->next = NULL;
457 c->ranges = newlist;
461 static void cc_add_cc (REContext *REContext, CharClass *dst, CharClass *src) {
462 CharClassRange *r;
463 //FIXME: very inefficient
464 for (r = src->ranges; r; r = r->next) cc_add_range(REContext, dst, r->lo, r->hi);
468 static int cc_issingle (CharClass *c) {
469 CharClassRange *r = c->ranges;
470 return r != NULL &&
471 r->next == NULL &&
472 r->lo+1 == r->hi;
476 // Return the number of characters in the class
477 static uint32_t cc_count (CharClass *c) {
478 uint32_t count = 0;
479 CharClassRange *r;
480 for (r = c->ranges; r; r = r->next) count += r->hi-r->lo;
481 return count;
485 // Return 0 if two CharClasses are identical
486 static int cc_cmp (CharClass *c1, CharClass *c2) {
487 CharClassRange *r1, *r2;
488 r1 = c1->ranges;
489 r2 = c2->ranges;
490 while (r1 && r2) {
491 if (r1->lo != r2->lo) return r1->lo-r2->lo;
492 if (r1->hi != r2->hi) return r1->hi-r2->hi;
493 r1 = r1->next;
494 r2 = r2->next;
496 if (r1) return 1;
497 if (r2) return -1;
498 return 0;
502 // Insert CharClass into REContext, returning unique ID
503 static int cc_intern (REContext *REContext, CharClass *c) {
504 SEE_RegExpr *regex = REContext->regex;
505 int i;
506 for (i = 0; i < regex->cclen; i++) if (cc_cmp(c, regex->cc[i]) == 0) return i;
507 i = regex->cclen;
508 SEE_grow_to(&regex->ccgrow, regex->cclen+1);
509 regex->cc[i] = c;
510 return i;
514 // Return true if CharClass c contains character ch
515 static int cc_contains (CharClass *c, SEE_unicode_t ch) {
516 CharClassRange *r;
517 for (r = c->ranges; r; r = r->next) {
518 if (ch >= r->lo && ch < r->hi) return 1;
519 if (ch < r->lo) return 0;
521 return 0;
525 ////////////////////////////////////////////////////////////////////////////////
526 #ifndef NDEBUG
527 // Print a character in a readable form
528 static void dprint_ch (SEE_unicode_t ch) {
529 switch (ch) {
530 case '{': case '}': case '[': case ']':
531 case '(': case ')': case '-':
532 case '.': case '^': case '$': case '|':
533 case '?': case '*': case '+': case '\\':
534 dprintf("\\%c", ch & 0x7f); break;
535 case 0x0000: dprintf("\\0"); break;
536 case 0x0009: dprintf("\\t"); break;
537 case 0x000a: dprintf("\\n"); break;
538 case 0x000b: dprintf("\\v"); break;
539 case 0x000c: dprintf("\\f"); break;
540 case 0x000d: dprintf("\\r"); break;
541 default:
542 if (ch >= ' ' && ch <= '~') dprintf("%c", ch & 0x7f);
543 else if (ch < 0x100) dprintf("\\x%02x", ch & 0xff);
544 else dprintf("\\u%04x", ch);
549 /// Print a character class in a readable form
550 static void dprint_cc (CharClass *c) {
551 CharClassRange *r;
552 dprintf("[");
553 if (c->ranges && c->ranges->lo == 0) {
554 dprintf("^");
555 for (r = c->ranges; r; r = r->next) {
556 if (r->next) {
557 dprint_ch(r->hi);
558 if (r->next->lo != r->hi + 1) {
559 dprintf("-");
560 dprint_ch(r->next->lo - 1);
562 } else if (r->hi != ~0) {
563 dprint_ch(r->hi);
564 dprintf("-");
565 dprint_ch(~0);
568 } else {
569 for (r = c->ranges; r; r = r->next) {
570 dprint_ch(r->lo);
571 if (r->hi != r->lo + 1) {
572 dprintf("-");
573 dprint_ch(r->hi - 1);
577 dprintf("]");
579 #endif
582 ////////////////////////////////////////////////////////////////////////////////
583 // regex and pcode construction
587 * Allocate a new regex structure. This will contain everything needed
588 * to run and match a string, independent of the original pattern text.
590 static SEE_RegExpr *regex_new (REContext *REContext) {
591 SEE_RegExpr *regex;
592 regex = NEW1(struct SEE_RegExpr);
593 regex->ncaptures = 0;
594 regex->maxref = 0;
595 regex->ncounters = 0;
596 regex->nmarks = 0;
597 regex->statesz = 0;
598 SEE_GROW_INIT(&regex->codegrow, regex->code, regex->codelen);
599 regex->codegrow.is_string = 1;
600 SEE_GROW_INIT(&regex->ccgrow, regex->cc, regex->cclen);
601 regex->flags = 0;
602 return regex;
606 // add to the end of the p-code array, resizing as needed
607 static void code_add (REContext *REContext, int c) {
608 SEE_RegExpr *regex = REContext->regex;
609 unsigned int i;
610 i = regex->codelen;
611 SEE_grow_to(&regex->codegrow, i+1);
612 regex->code[i] = c;
616 // insert some bytes into the middle of the p-code, resizing as needed
617 static void code_insert (REContext *REContext, int pos, int n) {
618 SEE_RegExpr *regex = REContext->regex;
619 int i;
620 for (i = 0; i < n; i++) code_add(REContext, 0);
621 for (i = regex->codelen - n; i > pos; i--) regex->code[i-1+n] = regex->code[i-1];
625 ////////////////////////////////////////////////////////////////////////////////
626 // Parser
628 * This recursive descent parser builds a p-code array as it runs.
629 * During recursion, the p-code array is sometimes 'back-patched'
630 * because branch distances weren't known in advance. In some
631 * cases, p-code segments are also shifted. This all means that we
632 * have to be very careful that our p-code is quite relocatable,
633 * and not dependendent on absolute addresses.
636 // parse a source pattern, and return a filled-in regex structure
637 SEE_RegExpr *SEE_regex_parse (const char *source, int patLen, int flags) {
638 REContext *REContext;
639 SEE_RegExpr *regex;
640 REContext = SEE_NEW(eng, struct REContext);
641 //REContext->input = SEE_input_lookahead(SEE_input_string(eng, source), 24);
642 REContext->input = xxiNew(source, patLen);
643 REContext->regex = regex = regex_new(REContext);
644 regex->flags = flags;
645 regex->ncaptures = 1;
646 if (setjmp(REContext->errJP)) {
647 free(REContext);
648 SEE_regex_free(regex);
649 return NULL;
651 Disjunction_parse(REContext);
652 if (!ATEOF) SYNTAX_ERROR;
653 CODE_ADD(OP_SUCCEED);
654 // Check that no backreferences were too big
655 if (regex->maxref >= regex->ncaptures) SYNTAX_ERROR;
656 //FIXME: XXX - should this close be enclosed in a 'finally'?
657 SEE_INPUT_CLOSE(REContext->input);
658 // compute the size of a captures context
659 regex->statesz =
660 regex->ncaptures*sizeof(Capture)+
661 regex->ncounters*sizeof(int)+
662 regex->nmarks*sizeof(int);
663 optimize_regex(regex);
664 #ifndef NDEBUG
665 if (SEE_regex_debug) {
666 dprintf("regex:");
667 dprint_regex(regex);
668 dprintf(".\n");
670 #endif
671 free(REContext);
672 return regex;
676 // Returns the number of capture parentheses in the compiled regex
677 int SEE_regex_count_captures (const SEE_RegExpr *regex) {
678 return regex->ncaptures;
682 // Returns the flags of the regex object
683 int SEE_regex_get_flags (const SEE_RegExpr *regex) {
684 return regex->flags;
689 * Disjunction :: Alternative
690 * Alternative | Disjunction
692 static void Disjunction_parse (REContext *REContext) {
693 int pos = CODE_POS;
694 Alternative_parse(REContext);
695 if (!ATEOF && NEXT == '|') {
696 int p = pos, p1, p2, x1, x2;
697 int insert = 1+CODE_SZA;
698 SKIP;
699 CODE_INSERT(pos, insert);
700 CODE_PATCH(p, OP_GF); p++; /* GF x1 */
701 p1 = p; p += CODE_SZA;
702 ASSERT(p == pos + insert); /* (a) */
703 CODE_ADD(OP_GOTO); /* GOTO x2 */
704 p2 = CODE_POS; CODE_ADDA(0);
705 x1 = CODE_POS; /* x1: (b) */
706 Disjunction_parse(REContext);
707 x2 = CODE_POS; /* x2: */
708 CODE_PATCHA(p1, x1);
709 CODE_PATCHA(p2, x2);
715 * Alternative :: [empty] -- lookahead in ) |
716 * Term
717 * Term Alternative
719 static void Alternative_parse (REContext *REContext) {
720 while (!(ATEOF || NEXT == /*(*/')' || NEXT == '|')) Term_parse(REContext);
724 /*------------------------------------------------------------
725 * Term ::
726 * Assertion la = ^ $ \b \B
727 * Atom
728 * Atom Quantifier
731 /* We have 24 bytes of lookahead, which is sufficient to
732 * scan for {2147483647,2147483647}. Anything larger will
733 * overflow the signed int type on 32 bit systems.
735 static int Quantifier_is_next (REContext *REContext) {
736 int pos, len;
737 SEE_unicode_t lookahead[24];
738 if (NEXT != '{') return 0;
740 * Strict ECMA-262 says that '{' is NOT a Pattern character,
741 * but Mozilla allows it
743 /*EXT:24*/
744 //if (!SEE_COMPAT_JS(REContext->eng, >=, JS11)) return 1;
745 len = LOOKAHEAD(lookahead, 24);
746 pos = 1;
747 while (pos < len && lookahead[pos] >= '0' && lookahead[pos] <= '9') pos++;
748 if (pos < len && lookahead[pos] == ',') pos++;
749 else if (pos < len && lookahead[pos] == '}') return pos > 1;
750 else return 0;
751 while (pos < len && lookahead[pos] >= '0' && lookahead[pos] <= '9') pos++;
752 if (pos < len && lookahead[pos] == '}') pos++; else return 0;
753 return 1;
757 static void Term_parse (REContext *REContext) {
758 int min, max, greedy, pos;
759 int oparen, cparen;
760 int lookahead_len;
761 SEE_unicode_t lookahead[2];
763 * parse Assertion inline since it is a bit special
764 * in terms of its lookahead
766 switch (NEXT) {
767 case '\\':
768 lookahead_len = LOOKAHEAD(lookahead, 2);
769 if (lookahead_len > 1 && lookahead[1] == 'b') {
770 SKIP; SKIP;
771 CODE_ADD(OP_BRK);
772 return;
774 if (lookahead_len > 1 && lookahead[1] == 'B') {
775 SKIP; SKIP;
776 CODE_ADD(OP_NBRK);
777 return;
779 // some other kind of escape
780 break;
781 case '^':
782 SKIP;
783 CODE_ADD(OP_BOL);
784 return;
785 case '$':
786 SKIP;
787 CODE_ADD(OP_EOL);
788 return;
789 // Lookaheads forbidden by the Atom production
790 case '*': case '+': case '?': case ')':
791 case ']': case '{': case '}': case '|':
792 SYNTAX_ERROR;
794 pos = CODE_POS;
795 oparen = REContext->regex->ncaptures;
796 Atom_parse(REContext);
797 cparen = REContext->regex->ncaptures;
799 * parse Quantifier inline to save my sanity
801 if (ATEOF) {
802 min = max = 1;
803 } else if (NEXT == '*') {
804 SKIP; min = 0; max = INFINITY;
805 } else if (NEXT == '+') {
806 SKIP; min = 1; max = INFINITY;
807 } else if (NEXT == '?') {
808 SKIP; min = 0; max = 1;
809 } else if (Quantifier_is_next(REContext)) {
810 SKIP;
811 min = Integer_parse(REContext);
812 if (!ATEOF && NEXT == ',') {
813 EXPECT(',');
814 if (!ATEOF && NEXT == '}') max = INFINITY;
815 else max = Integer_parse(REContext);
816 } else max = min;
817 EXPECT('}');
818 } else {
819 min = max = 1;
821 if (!ATEOF && NEXT == '?') {
822 SKIP; greedy = 0;
823 } else greedy = 1;
824 if (min == max && !greedy) {
826 * XXX should we warn that the greedy modifiers to
827 * 'a{n,n}?' and 'a{n}?' are technically meaningless?
828 * We speed up our code by using greedy mode anyway.
830 greedy = 1;
832 /* Don't allow stupid ranges, such as 'a{7,3}' */
833 if (max != INFINITY && min > max) SYNTAX_ERROR;
834 if (min == 1 && max == 1) return; /* a */
835 if (max == 0) {
836 /* a{0} */
837 /* Undo! */
838 CODE_POS = pos;
839 return;
841 if (oparen != cparen) {
843 * If the atom introduces capture parentheses,
844 * then insert code to reset them before each
845 * iteration.
847 CODE_INSERT(pos, 1 + 2*CODE_SZI);
848 CODE_PATCH(pos, OP_UNDEF);
849 CODE_PATCHI(pos + 1, oparen);
850 CODE_PATCHI(pos + 1 + CODE_SZI, cparen);
853 * The following code generators all generate looping
854 * matchers. While every corresponding pattern could
855 * be written as the generalised 'a{n,m}' (where m could
856 * be INFINITY), some efficiency is gained by selecting cases
857 * where general code that would be a no-op is not emitted.
859 if (min == max) { /* a{m} */
860 int p = pos, px;
861 int insert = 1 + CODE_SZI;
862 int c = REContext->regex->ncounters++;
863 CODE_INSERT(pos, insert);
864 CODE_PATCH(p, OP_ZERO); p++; /* ZERO c; */
865 CODE_PATCHI(p, c); p += CODE_SZI;
866 px = p; /* x: */
867 ASSERT(p == pos + insert); /* (a) */
868 CODE_ADD(OP_RNEXT); /* RNEXT c,m,x */
869 CODE_ADDI(c);
870 CODE_ADDI(max);
871 CODE_ADDA(px);
872 return;
874 if (min == 0 && max == 1) {
875 /* a? */
876 int p = pos, p1, px;
877 int insert = 1 + CODE_SZA;
878 CODE_INSERT(pos, insert);
879 CODE_PATCH(p, greedy ? OP_GF : OP_NF); /* GF x */
880 p++; p1 = p; p += CODE_SZA;
881 ASSERT(p == pos + insert); /* (a) */
882 px = CODE_POS; /* x: */
884 CODE_PATCHA(p1, px);
885 return;
887 if (min == 0 && max == INFINITY) {
888 /* a* */
889 int p = pos, px, py, p1;
890 int insert = 2 + CODE_SZA + CODE_SZI;
891 int m = REContext->regex->nmarks++;
892 CODE_INSERT(pos, insert);
893 px = p; /* x: GF y */
894 CODE_PATCH(p, greedy ? OP_GF : OP_NF); p++;
895 p1 = p; p += CODE_SZA;
896 CODE_PATCH(p, OP_MARK); p++; /* MARK m */
897 CODE_PATCHI(p, m); p += CODE_SZI;
898 ASSERT(p == pos + insert); /* (a) */
899 CODE_ADD(OP_FDIST); /* FDIST m */
900 CODE_ADDI(m);
901 CODE_ADD(OP_GOTO); /* GOTO x */
902 CODE_ADDA(px);
903 py = CODE_POS; /* y: */
904 CODE_PATCHA(p1, py);
905 return;
908 /* a{n,m} */
909 int p = pos, px, py, p1;
910 int insert = 3 + CODE_SZI * 2 + CODE_SZA;
911 int c = REContext->regex->ncounters++;
912 int k = REContext->regex->nmarks++;
913 CODE_INSERT(pos, insert);
914 CODE_PATCH(p, OP_ZERO); p++; /* ZERO c */
915 CODE_PATCHI(p, c); p += CODE_SZI;
916 px = p; /* x: GF y */
917 CODE_PATCH(p, greedy ? OP_GF : OP_NF); p++;
918 p1 = p; p += CODE_SZA;
919 CODE_PATCH(p, OP_MARK); p++; /* MARK k */
920 CODE_PATCHI(p, k); p += CODE_SZI;
921 ASSERT(p == pos + insert); /* (a) */
922 if (min) {
923 CODE_ADD(OP_RDIST); /* RDIST k,c,n */
924 CODE_ADDI(k);
925 CODE_ADDI(c);
926 CODE_ADDI(min);
927 } else {
928 CODE_ADD(OP_FDIST); /* FDIST k */
929 CODE_ADDI(k);
931 if (max != INFINITY) {
932 CODE_ADD(OP_RNEXT); /* RNEXT c,m,x */
933 CODE_ADDI(c);
934 CODE_ADDI(max);
935 CODE_ADDA(px);
936 } else {
937 CODE_ADD(OP_MNEXT); /* MNEXT c,n,x */
938 CODE_ADDI(c);
939 CODE_ADDI(min);
940 CODE_ADDA(px);
942 py = CODE_POS; /* y: */
943 if (min) {
944 CODE_ADD(OP_REACH); /* REACH c,n */
945 CODE_ADDI(c);
946 CODE_ADDI(min);
948 CODE_PATCHA(p1, py);
949 return;
954 // Parse a simple integer; used for repetition ranges
955 static int Integer_parse (REContext *REContext) {
956 int val;
957 int hasdig = 0;
958 val = 0;
959 while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
960 val = 10 * val + (NEXT-'0');
961 hasdig = 1;
962 SKIP;
964 if (!hasdig) SYNTAX_ERROR;
965 return val;
970 * Atom:: la != ^ $ * + ? ) ] { } |
971 * pattern character la != ^ $ \ . * + ? ( ) [ ] { } |
973 * \ AtomEscape
974 * [ CharacterClass ]
975 * ( Disjunction )
976 * ( ?: Disjunction )
977 * ( ?= Disjunction )
978 * ( ?! Disjunction )
980 static void Atom_parse (REContext *REContext) {
981 CharClass *c;
982 int i;
983 if (NEXT == '(') {
984 SKIP;
985 if (!ATEOF && NEXT == '?') { /* (?... */
986 SKIP;
987 if (!ATEOF && NEXT == ':') { /* (?:... */
988 SKIP;
989 Disjunction_parse(REContext);
990 } else if (!ATEOF && (NEXT == '=' || NEXT == '!')) {
991 int px, p1; /* (?=... */
992 int neg = (NEXT == '!'); /* (?!... */
993 SKIP;
994 CODE_ADD(neg ? OP_AF : OP_AS); /* AS x */
995 p1 = CODE_POS; CODE_ADDI(0);
996 Disjunction_parse(REContext); /* (a) */
997 CODE_ADD(OP_SUCCEED); /* SUCCEED */
998 px = CODE_POS; /* x: */
999 CODE_PATCHA(p1, px);
1000 } else SYNTAX_ERROR;
1001 } else { /* (...) */
1002 i = REContext->regex->ncaptures++;
1003 CODE_ADD(OP_START); /* START i */
1004 CODE_ADDI(i);
1005 Disjunction_parse(REContext); /* (a) */
1006 CODE_ADD(OP_END); /* END i */
1007 CODE_ADDI(i);
1009 EXPECT(')');
1010 return;
1013 * All other atoms compile to simple character class matches
1014 * (or backreferences)
1016 switch (NEXT) {
1017 case '\\':
1018 SKIP;
1019 if (ATEOF) SYNTAX_ERROR;
1020 if (NEXT >= '1' && NEXT <= '9') {
1021 i = 0;
1022 while (!ATEOF && (NEXT >= '0' && NEXT <= '9')) {
1023 i = 10 * i + NEXT - '0';
1024 SKIP;
1026 CODE_ADD(OP_BACKREF);
1027 CODE_ADDI(i);
1028 if (i > REContext->regex->maxref) REContext->regex->maxref = i;
1029 return;
1031 c = ClassEscape_parse(REContext);
1032 break;
1033 case '[':
1034 c = CharacterClass_parse(REContext);
1035 break;
1036 case '.':
1037 SKIP;
1038 c = CC_NEW();
1039 CC_ADDCHAR(c, 0x000a);
1040 CC_ADDCHAR(c, 0x000d);
1041 //CC_ADDCHAR(c, 0x2028);
1042 //CC_ADDCHAR(c, 0x2029);
1043 CC_INVERT(c);
1044 break;
1045 default:
1046 c = CC_NEW();
1047 CC_ADDCHAR(c, Canonicalize(REContext->regex, NEXT));
1048 SKIP;
1049 break;
1051 i = CC_INTERN(c);
1052 CODE_ADD(OP_CHAR);
1053 CODE_ADDI(i);
1057 static unsigned char HexDigit_parse (REContext *REContext) {
1058 SEE_unicode_t c;
1059 if (ATEOF) SYNTAX_ERROR;
1060 c = NEXT; SKIP;
1061 if (c >= '0' && c <= '9') return c-'0';
1062 if (c >= 'a' && c <= 'f') return c-'a'+10;
1063 if (c >= 'A' && c <= 'F') return c-'A'+10;
1064 SYNTAX_ERROR;
1068 static CharClass *ClassEscape_parse (REContext *REContext) {
1069 CharClass *c;
1070 SEE_unicode_t ch, lookahead[3];
1071 int i;
1072 // EXPECT('\\'); // backslash already skipped
1073 c = CC_NEW();
1074 if (NEXT >= '0' && NEXT <= '9') {
1075 // \0oo - 3 digit octal escapes
1076 /*EXT:25*/
1077 if (/*SEE_COMPAT_JS(REContext->eng, >=, JS11) &&*/ NEXT == '0' && LOOKAHEAD(lookahead, 3) >= 2 &&
1078 lookahead[1] >= '0' && lookahead[1] < '8' && lookahead[2] >= '0' && lookahead[2] < '8') {
1079 ch = (lookahead[1] - '0') * 8 + (lookahead[2] - '0');
1080 CC_ADDCHAR(c, ch);
1081 SKIP;
1082 SKIP;
1083 SKIP;
1084 return c;
1086 i = 0;
1087 while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
1088 i = 10 * i + NEXT - '0';
1089 SKIP;
1092 * 15.10.2.1.9: "Using a backreference inside a ClassAtom
1093 * causes an error"
1095 if (i != 0) SYNTAX_ERROR;
1096 CC_ADDCHAR(c, i);
1097 return c;
1099 ch = NEXT;
1100 SKIP;
1101 switch (ch) {
1102 case 'b': CC_ADDCHAR(c, 0x0008); break;
1103 case 't': CC_ADDCHAR(c, 0x0009); break;
1104 case 'n': CC_ADDCHAR(c, 0x000a); break;
1105 case 'v': CC_ADDCHAR(c, 0x000b); break;
1106 case 'f': CC_ADDCHAR(c, 0x000c); break;
1107 case 'r': CC_ADDCHAR(c, 0x000d); break;
1108 case 'D':
1109 case 'd':
1110 CC_ADDRANGE(c, '0', '9');
1111 if (ch == 'D') CC_INVERT(c);
1112 break;
1113 case 'W':
1114 case 'w':
1115 CC_ADDRANGE(c, 'a', 'z');
1116 CC_ADDRANGE(c, 'A', 'Z');
1117 CC_ADDRANGE(c, '0', '9');
1118 CC_ADDCHAR(c, '_');
1119 if (ch == 'W') CC_INVERT(c);
1120 break;
1121 case 'S':
1122 case 's':
1123 #if WITH_UNICODE_TABLES
1124 for (i = 0; i < SEE_unicode_Zscodeslen; i++) CC_ADDCHAR(c, SEE_unicode_Zscodes[i]);
1125 #else
1126 CC_ADDCHAR(c, 0x0020);
1127 #endif
1128 if (ch == 'S') CC_INVERT(c);
1129 break;
1130 case 'c':
1131 if (ATEOF) SYNTAX_ERROR;
1132 ch = NEXT; SKIP;
1133 if (('a' <= ch && ch <= 'z') ||
1134 ('A' <= ch && ch <= 'Z')) CC_ADDCHAR(c, ch % 32);
1135 else SYNTAX_ERROR;
1136 break;
1137 case 'u':
1138 case 'x':
1139 i = (ch == 'x' ? 2 : 4);
1140 ch = 0;
1141 while (i--) ch = (ch << 4) | HexDigit_parse(REContext);
1142 CC_ADDCHAR(c, ch);
1143 break;
1144 default:
1145 CC_ADDCHAR(c, ch);
1146 break;
1148 return c;
1153 * ClassAtom :
1154 * '\' ClassEscape
1155 * anychar
1157 static CharClass *ClassAtom_parse (REContext *REContext) {
1158 CharClass *c;
1159 if (ATEOF) SYNTAX_ERROR;
1160 if (NEXT == '\\') {
1161 SKIP;
1162 return ClassEscape_parse(REContext);
1164 c = CC_NEW();
1165 CC_ADDCHAR(c, NEXT);
1166 SKIP;
1167 return c;
1172 * Convert the CharClass into a canonicalised version. (SLOW!)
1173 * !!!Every character in the class has to be individually canonicalized!!!
1174 * !!!OUCH!!!OUCH!!!I!HATE!YOU!ALL!!!
1176 static CharClass *CanonicalizeClass (REContext *REContext, CharClass *c) {
1177 CharClass *ccanon;
1178 CharClassRange *r;
1179 SEE_unicode_t ch, uch;
1180 if (cc_count(c) > (uint32_t)~0 / 2) {
1181 CC_INVERT(c);
1182 ccanon = CanonicalizeClass(REContext, c);
1183 CC_INVERT(ccanon);
1184 return ccanon;
1187 * Evil hack: if the CharClass range includes ['A'-0xEFFFF], then
1188 * there is no need to canonicalize because every uppercase character
1189 * is already there.
1191 for (r = c->ranges; r; r = r->next) if (r->lo <= 'A' && r->hi > 0xf0000) return c;
1192 ccanon = CC_NEW();
1193 for (r = c->ranges; r; r = r->next) {
1194 for (ch = r->lo; ch < r->hi; ch++) {
1195 uch = UNICODE_TOUPPER(ch);
1196 CC_ADDCHAR(ccanon, uch);
1199 return ccanon;
1204 * CharacterClass ::
1205 * '[' [^] ( ClassAtom | ClassAtom '-' ClassAtom ) * ']'
1207 static CharClass *CharacterClass_parse (REContext *REContext) {
1208 CharClass *c = CC_NEW(), *a, *b;
1209 int invertflag = 0;
1210 EXPECT('[');
1211 if (!ATEOF && NEXT == '^') {
1212 invertflag = 1;
1213 SKIP;
1215 while (!ATEOF && NEXT != ']') {
1216 a = ClassAtom_parse(REContext);
1217 if (!ATEOF && NEXT == '-') {
1218 SKIP;
1219 // Treat '-' literally if at end of ClassRanges 15.10.2.16
1220 if (!ATEOF && NEXT == ']') {
1221 CC_ADDCHAR(a, '-');
1222 goto out;
1224 if (!cc_issingle(a)) SYNTAX_ERROR;
1225 b = ClassAtom_parse(REContext);
1226 if (!cc_issingle(b)) SYNTAX_ERROR;
1227 if (b->ranges->lo < a->ranges->lo) SYNTAX_ERROR;
1228 a->ranges->hi = b->ranges->hi;
1229 // free(b)
1231 out:
1232 CC_ADDCC(c, a);
1233 // free(a)
1235 //if (c->ranges == NULL) SYNTAX_ERROR;
1236 EXPECT(']');
1237 if (REContext->regex->flags & SEERX_FLAG_IGNORECASE) {
1238 c = CanonicalizeClass(REContext, c);
1239 // free(c); // i.e the old c
1241 if (invertflag) CC_INVERT(c);
1242 return c;
1246 ////////////////////////////////////////////////////////////////////////////////
1247 // P-code
1249 #ifndef NDEBUG
1250 static int dprint_code (SEE_RegExpr *regex, int addr) {
1251 int i;
1252 const char *op="", *opc;
1253 unsigned char *code = regex->code;
1254 dprintf("0x%04x: ", addr);
1255 switch (code[addr]) {
1256 case OP_FAIL: dprintf("FAIL"); op = ""; break;
1257 case OP_SUCCEED: dprintf("SUCCEED"); op = ""; break;
1258 case OP_CHAR: dprintf("CHAR"); op = "i"; break;
1259 case OP_ZERO: dprintf("ZERO"); op = "i"; break;
1260 case OP_REACH: dprintf("REACH"); op = "ii"; break;
1261 case OP_NREACH: dprintf("NREACH"); op = "ii"; break;
1262 case OP_START: dprintf("START"); op = "i"; break;
1263 case OP_END: dprintf("END"); op = "i"; break;
1264 case OP_UNDEF: dprintf("UNDEF"); op = "ii"; break;
1265 case OP_MARK: dprintf("MARK"); op = "i"; break;
1266 case OP_FDIST: dprintf("FDIST"); op = "i"; break;
1267 case OP_RDIST: dprintf("RDIST"); op = "iii"; break;
1268 case OP_MNEXT: dprintf("MNEXT"); op = "iia"; break;
1269 case OP_RNEXT: dprintf("RNEXT"); op = "iia"; break;
1270 case OP_GOTO: dprintf("GOTO"); op = "a"; break;
1271 case OP_GS: dprintf("GS"); op = "a"; break;
1272 case OP_NS: dprintf("NS"); op = "a"; break;
1273 case OP_GF: dprintf("GF"); op = "a"; break;
1274 case OP_NF: dprintf("NF"); op = "a"; break;
1275 case OP_AS: dprintf("AS"); op = "a"; break;
1276 case OP_AF: dprintf("AF"); op = "a"; break;
1277 case OP_BOL: dprintf("BOL"); op = ""; break;
1278 case OP_EOL: dprintf("EOL"); op = ""; break;
1279 case OP_BRK: dprintf("BRK"); op = ""; break;
1280 case OP_NBRK: dprintf("NBRK"); op = ""; break;
1281 case OP_BACKREF: dprintf("BACKREF"); op = "i"; break;
1282 default: dprintf("*** %d", code[addr]); break;
1284 addr++;
1285 for (opc = op; *opc; opc++) {
1286 if (opc != op) dprintf(",");
1287 dprintf(" ");
1288 switch (*opc) {
1289 case 'a':
1290 i = CODE_MAKEA(code, addr);
1291 dprintf("0x%04x", i);
1292 i = CODE_MAKEI(code, addr);
1293 dprintf(" [0x%04x]", i);
1294 addr += CODE_SZA;
1295 break;
1296 case 'i':
1297 i = CODE_MAKEI(code, addr); addr += CODE_SZI;
1298 dprintf("%d", i);
1299 break;
1300 case 'c':
1301 i = CODE_MAKEI(code, addr); addr += CODE_SZI;
1302 dprintf("%d=", i);
1303 if (i > regex->cclen) dprintf("**BAD**");
1304 else dprint_cc(regex->cc[i]);
1305 break;
1308 dprintf("\n");
1309 return addr;
1313 static void dprint_regex (SEE_RegExpr *regex) {
1314 int i, addr;
1315 CharClassRange *r;
1316 dprintf("regex %p\n", regex);
1317 dprintf("\tncaptures = %d\n", regex->ncaptures);
1318 dprintf("\tcodelen = %d\n", regex->codelen);
1319 dprintf("\tcclen = %d\n", regex->cclen);
1320 dprintf("\tflags = 0x%x\n", regex->flags);
1321 dprintf("\tcc:\n");
1322 for (i = 0; i < regex->cclen; i++) {
1323 dprintf("\t\t%d = ", i);
1324 dprint_cc(regex->cc[i]);
1325 dprintf("\n\t\t = { ");
1326 for (r = regex->cc[i]->ranges; r; r = r->next) dprintf("%x:%x ", r->lo, r->hi);
1327 dprintf("}\n");
1329 dprintf("\tcode:\n");
1330 addr = 0;
1331 while (addr < regex->codelen) addr = dprint_code(regex, addr);
1333 #endif
1336 ////////////////////////////////////////////////////////////////////////////////
1337 // pcode-execution
1339 /* 15.10.2.8 */
1340 static SEE_unicode_t Canonicalize (SEE_RegExpr *regex, SEE_unicode_t ch) {
1341 if (regex->flags & SEERX_FLAG_IGNORECASE) return UNICODE_TOUPPER(ch);
1342 return ch;
1346 static SEE_boolean_t pcode_run (SEE_RegExpr *regex, unsigned int addr, const char *text, int textLen, char *state) {
1347 SEE_boolean_t result;
1348 int i = 0, i2 = 0, i3 = 0;
1349 unsigned int a = 0;
1350 unsigned char op;
1351 SEE_unicode_t ch;
1352 Capture *capture;
1353 int *counter, *mark, statesz;
1354 unsigned int newaddr;
1355 char *newstate;
1356 //if (SEE_system.periodic) (*SEE_system.periodic)(eng);
1357 // Compute the offsets into the state structure
1358 statesz = 0;
1359 capture = (Capture *)state;
1360 statesz += regex->ncaptures * sizeof (Capture);
1361 counter = (int *)(state + statesz);
1362 statesz += regex->ncounters * sizeof (int);
1363 mark = (int *)(state + statesz);
1364 statesz += regex->nmarks * sizeof (int);
1365 SEE_ASSERT(eng, statesz == regex->statesz);
1366 newstate = calloc(statesz+1, sizeof(char));//SEE_STRING_ALLOCA(eng, char, statesz);
1367 #define index (capture[0].end)
1368 for (;;) {
1369 /* Catch bad branches */
1370 if (addr >= regex->codelen) {
1371 free(newstate);
1372 longjmp(regex->errJP, 666);
1373 //abort();
1374 //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1376 /* Read the opcode and its arguments */
1377 op = regex->code[addr];
1378 #ifndef NDEBUG
1379 if (SEE_regex_debug) {
1380 int x;
1381 //TJString mys;
1382 //mys.stringclass = 0;
1383 //mys.eng = 0;
1384 //mys.flags = 0;
1385 dprintf("index=%d captures=[", index);
1386 for (x = 0; x < regex->ncaptures; x++) {
1387 if (x) dprintf(",");
1388 if (capture[x].start == -1)
1389 dprintf("undef");
1390 else if (capture[x].start+capture[x].end > textLen) {
1391 dprintf("bad<%x:%x>",
1392 capture[x].start, capture[x].end);
1393 } else {
1394 int end = capture[x].end;
1395 if (end == -1) end = index;
1397 mys.length = end-capture[x].start;
1398 mys.data = text+capture[x].start;
1399 dprintf("\"");
1400 dprints(&mys);
1401 dprintf("\"");
1403 dprints(text+capture[x].start, end-capture[x].start);
1404 if (capture[x].end == -1) dprintf("+");
1407 dprintf("]");
1408 if (op == OP_ZERO || op == OP_REACH || op == OP_NREACH || op == OP_MNEXT || op == OP_RNEXT) {
1409 dprintf(" counter=[");
1410 for (x = 0; x < regex->ncounters; x++) {
1411 if (x) dprintf(",");
1412 dprintf("%d", counter[x]);
1414 dprintf("]");
1416 if (op == OP_MARK || op == OP_FDIST || op == OP_RDIST) {
1417 dprintf(" mark=[");
1418 for (x = 0; x < regex->nmarks; x++) {
1419 if (x) dprintf(",");
1420 dprintf("%d", mark[x]);
1422 dprintf("]");
1424 if (regex->code[addr] == OP_CHAR && index < textLen) dprintf(" ch='%c'", Canonicalize(regex, text[index]));
1425 dprintf("\n");
1426 (void)dprint_code(regex, addr);
1428 #endif
1429 addr++;
1430 switch (op) {
1431 case OP_FAIL: case OP_SUCCEED: case OP_BOL:
1432 case OP_EOL: case OP_BRK: case OP_NBRK:
1433 break;
1434 case OP_CHAR: case OP_ZERO: case OP_START:
1435 case OP_END: case OP_MARK: case OP_FDIST:
1436 case OP_BACKREF:
1437 i = CODE_MAKEI(regex->code, addr); /* integer arg */
1438 addr += CODE_SZI;
1439 break;
1440 case OP_REACH: case OP_NREACH: case OP_UNDEF:
1441 i = CODE_MAKEI(regex->code, addr); /* integer arg */
1442 addr += CODE_SZI;
1443 i2 = CODE_MAKEI(regex->code, addr); /* integer arg */
1444 addr += CODE_SZI;
1445 break;
1446 case OP_RDIST:
1447 i = CODE_MAKEI(regex->code, addr); /* integer arg */
1448 addr += CODE_SZI;
1449 i2 = CODE_MAKEI(regex->code, addr); /* integer arg */
1450 addr += CODE_SZI;
1451 i3 = CODE_MAKEI(regex->code, addr); /* integer arg */
1452 addr += CODE_SZI;
1453 break;
1454 case OP_MNEXT: case OP_RNEXT:
1455 i = CODE_MAKEI(regex->code, addr); /* integer arg */
1456 addr += CODE_SZI;
1457 i2 = CODE_MAKEI(regex->code, addr); /* integer arg */
1458 addr += CODE_SZI;
1459 a = CODE_MAKEA(regex->code, addr); /* address arg */
1460 addr += CODE_SZA;
1461 break;
1462 case OP_GOTO: case OP_GS: case OP_NS:
1463 case OP_GF: case OP_NF: case OP_AS:
1464 case OP_AF:
1465 a = CODE_MAKEA(regex->code, addr); /* address arg */
1466 addr += CODE_SZA;
1467 break;
1468 default:
1469 /* error */
1470 //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1471 //abort();
1472 free(newstate);
1473 longjmp(regex->errJP, 666);
1476 switch (op) {
1477 case OP_FAIL:
1478 free(newstate);
1479 return 0;
1480 case OP_SUCCEED:
1481 free(newstate);
1482 return 1;
1483 /* succeed if current character matches CharClass. index++ */
1484 case OP_CHAR:
1485 if (index < textLen) {
1486 ch = text[index++];
1487 /* N.B. strings are UTF-16 encoded! */
1489 if ((ch & 0xfc00) == 0xd800 && index < textLen && (text[index] & 0xfc00) == 0xdc00)
1490 ch = (((ch & 0x3ff) << 10) | (text[index++] & 0x3ff))+0x10000;
1492 ch = Canonicalize(regex, ch);
1493 if (!cc_contains(regex->cc[i], ch)) {
1494 free(newstate);
1495 return 0;
1497 } else {
1498 free(newstate);
1499 return 0;
1501 break;
1502 /* reset an iteration counter */
1503 case OP_ZERO: counter[i] = 0;
1504 break;
1505 /* fail if we havent reached a particular count */
1506 case OP_REACH:
1507 if (counter[i] < i2) {
1508 free(newstate);
1509 return 0;
1511 break;
1512 /* fail if we reached a particular count */
1513 case OP_NREACH:
1514 if (counter[i] >= i2) {
1515 free(newstate);
1516 return 0;
1518 break;
1519 /* start a capture group at current index */
1520 case OP_START:
1521 capture[i].start = index;
1522 capture[i].end = -1;
1523 break;
1524 /* finish a capture group at current index */
1525 case OP_END:
1526 capture[i].end = index;
1527 break;
1528 /* reset the given captures - usually done at a loop start */
1529 case OP_UNDEF:
1530 while (i < i2) {
1531 capture[i].start = -1;
1532 capture[i].end = -1;
1533 i++;
1535 break;
1536 /* Set a mark to the current index */
1537 case OP_MARK:
1538 mark[i] = index;
1539 break;
1540 /* fail if we haven't advanced past the mark */
1541 case OP_FDIST:
1542 if (mark[i] == index) {
1543 free(newstate);
1544 return 0;
1546 break;
1547 /* fail if haven't advanced past mark AND counter has reached a limit */
1548 case OP_RDIST:
1549 if (mark[i] == index && counter[i2] >= i3) {
1550 free(newstate);
1551 return 0;
1553 break;
1554 /* increment counter if it is less than n. always branch */
1555 case OP_MNEXT:
1556 if (counter[i] < i2) counter[i]++;
1557 addr = a;
1558 break;
1559 /* increment counter. if it is less than n, then branch */
1560 case OP_RNEXT:
1561 counter[i]++;
1562 if (counter[i] < i2) addr = a;
1563 break;
1564 case OP_GOTO:
1565 addr = a;
1566 break;
1567 /* operations that push state and branch for backtracking */
1568 case OP_GS: /* greedy success */
1569 case OP_NS: /* non-greedy success */
1570 case OP_GF: /* greedy fail */
1571 case OP_NF: /* greedy fail */
1572 case OP_AS: /* assert success */
1573 case OP_AF: /* assert fail */
1574 newaddr = (op == OP_NS || op == OP_NF) ? a : addr;
1575 memcpy(newstate, state, statesz);
1576 result = pcode_run(regex, newaddr, text, textLen, newstate);
1577 if (result) {
1578 if (op == OP_GF || op == OP_NF) {
1579 memcpy(state, newstate, statesz);
1580 free(newstate);
1581 return 1;
1582 } else if (op == OP_AF) {
1583 free(newstate);
1584 return 0;
1585 } else if (op == OP_AS) {
1586 int index_save = index;
1587 memcpy(state, newstate, statesz);
1588 index = index_save;
1589 addr = a;
1590 } else if (op == OP_GS) {
1591 addr = a;
1593 } else {
1594 if (op == OP_GS || op == OP_NS || op == OP_AS) {
1595 free(newstate);
1596 return 0;
1597 } else if (op == OP_GF || op == OP_AF) addr = a;
1599 break;
1600 /* succeed if we are at the beginning of a line */
1601 /* See 15.10.2.6 */
1602 case OP_BOL: /* ^ */
1603 if (index == 0) ; /* succeed */
1604 else if ((regex->flags & SEERX_FLAG_MULTILINE) == 0) { free(newstate); return 0; }
1605 else if (text[index-1] == 0x000a || /*LF*/
1606 text[index-1] == 0x000d) // || /*CR*/
1607 //text[index-1] == 0x2028 || /*LS*/
1608 //text[index-1] == 0x2029) /*PS*/
1609 ; /* succeed */
1610 else { free(newstate); return 0; }
1611 break;
1612 /* succeed if we are at the end of a line */
1613 case OP_EOL: /* $ */
1614 if (index == textLen) ; /* succeed */
1615 else if ((regex->flags & SEERX_FLAG_MULTILINE) == 0) { free(newstate); return 0; }
1616 else if (text[index] == 0x000a || /*LF*/
1617 text[index] == 0x000d) // || /*CR*/
1618 //text[index] == 0x2028 || /*LS*/
1619 //text[index] == 0x2029) /*PS*/
1620 ; /* succeed */
1621 else { free(newstate); return 0; }
1622 break;
1623 #define IsWordChar(e) ((e) >= 0 && (e) < textLen && ( \
1624 (text[e] >= 'a' && text[e] <= 'z') \
1625 || (text[e] >= 'A' && text[e] <= 'Z') \
1626 || (text[e] >= '0' && text[e] <= '9') \
1627 || text[e] == '_'))
1628 /* succeed if we are at a word break */
1629 case OP_BRK:
1630 case OP_NBRK: {
1631 SEE_boolean_t a, b;
1632 a = IsWordChar(index - 1);
1633 b = IsWordChar(index);
1634 if (op == OP_BRK) {
1635 if (a == b) { free(newstate); return 0; }
1636 } else {
1637 if (a != b) { free(newstate); return 0; }
1639 } break;
1640 /* succeed if we match a backreference */
1641 case OP_BACKREF:
1642 if (!SEERX_IS_CAPTURE_UNDEFINED(capture[i])) {
1643 int x, len, br;
1644 br = capture[i].start;
1645 len = capture[i].end - br;
1646 if (len+index > textLen) { free(newstate); return 0; }
1647 for (x = 0; x < len; x++)
1648 if (Canonicalize(regex, text[br+x]) != Canonicalize(regex, text[index+x])) {
1649 free(newstate);
1650 return 0;
1652 index += len;
1654 break;
1655 /* catch unexpected instructions */
1656 default:
1657 //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1658 //abort();
1659 free(newstate);
1660 longjmp(regex->errJP, 666);
1664 #undef index
1668 * Executes the regex on the text beginning at index.
1669 * Returns true of a match was successful.
1671 int SEE_regex_match (SEE_RegExpr *regex, const char *text, int textLen, int index, Capture *capture_ret) {
1672 int i, success;
1673 if (textLen < 0) textLen = text?strlen(text):0;
1674 char *state = calloc(regex->statesz+1, sizeof(char));//SEE_STRING_ALLOCA(eng, char, regex->statesz);
1675 Capture *capture = (Capture *)state;
1676 #ifndef NDEBUG
1677 memset(state, 0xd0, regex->statesz); /* catch bugs */
1678 #endif
1679 capture[0].start = index;
1680 capture[0].end = index;
1681 for (i = 1; i < regex->ncaptures; i++) {
1682 capture[i].start = -1;
1683 capture[i].end = -1;
1685 if (setjmp(regex->errJP)) {
1686 free(state);
1687 return 0;
1689 success = pcode_run(regex, 0, text, textLen, state);
1690 #ifndef NDEBUG
1691 if (SEE_regex_debug) dprintf(". %s\n", success ? "success" : "failure");
1692 #endif
1693 if (success && capture_ret) memcpy(capture_ret, capture, regex->ncaptures*sizeof(Capture));
1694 free(state);
1695 return success;
1699 /*------------------------------------------------------------
1700 * optimizer
1702 static void optimize_regex (SEE_RegExpr *regex) {
1704 * (nothing here yet)
1706 * possible optimisations include branch short-cuts,
1707 * and compiling the p-code to native machine instructions.
1712 void SEE_regex_free (SEE_RegExpr *regex) {
1713 if (regex) {
1714 CharClassRange *r, *n;
1715 for (n = NULL, r = regex->crList; r; r = n) {
1716 n = r->prev;
1717 free(r);
1719 CharClass *cr, *cn;
1720 for (cn = NULL, cr = regex->ccList; cr; cr = cn) {
1721 cn = cr->prev;
1722 free(cr);
1724 SEE_grow_deinit(&regex->codegrow);
1725 SEE_grow_deinit(&regex->ccgrow);
1726 free(regex);
1731 #ifdef __cplusplus
1733 #endif