src/felib/regex.c

   1 /*
   2  * Copyright (c) 2003
   3  *      David Leonard.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of David Leonard nor the names of its contributors
  14  *    may be used to endorse or promote products derived from this software
  15  *    without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  20  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  21  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  27  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  */
  30 /* $Id: regex.c 1282 2007-09-13 22:26:35Z d $ */
  31 #ifdef __cplusplus
  32 extern "C" {
  33 #endif
  34
  35 #include <assert.h>
  36 #include <ctype.h>
  37 #include <setjmp.h>
  38 #include <stdint.h>
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 #include <unistd.h>
  43
  44 #include "regex.h"
  45
  46 /*
  47  * Regular expression engine.
  48  *
  49  * This module contains a parser to compile ECMA-262 regular expressions
  50  * into 'p-code', and a matcher engine that runs the p-code against
  51  * strings.
  52  *
  53  * NOTE: ECMA-262 "regular" expressions are not actually regular in
  54  * a technical sense, since the presence of non-greedy and zero-width
  55  * lookahead patterns make the matching process an NP-complete algorithm
  56  * rather than a linear process implied by context-free finite automata.
  57  * (See the NP-complete regex discussion at <http://perl.plover.com/NPC/>)
  58  *
  59  * The regex parser generates a 'SEE_RegExpr' which contains the bytecode
  60  * ('p-code'), that when executed will try to match the pattern against the
  61  * given string, commencing at the given index into the string.
  62  */
  63 #ifndef NDEBUG
  64 # define NDEBUG
  65 #endif
  66
  67 #ifdef TJS_RE_DUMP
  68 # undef NDEBUG
  69 #endif
  70
  71
  72 ////////////////////////////////////////////////////////////////////////////////
  73 #ifndef NDEBUG
  74 # define dprintf(...)  fprintf(stderr, __VA_ARGS__)
  75 static void dprints (const char *str, int len) {
  76   fputc('"', stderr);
  77   for (; len > 0; len--, str++) {
  78     uint8_t cc = (uint8_t)(*str);
  79     if (cc < ' ') fputc('.', stderr); else fputc(cc, stderr);
  80   }
  81   fputc('"', stderr);
  82 }
  83 #else
  84 # define dprintf(...)
  85 # define dprints(...)
  86 #endif
  87
  88
  89 ////////////////////////////////////////////////////////////////////////////////
  90 typedef unsigned char  SEE_unicode_t;
  91 typedef int  SEE_boolean_t;
  92
  93 #define UNICODE_TOUPPER(ch)  toupper(ch)
  94
  95
  96 ////////////////////////////////////////////////////////////////////////////////
  97 typedef struct {
  98   const char *str;
  99   int strLen;
 100   int pos; // next char to be taken
 101   int eof;
 102 } XXInput;
 103
 104
 105 static XXInput *xxiNew (const char *source, int srcLen) {
 106   XXInput *res = calloc(1, sizeof(XXInput));
 107   if (srcLen < 0) srcLen = strlen(source);
 108   res->str = source;
 109   res->strLen = srcLen;
 110   res->pos = 0;
 111   res->eof = srcLen<=0;
 112   return res;
 113 }
 114
 115
 116 static void SEE_INPUT_CLOSE (XXInput *input) {
 117   free(input);
 118 }
 119
 120
 121 static SEE_unicode_t xxiAhead (XXInput *input) {
 122   if (input->pos >= input->strLen) {
 123     input->eof = 1;
 124     return 0;
 125   }
 126   return input->str[input->pos];
 127 }
 128
 129
 130 static void SEE_INPUT_NEXT (XXInput *input) {
 131   input->pos++;
 132   if (input->pos >= input->strLen) {
 133     input->pos = input->strLen;
 134     input->eof = 1;
 135   }
 136 }
 137
 138
 139 static int SEE_input_lookahead_copy (XXInput *input, SEE_unicode_t *buf, int len) {
 140   uint32_t opos = input->pos;
 141   int f;
 142   if (len <= 0) return 0;
 143   memset(buf, 0, len);
 144   for (f = 0; f < len; f++) {
 145     if (input->pos >= input->strLen) break;
 146     buf[f] = input->str[input->pos++];
 147   }
 148   input->pos = opos;
 149   return f;
 150 }
 151
 152
 153 ////////////////////////////////////////////////////////////////////////////////
 154 typedef struct rxGrowable {
 155   uint8_t *data;
 156   int used;
 157   int size;
 158   int is_string;
 159   void **xptr;
 160   int elSize;
 161   unsigned int *usedPtr;
 162 } rxGrowable;
 163
 164
 165 static void SEE_grow_init (rxGrowable *gr, void **xptr, int elSize, unsigned int *usedPtr) {
 166   if (elSize < 1) elSize = 1;
 167   //fprintf(stderr, "GROW %p: elSize=%d\n", gr, elSize);
 168   gr->elSize = elSize;
 169   gr->data = NULL;
 170   gr->used = gr->size = 0;
 171   gr->is_string = 0;
 172   gr->xptr = xptr;
 173   if (gr->xptr) *(gr->xptr) = gr->data;
 174   gr->usedPtr = usedPtr;
 175   if (gr->usedPtr) *(gr->usedPtr) = gr->used;
 176 }
 177
 178 #define SEE_GROW_INIT(gr, xptr, uptr)  SEE_grow_init(gr, (void **)(&(xptr)), sizeof(*xptr), &uptr)
 179
 180
 181 static void SEE_grow_to (rxGrowable *gr, int minUsed) {
 182   if (minUsed >= gr->size) {
 183     int nsz = minUsed+128;
 184     void *p = realloc(gr->data, nsz*gr->elSize);
 185     if (!p) abort();
 186     gr->data = p;
 187     gr->size = nsz;
 188     if (gr->xptr) *(gr->xptr) = gr->data;
 189   }
 190   gr->used = minUsed;
 191   if (gr->usedPtr) *(gr->usedPtr) = gr->used;
 192 }
 193
 194
 195 static void SEE_grow_deinit (rxGrowable *gr) {
 196   if (gr) {
 197     if (gr->data) free(gr->data);
 198   }
 199 }
 200
 201
 202 ////////////////////////////////////////////////////////////////////////////////
 203 #ifndef NDEBUG
 204 int SEE_regex_debug = 1;
 205 #endif
 206
 207
 208 enum {
 209   OP_FAIL,    /* match failed */
 210   OP_SUCCEED, /* match succeeded */
 211   OP_CHAR,    /* match a char class instance */
 212   OP_ZERO,    /* reset counter */
 213   OP_REACH,   /* test counter over */
 214   OP_NREACH,  /* test counter under */
 215   OP_START,   /* enter a group */
 216   OP_END,     /* exit a group */
 217   OP_UNDEF,   /* reset a group */
 218   OP_MARK,    /* record a position */
 219   OP_FDIST,   /* position test */
 220   OP_RDIST,   /* position and counter test */
 221   OP_MNEXT,   /* max-loop */
 222   OP_RNEXT,   /* reach-loop */
 223   OP_GOTO,    /* branch */
 224   OP_GS,      /* greedy success */
 225   OP_NS,      /* non-greedy success */
 226   OP_GF,      /* greedy fail */
 227   OP_NF,      /* non-greedy fail */
 228   OP_AS,      /* assert success */
 229   OP_AF,      /* assert fail */
 230   OP_BOL,     /* test beginning of line */
 231   OP_EOL,     /* test end of line */
 232   OP_BRK,     /* test word-break */
 233   OP_NBRK,    /* test non-word-break */
 234   OP_BACKREF, /* backreference match */
 235
 236   OP_LAST
 237 };
 238
 239
 240 typedef struct CharClassRange CharClassRange;
 241 typedef struct CharClass CharClass;
 242
 243
 244 struct CharClassRange {
 245   CharClassRange *next;
 246   uint32_t lo; /* simple range of chars, eg [a-z] */
 247   uint32_t hi;
 248   CharClassRange *prev; // all allocated
 249 };
 250
 251
 252 struct CharClass {
 253   CharClassRange *ranges; /* linked list of character ranges */
 254   CharClass *prev; // all allocated
 255 };
 256
 257
 258 struct SEE_RegExpr {
 259   int ncaptures, ncounters, nmarks, maxref;
 260   int statesz;
 261   unsigned char *code;
 262   unsigned int codelen;
 263   rxGrowable codegrow;
 264   CharClass **cc;
 265   unsigned int cclen;
 266   rxGrowable ccgrow;
 267   int flags;
 268   CharClassRange *crList; // all allocated
 269   CharClass *ccList;
 270   jmp_buf errJP;
 271 };
 272
 273
 274 typedef struct REContext {
 275   XXInput *input;
 276   SEE_RegExpr *regex;
 277   jmp_buf errJP;
 278 } REContext;
 279
 280
 281 ////////////////////////////////////////////////////////////////////////////////
 282 #define SEE_NEW(eng, type)     calloc(1, sizeof(type))
 283 #define SEE_ASSERT(eng, cond)  assert(cond)
 284
 285
 286 #define NEW1(t)     SEE_NEW(REContext->eng, t)
 287 #define NEXT        xxiAhead(REContext->input)
 288 #define SKIP        SEE_INPUT_NEXT(REContext->input)
 289 #define ATEOF       (REContext->input->eof)
 290
 291 #define LOOKAHEAD(buf, len)  SEE_input_lookahead_copy(REContext->input, buf, len)
 292
 293 //  SEE_error_throw_string(REContext->eng, REContext->eng->SyntaxError, STR(regex_syntax_error))
 294 #define SYNTAX_ERROR \
 295   { \
 296     /*fprintf(stderr, "REGEX: syntax error\n");*/ \
 297     longjmp(REContext->errJP, 666); \
 298   }
 299
 300 #define EXPECT(c) \
 301   do { \
 302     if (ATEOF || NEXT != (c)) SYNTAX_ERROR; \
 303     SKIP; \
 304   } while (0)
 305
 306 #define RELADDR(base, addr)  ((addr)-(base))
 307 #define ASSERT(x)            SEE_ASSERT(REContext->eng, x)
 308 #define CODE_ADD(c)          code_add(REContext, c)
 309 #define CODE_INSERT(pos, n)  code_insert(REContext, pos, n)
 310 #define CODE_POS             REContext->regex->codelen
 311 #define CODE_PATCH(pos, c)   REContext->regex->code[pos] = c
 312
 313 #define CODE_ADDI(i) \
 314   do { \
 315     unsigned int _i = (i); \
 316     CODE_ADD((_i >> 8) & 0xff); \
 317     CODE_ADD(_i & 0xff); \
 318  } while (0)
 319
 320 #define CODE_ADDA(i)  CODE_ADDI(RELADDR(CODE_POS, i))
 321
 322 #define CODE_PATCHI(pos, i)           \
 323     do { CODE_PATCH(pos, ((i) >> 8) & 0xff);      \
 324    CODE_PATCH((pos)+1, (i) & 0xff); } while (0)
 325 #define CODE_PATCHA(addr, i)  CODE_PATCHI(addr, RELADDR(addr, i))
 326 #define CODE_SZA  2
 327 #define CODE_SZI  2
 328
 329 #define CODE_MAKEI(code, addr)  ((code[addr] << 8) | code[(addr)+1])
 330 #define CODE_MAKEA(code, addr)  ((CODE_MAKEI(code, addr) + (addr)) & 0xffff)
 331
 332 #define CC_NEW()                cc_new(REContext)
 333 #define CC_ADDRANGE(cc, l, h)   cc_add_range(REContext, cc, l, (h)+1)
 334 #define CC_ADDCHAR(cc, ch)      CC_ADDRANGE(cc, ch, ch)
 335 #define CC_INVERT(cc)         cc_invert(REContext, cc)
 336 #define CC_ADDCC(dst, src)      cc_add_cc(REContext, dst, src)
 337 #define CC_INTERN(cc)         cc_intern(REContext, cc)
 338
 339 #define UNDEFINED   (-1)
 340 #undef INFINITY /* XXX fixme - rename INFINITY in this file maybe? */
 341 #define INFINITY  (-1)
 342
 343
 344 ////////////////////////////////////////////////////////////////////////////////
 345 // Prototypes
 346 //
 347 static CharClass *cc_new (REContext *);
 348 static void cc_add_range (REContext *, CharClass *, SEE_unicode_t, SEE_unicode_t);
 349 static void cc_invert (REContext *, CharClass *);
 350 static void cc_add_cc (REContext *, CharClass *, CharClass *);
 351 static int cc_issingle (CharClass *);
 352 static uint32_t cc_count (CharClass *);
 353 static int cc_cmp (CharClass *, CharClass *);
 354 static int cc_intern (REContext *, CharClass *);
 355 static int cc_contains (CharClass *, SEE_unicode_t);
 356 static SEE_RegExpr *regex_new (REContext *);
 357 static void code_add (REContext *, int);
 358 static void code_insert (REContext *, int, int);
 359
 360 static void Disjunction_parse (REContext *);
 361 static void Alternative_parse (REContext *);
 362 static void Term_parse (REContext *);
 363 static int Quantifier_is_next (REContext *);
 364 static int Integer_parse (REContext *);
 365 static void Atom_parse (REContext *);
 366 static unsigned char HexDigit_parse (REContext *);
 367 static CharClass *ClassEscape_parse (REContext *);
 368 static CharClass *ClassAtom_parse (REContext *);
 369 static CharClass *CanonicalizeClass (REContext *, CharClass *);
 370 static CharClass *CharacterClass_parse (REContext *);
 371
 372 #ifndef NDEBUG
 373 static void dprint_ch (SEE_unicode_t);
 374 static void dprint_cc (CharClass *);
 375 static int  dprint_code (SEE_RegExpr *, int);
 376 static void dprint_regex (SEE_RegExpr *);
 377 #endif
 378
 379 static SEE_unicode_t Canonicalize (SEE_RegExpr *, SEE_unicode_t);
 380 static SEE_boolean_t pcode_run (SEE_RegExpr *regex, unsigned int addr, const char *text, int textLen, char *state);
 381 static void optimize_regex (SEE_RegExpr *);
 382
 383
 384 ////////////////////////////////////////////////////////////////////////////////
 385 // CharClass
 386 //
 387 // Create a new, empty CharClass
 388 static CharClass *cc_new (REContext *REContext) {
 389   CharClass *c;
 390   c = NEW1(struct CharClass);
 391   c->ranges = NULL;
 392   c->prev = REContext->regex->ccList;
 393   REContext->regex->ccList = c;
 394   return c;
 395 }
 396
 397
 398 // Add a range to a CharClass
 399 static void cc_add_range (REContext *REContext, CharClass *c, SEE_unicode_t lo, SEE_unicode_t hi) {
 400   CharClassRange *s, *newr = NULL;
 401   CharClassRange **rp;
 402   for (rp = &c->ranges; *rp; rp = &(*rp)->next) if (lo <= (*rp)->hi) break;
 403   if (!*rp || hi < (*rp)->lo) {
 404     newr = NEW1(struct CharClassRange);
 405     newr->lo = lo;
 406     newr->hi = hi;
 407     newr->next = *rp;
 408     *rp = newr;
 409     newr->prev = REContext->regex->crList;
 410     REContext->regex->crList = newr;
 411   } else {
 412     if (lo < (*rp)->lo) (*rp)->lo = lo;
 413     if (hi > (*rp)->hi) {
 414       (*rp)->hi = hi;
 415       s = (*rp)->next;
 416       while (s && s->hi < hi) (*rp)->next = s = s->next;
 417       if (s && s->lo <= hi) {
 418         (*rp)->hi = s->hi;
 419         (*rp)->next = s->next;
 420       }
 421     }
 422   }
 423 }
 424
 425
 426 // Invert a CharClass
 427 static void cc_invert (REContext *REContext, CharClass *c) {
 428   CharClassRange *l, *newlist, *r;
 429   r = c->ranges;
 430   if (r && r->lo == 0 && r->hi == ~0) {
 431     c->ranges = NULL;
 432     return;
 433   }
 434   l = newlist = NEW1(struct CharClassRange);
 435   l->prev = REContext->regex->crList;
 436   REContext->regex->crList = l;
 437   if (r && r->lo == 0) {
 438     l->lo = r->hi;
 439     r = r->next;
 440   } else l->lo = 0;
 441   for (; r; r = r->next) {
 442     l->hi = r->lo;
 443     if (r->hi == ~0) {
 444       l->next = NULL;
 445       l = NULL;
 446       break;
 447     }
 448     l = (l->next = NEW1(struct CharClassRange));
 449     l->lo = r->hi;
 450     l->prev = REContext->regex->crList;
 451     REContext->regex->crList = l;
 452   }
 453   if (l) {
 454     l->hi = ~0;
 455     l->next = NULL;
 456   }
 457   c->ranges = newlist;
 458 }
 459
 460
 461 static void cc_add_cc (REContext *REContext, CharClass *dst, CharClass *src) {
 462   CharClassRange *r;
 463   //FIXME: very inefficient
 464   for (r = src->ranges; r; r = r->next) cc_add_range(REContext, dst, r->lo, r->hi);
 465 }
 466
 467
 468 static int cc_issingle (CharClass *c) {
 469   CharClassRange *r = c->ranges;
 470   return r != NULL &&
 471          r->next == NULL &&
 472          r->lo+1 == r->hi;
 473 }
 474
 475
 476 // Return the number of characters in the class
 477 static uint32_t cc_count (CharClass *c) {
 478   uint32_t count = 0;
 479   CharClassRange *r;
 480   for (r = c->ranges; r; r = r->next) count += r->hi-r->lo;
 481   return count;
 482 }
 483
 484
 485 // Return 0 if two CharClasses are identical
 486 static int cc_cmp (CharClass *c1, CharClass *c2) {
 487   CharClassRange *r1, *r2;
 488   r1 = c1->ranges;
 489   r2 = c2->ranges;
 490   while (r1 && r2) {
 491     if (r1->lo != r2->lo) return r1->lo-r2->lo;
 492     if (r1->hi != r2->hi) return r1->hi-r2->hi;
 493     r1 = r1->next;
 494     r2 = r2->next;
 495   }
 496   if (r1) return 1;
 497   if (r2) return -1;
 498   return 0;
 499 }
 500
 501
 502 // Insert CharClass into REContext, returning unique ID
 503 static int cc_intern (REContext *REContext, CharClass *c) {
 504   SEE_RegExpr *regex = REContext->regex;
 505   int i;
 506   for (i = 0; i < regex->cclen; i++) if (cc_cmp(c, regex->cc[i]) == 0) return i;
 507   i = regex->cclen;
 508   SEE_grow_to(&regex->ccgrow, regex->cclen+1);
 509   regex->cc[i] = c;
 510   return i;
 511 }
 512
 513
 514 // Return true if CharClass c contains character ch
 515 static int cc_contains (CharClass *c, SEE_unicode_t ch) {
 516   CharClassRange *r;
 517   for (r = c->ranges; r; r = r->next) {
 518     if (ch >= r->lo && ch < r->hi) return 1;
 519     if (ch < r->lo) return 0;
 520   }
 521   return 0;
 522 }
 523
 524
 525 ////////////////////////////////////////////////////////////////////////////////
 526 #ifndef NDEBUG
 527 // Print a character in a readable form
 528 static void dprint_ch (SEE_unicode_t ch) {
 529   switch (ch) {
 530     case '{': case '}': case '[': case ']':
 531     case '(': case ')': case '-':
 532     case '.': case '^': case '$': case '|':
 533     case '?': case '*': case '+': case '\\':
 534       dprintf("\\%c", ch & 0x7f); break;
 535     case 0x0000: dprintf("\\0"); break;
 536     case 0x0009: dprintf("\\t"); break;
 537     case 0x000a: dprintf("\\n"); break;
 538     case 0x000b: dprintf("\\v"); break;
 539     case 0x000c: dprintf("\\f"); break;
 540     case 0x000d: dprintf("\\r"); break;
 541     default:
 542       if (ch >= ' ' && ch <= '~') dprintf("%c", ch & 0x7f);
 543       else if (ch < 0x100) dprintf("\\x%02x", ch & 0xff);
 544       else dprintf("\\u%04x", ch);
 545   }
 546 }
 547
 548
 549 /// Print a character class in a readable form
 550 static void dprint_cc (CharClass *c) {
 551   CharClassRange *r;
 552   dprintf("[");
 553   if (c->ranges && c->ranges->lo == 0) {
 554     dprintf("^");
 555     for (r = c->ranges; r; r = r->next) {
 556       if (r->next) {
 557         dprint_ch(r->hi);
 558         if (r->next->lo != r->hi + 1) {
 559           dprintf("-");
 560           dprint_ch(r->next->lo - 1);
 561         }
 562       } else if (r->hi != ~0) {
 563         dprint_ch(r->hi);
 564         dprintf("-");
 565         dprint_ch(~0);
 566       }
 567     }
 568   } else {
 569     for (r = c->ranges; r; r = r->next) {
 570       dprint_ch(r->lo);
 571       if (r->hi != r->lo + 1) {
 572         dprintf("-");
 573         dprint_ch(r->hi - 1);
 574       }
 575     }
 576   }
 577   dprintf("]");
 578 }
 579 #endif
 580
 581
 582 ////////////////////////////////////////////////////////////////////////////////
 583 // regex and pcode construction
 584 //
 585
 586 /*
 587  * Allocate a new regex structure. This will contain everything needed
 588  * to run and match a string, independent of the original pattern text.
 589  */
 590 static SEE_RegExpr *regex_new (REContext *REContext) {
 591   SEE_RegExpr *regex;
 592   regex = NEW1(struct SEE_RegExpr);
 593   regex->ncaptures = 0;
 594   regex->maxref = 0;
 595   regex->ncounters = 0;
 596   regex->nmarks = 0;
 597   regex->statesz = 0;
 598   SEE_GROW_INIT(&regex->codegrow, regex->code, regex->codelen);
 599   regex->codegrow.is_string = 1;
 600   SEE_GROW_INIT(&regex->ccgrow, regex->cc, regex->cclen);
 601   regex->flags = 0;
 602   return regex;
 603 }
 604
 605
 606 // add to the end of the p-code array, resizing as needed
 607 static void code_add (REContext *REContext, int c) {
 608   SEE_RegExpr *regex = REContext->regex;
 609   unsigned int i;
 610   i = regex->codelen;
 611   SEE_grow_to(&regex->codegrow, i+1);
 612   regex->code[i] = c;
 613 }
 614
 615
 616 // insert some bytes into the middle of the p-code, resizing as needed
 617 static void code_insert (REContext *REContext, int pos, int n) {
 618   SEE_RegExpr *regex = REContext->regex;
 619   int i;
 620   for (i = 0; i < n; i++) code_add(REContext, 0);
 621   for (i = regex->codelen - n; i > pos; i--) regex->code[i-1+n] = regex->code[i-1];
 622 }
 623
 624
 625 ////////////////////////////////////////////////////////////////////////////////
 626 // Parser
 627 /*
 628  * This recursive descent parser builds a p-code array as it runs.
 629  * During recursion, the p-code array is sometimes 'back-patched'
 630  * because branch distances weren't known in advance. In some
 631  * cases, p-code segments are also shifted. This all means that we
 632  * have to be very careful that our p-code is quite relocatable,
 633  * and not dependendent on absolute addresses.
 634  */
 635
 636 // parse a source pattern, and return a filled-in regex structure
 637 SEE_RegExpr *SEE_regex_parse (const char *source, int patLen, int flags) {
 638   REContext *REContext;
 639   SEE_RegExpr *regex;
 640   REContext = SEE_NEW(eng, struct REContext);
 641   //REContext->input = SEE_input_lookahead(SEE_input_string(eng, source), 24);
 642   REContext->input = xxiNew(source, patLen);
 643   REContext->regex = regex = regex_new(REContext);
 644   regex->flags = flags;
 645   regex->ncaptures = 1;
 646   if (setjmp(REContext->errJP)) {
 647     free(REContext);
 648     SEE_regex_free(regex);
 649     return NULL;
 650   }
 651   Disjunction_parse(REContext);
 652   if (!ATEOF) SYNTAX_ERROR;
 653   CODE_ADD(OP_SUCCEED);
 654   // Check that no backreferences were too big
 655   if (regex->maxref >= regex->ncaptures) SYNTAX_ERROR;
 656   //FIXME: XXX - should this close be enclosed in a 'finally'?
 657   SEE_INPUT_CLOSE(REContext->input);
 658   // compute the size of a captures context
 659   regex->statesz =
 660     regex->ncaptures*sizeof(Capture)+
 661     regex->ncounters*sizeof(int)+
 662     regex->nmarks*sizeof(int);
 663   optimize_regex(regex);
 664 #ifndef NDEBUG
 665   if (SEE_regex_debug) {
 666     dprintf("regex:");
 667     dprint_regex(regex);
 668     dprintf(".\n");
 669   }
 670 #endif
 671   free(REContext);
 672   return regex;
 673 }
 674
 675
 676 // Returns the number of capture parentheses in the compiled regex
 677 int SEE_regex_count_captures (const SEE_RegExpr *regex) {
 678   return regex->ncaptures;
 679 }
 680
 681
 682 // Returns the flags of the regex object
 683 int SEE_regex_get_flags (const SEE_RegExpr *regex) {
 684   return regex->flags;
 685 }
 686
 687
 688 /*
 689  * Disjunction :: Alternative
 690  *      Alternative | Disjunction
 691  */
 692 static void Disjunction_parse (REContext *REContext) {
 693   int pos = CODE_POS;
 694   Alternative_parse(REContext);
 695   if (!ATEOF && NEXT == '|') {
 696     int p = pos, p1, p2, x1, x2;
 697     int insert = 1+CODE_SZA;
 698     SKIP;
 699     CODE_INSERT(pos, insert);
 700     CODE_PATCH(p, OP_GF); p++;    /* GF x1 */
 701     p1 = p; p += CODE_SZA;
 702     ASSERT(p == pos + insert);    /* (a) */
 703     CODE_ADD(OP_GOTO);      /* GOTO x2 */
 704     p2 = CODE_POS; CODE_ADDA(0);
 705     x1 = CODE_POS;        /* x1: (b) */
 706     Disjunction_parse(REContext);
 707     x2 = CODE_POS;        /* x2: */
 708     CODE_PATCHA(p1, x1);
 709     CODE_PATCHA(p2, x2);
 710   }
 711 }
 712
 713
 714 /*
 715  * Alternative :: [empty]   -- lookahead in ) |
 716  *      Term
 717  *      Term Alternative
 718  */
 719 static void Alternative_parse (REContext *REContext) {
 720   while (!(ATEOF || NEXT == /*(*/')' || NEXT == '|')) Term_parse(REContext);
 721 }
 722
 723
 724 /*------------------------------------------------------------
 725  * Term ::
 726  *  Assertion   la = ^ $ \b \B
 727  *  Atom
 728  *  Atom Quantifier
 729  */
 730
 731 /* We have 24 bytes of lookahead, which is sufficient to
 732  * scan for {2147483647,2147483647}. Anything larger will
 733  * overflow the signed int type on 32 bit systems.
 734  */
 735 static int Quantifier_is_next (REContext *REContext) {
 736   int pos, len;
 737   SEE_unicode_t lookahead[24];
 738   if (NEXT != '{') return 0;
 739   /*
 740    * Strict ECMA-262 says that '{' is NOT a Pattern character,
 741    * but Mozilla allows it
 742    */
 743 /*EXT:24*/
 744   //if (!SEE_COMPAT_JS(REContext->eng, >=, JS11)) return 1;
 745   len = LOOKAHEAD(lookahead, 24);
 746   pos = 1;
 747   while (pos < len && lookahead[pos] >= '0' && lookahead[pos] <= '9') pos++;
 748   if (pos < len && lookahead[pos] == ',') pos++;
 749   else if (pos < len && lookahead[pos] == '}') return pos > 1;
 750   else return 0;
 751   while (pos < len && lookahead[pos] >= '0' && lookahead[pos] <= '9') pos++;
 752   if (pos < len && lookahead[pos] == '}') pos++; else return 0;
 753   return 1;
 754 }
 755
 756
 757 static void Term_parse (REContext *REContext) {
 758   int min, max, greedy, pos;
 759   int oparen, cparen;
 760   int lookahead_len;
 761   SEE_unicode_t lookahead[2];
 762   /*
 763    * parse Assertion inline since it is a bit special
 764    * in terms of its lookahead
 765    */
 766   switch (NEXT) {
 767     case '\\':
 768       lookahead_len = LOOKAHEAD(lookahead, 2);
 769       if (lookahead_len > 1 && lookahead[1] == 'b') {
 770         SKIP; SKIP;
 771         CODE_ADD(OP_BRK);
 772         return;
 773       }
 774       if (lookahead_len > 1 && lookahead[1] == 'B') {
 775         SKIP; SKIP;
 776         CODE_ADD(OP_NBRK);
 777         return;
 778       }
 779       // some other kind of escape
 780       break;
 781     case '^':
 782       SKIP;
 783       CODE_ADD(OP_BOL);
 784       return;
 785     case '$':
 786       SKIP;
 787       CODE_ADD(OP_EOL);
 788       return;
 789     // Lookaheads forbidden by the Atom production
 790     case '*': case '+': case '?': case ')':
 791     case ']': case '{': case '}': case '|':
 792       SYNTAX_ERROR;
 793   }
 794   pos = CODE_POS;
 795   oparen = REContext->regex->ncaptures;
 796   Atom_parse(REContext);
 797   cparen = REContext->regex->ncaptures;
 798   /*
 799    * parse Quantifier inline to save my sanity
 800    */
 801   if (ATEOF) {
 802     min = max = 1;
 803   } else if (NEXT == '*') {
 804     SKIP; min = 0; max = INFINITY;
 805   } else if (NEXT == '+') {
 806     SKIP; min = 1; max = INFINITY;
 807   } else if (NEXT == '?') {
 808     SKIP; min = 0; max = 1;
 809   } else if (Quantifier_is_next(REContext)) {
 810     SKIP;
 811     min = Integer_parse(REContext);
 812     if (!ATEOF && NEXT == ',') {
 813       EXPECT(',');
 814       if (!ATEOF && NEXT == '}') max = INFINITY;
 815       else max = Integer_parse(REContext);
 816     } else max = min;
 817     EXPECT('}');
 818   } else {
 819     min = max = 1;
 820   }
 821   if (!ATEOF && NEXT == '?') {
 822     SKIP; greedy = 0;
 823   } else greedy = 1;
 824   if (min == max && !greedy) {
 825     /*
 826      * XXX should we warn that the greedy modifiers to
 827      * 'a{n,n}?' and 'a{n}?' are technically meaningless?
 828      * We speed up our code by using greedy mode anyway.
 829      */
 830     greedy = 1;
 831   }
 832   /* Don't allow stupid ranges, such as 'a{7,3}' */
 833   if (max != INFINITY && min > max) SYNTAX_ERROR;
 834   if (min == 1 && max == 1) return; /* a */
 835   if (max == 0) {
 836     /* a{0} */
 837     /* Undo! */
 838     CODE_POS = pos;
 839     return;
 840   }
 841   if (oparen != cparen) {
 842     /*
 843      * If the atom introduces capture parentheses,
 844      * then insert code to reset them before each
 845      * iteration.
 846      */
 847     CODE_INSERT(pos, 1 + 2*CODE_SZI);
 848     CODE_PATCH(pos, OP_UNDEF);
 849     CODE_PATCHI(pos + 1, oparen);
 850     CODE_PATCHI(pos + 1 + CODE_SZI, cparen);
 851   }
 852   /*
 853    * The following code generators all generate looping
 854    * matchers.  While every corresponding pattern could
 855    * be written as the generalised 'a{n,m}' (where m could
 856    * be INFINITY), some efficiency is gained by selecting cases
 857    * where general code that would be a no-op is not emitted.
 858    */
 859   if (min == max) {       /* a{m} */
 860     int p = pos, px;
 861     int insert = 1 + CODE_SZI;
 862     int c = REContext->regex->ncounters++;
 863     CODE_INSERT(pos, insert);
 864     CODE_PATCH(p, OP_ZERO); p++;    /* ZERO c; */
 865     CODE_PATCHI(p, c); p += CODE_SZI;
 866     px = p;         /* x: */
 867     ASSERT(p == pos + insert);    /*  (a) */
 868     CODE_ADD(OP_RNEXT);     /* RNEXT c,m,x */
 869     CODE_ADDI(c);
 870     CODE_ADDI(max);
 871     CODE_ADDA(px);
 872     return;
 873   }
 874   if (min == 0 && max == 1) {
 875     /* a? */
 876     int p = pos, p1, px;
 877     int insert = 1 + CODE_SZA;
 878     CODE_INSERT(pos, insert);
 879     CODE_PATCH(p, greedy ? OP_GF : OP_NF);  /* GF x */
 880     p++; p1 = p; p += CODE_SZA;
 881     ASSERT(p == pos + insert);    /* (a) */
 882     px = CODE_POS;        /* x: */
 883
 884     CODE_PATCHA(p1, px);
 885     return;
 886   }
 887   if (min == 0 && max == INFINITY) {
 888     /* a* */
 889     int p = pos, px, py, p1;
 890     int insert = 2 + CODE_SZA + CODE_SZI;
 891     int m = REContext->regex->nmarks++;
 892     CODE_INSERT(pos, insert);
 893     px = p;         /* x: GF y */
 894     CODE_PATCH(p, greedy ? OP_GF : OP_NF); p++;
 895     p1 = p; p += CODE_SZA;
 896     CODE_PATCH(p, OP_MARK); p++;    /* MARK m */
 897     CODE_PATCHI(p, m); p += CODE_SZI;
 898     ASSERT(p == pos + insert);    /* (a) */
 899     CODE_ADD(OP_FDIST);     /* FDIST m */
 900     CODE_ADDI(m);
 901     CODE_ADD(OP_GOTO);      /* GOTO x */
 902     CODE_ADDA(px);
 903     py = CODE_POS;        /* y: */
 904     CODE_PATCHA(p1, py);
 905     return;
 906   }
 907   {
 908     /* a{n,m} */
 909     int p = pos, px, py, p1;
 910     int insert = 3 + CODE_SZI * 2 + CODE_SZA;
 911     int c = REContext->regex->ncounters++;
 912     int k = REContext->regex->nmarks++;
 913     CODE_INSERT(pos, insert);
 914     CODE_PATCH(p, OP_ZERO); p++;    /* ZERO c */
 915     CODE_PATCHI(p, c); p += CODE_SZI;
 916     px = p;         /* x: GF y */
 917     CODE_PATCH(p, greedy ? OP_GF : OP_NF); p++;
 918     p1 = p; p += CODE_SZA;
 919     CODE_PATCH(p, OP_MARK); p++;    /* MARK k */
 920     CODE_PATCHI(p, k); p += CODE_SZI;
 921     ASSERT(p == pos + insert);    /* (a) */
 922     if (min) {
 923       CODE_ADD(OP_RDIST);   /* RDIST k,c,n */
 924       CODE_ADDI(k);
 925       CODE_ADDI(c);
 926       CODE_ADDI(min);
 927     } else {
 928       CODE_ADD(OP_FDIST);   /* FDIST k */
 929       CODE_ADDI(k);
 930     }
 931     if (max != INFINITY) {
 932       CODE_ADD(OP_RNEXT);   /* RNEXT c,m,x */
 933       CODE_ADDI(c);
 934       CODE_ADDI(max);
 935       CODE_ADDA(px);
 936     } else {
 937       CODE_ADD(OP_MNEXT);   /* MNEXT c,n,x */
 938       CODE_ADDI(c);
 939       CODE_ADDI(min);
 940       CODE_ADDA(px);
 941     }
 942     py = CODE_POS;        /* y: */
 943     if (min) {
 944       CODE_ADD(OP_REACH);   /* REACH c,n */
 945       CODE_ADDI(c);
 946       CODE_ADDI(min);
 947     }
 948     CODE_PATCHA(p1, py);
 949     return;
 950   }
 951 }
 952
 953
 954 // Parse a simple integer; used for repetition ranges
 955 static int Integer_parse (REContext *REContext) {
 956   int val;
 957   int hasdig = 0;
 958   val = 0;
 959   while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
 960     val = 10 * val + (NEXT-'0');
 961     hasdig = 1;
 962     SKIP;
 963   }
 964   if (!hasdig) SYNTAX_ERROR;
 965   return val;
 966 }
 967
 968
 969 /*
 970  * Atom:: la != ^ $     * + ?   )   ] { } |
 971  *  pattern character la != ^ $ \ . * + ? ( ) [ ] { } |
 972  *  .
 973  *  \ AtomEscape
 974  *  [ CharacterClass ]
 975  *  ( Disjunction )
 976  *  ( ?: Disjunction )
 977  *  ( ?= Disjunction )
 978  *  ( ?! Disjunction )
 979  */
 980 static void Atom_parse (REContext *REContext) {
 981   CharClass *c;
 982   int i;
 983   if (NEXT == '(') {
 984     SKIP;
 985     if (!ATEOF && NEXT == '?') {    /* (?... */
 986       SKIP;
 987       if (!ATEOF && NEXT == ':') {  /* (?:... */
 988         SKIP;
 989         Disjunction_parse(REContext);
 990       } else if (!ATEOF && (NEXT == '=' || NEXT == '!')) {
 991         int px, p1;     /* (?=... */
 992         int neg = (NEXT == '!');  /* (?!... */
 993         SKIP;
 994         CODE_ADD(neg ? OP_AF : OP_AS);  /* AS x */
 995         p1 = CODE_POS; CODE_ADDI(0);
 996         Disjunction_parse(REContext); /* (a) */
 997         CODE_ADD(OP_SUCCEED);   /* SUCCEED */
 998         px = CODE_POS;      /* x: */
 999         CODE_PATCHA(p1, px);
1000       } else SYNTAX_ERROR;
1001     } else {        /* (...) */
1002       i = REContext->regex->ncaptures++;
1003       CODE_ADD(OP_START);     /* START i */
1004       CODE_ADDI(i);
1005       Disjunction_parse(REContext); /* (a) */
1006       CODE_ADD(OP_END);     /* END i */
1007       CODE_ADDI(i);
1008     }
1009     EXPECT(')');
1010     return;
1011   }
1012   /*
1013    * All other atoms compile to simple character class matches
1014    * (or backreferences)
1015    */
1016   switch (NEXT) {
1017     case '\\':
1018       SKIP;
1019       if (ATEOF) SYNTAX_ERROR;
1020       if (NEXT >= '1' && NEXT <= '9') {
1021         i = 0;
1022         while (!ATEOF && (NEXT >= '0' && NEXT <= '9')) {
1023           i = 10 * i + NEXT - '0';
1024           SKIP;
1025         }
1026         CODE_ADD(OP_BACKREF);
1027         CODE_ADDI(i);
1028         if (i > REContext->regex->maxref) REContext->regex->maxref = i;
1029         return;
1030       }
1031       c = ClassEscape_parse(REContext);
1032       break;
1033     case '[':
1034       c = CharacterClass_parse(REContext);
1035       break;
1036     case '.':
1037       SKIP;
1038       c = CC_NEW();
1039       CC_ADDCHAR(c, 0x000a);
1040       CC_ADDCHAR(c, 0x000d);
1041       //CC_ADDCHAR(c, 0x2028);
1042       //CC_ADDCHAR(c, 0x2029);
1043       CC_INVERT(c);
1044       break;
1045     default:
1046       c = CC_NEW();
1047       CC_ADDCHAR(c, Canonicalize(REContext->regex, NEXT));
1048       SKIP;
1049       break;
1050   }
1051   i = CC_INTERN(c);
1052   CODE_ADD(OP_CHAR);
1053   CODE_ADDI(i);
1054 }
1055
1056
1057 static unsigned char HexDigit_parse (REContext *REContext) {
1058   SEE_unicode_t c;
1059   if (ATEOF) SYNTAX_ERROR;
1060   c = NEXT; SKIP;
1061   if (c >= '0' && c <= '9') return c-'0';
1062   if (c >= 'a' && c <= 'f') return c-'a'+10;
1063   if (c >= 'A' && c <= 'F') return c-'A'+10;
1064   SYNTAX_ERROR;
1065 }
1066
1067
1068 static CharClass *ClassEscape_parse (REContext *REContext) {
1069   CharClass *c;
1070   SEE_unicode_t ch, lookahead[3];
1071   int i;
1072   // EXPECT('\\'); // backslash already skipped
1073   c = CC_NEW();
1074   if (NEXT >= '0' && NEXT <= '9') {
1075     // \0oo - 3 digit octal escapes
1076 /*EXT:25*/
1077     if (/*SEE_COMPAT_JS(REContext->eng, >=, JS11) &&*/ NEXT == '0' && LOOKAHEAD(lookahead, 3) >= 2 &&
1078         lookahead[1] >= '0' && lookahead[1] < '8' && lookahead[2] >= '0' && lookahead[2] < '8') {
1079       ch = (lookahead[1] - '0') * 8 + (lookahead[2] - '0');
1080       CC_ADDCHAR(c, ch);
1081       SKIP;
1082       SKIP;
1083       SKIP;
1084       return c;
1085     }
1086     i = 0;
1087     while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
1088       i = 10 * i + NEXT - '0';
1089       SKIP;
1090     }
1091     /*
1092      * 15.10.2.1.9: "Using a backreference inside a ClassAtom
1093      * causes an error"
1094      */
1095     if (i != 0) SYNTAX_ERROR;
1096     CC_ADDCHAR(c, i);
1097     return c;
1098   }
1099   ch = NEXT;
1100   SKIP;
1101   switch (ch) {
1102     case 'b': CC_ADDCHAR(c, 0x0008); break;
1103     case 't': CC_ADDCHAR(c, 0x0009); break;
1104     case 'n': CC_ADDCHAR(c, 0x000a); break;
1105     case 'v': CC_ADDCHAR(c, 0x000b); break;
1106     case 'f': CC_ADDCHAR(c, 0x000c); break;
1107     case 'r': CC_ADDCHAR(c, 0x000d); break;
1108     case 'D':
1109     case 'd':
1110       CC_ADDRANGE(c, '0', '9');
1111       if (ch == 'D') CC_INVERT(c);
1112       break;
1113     case 'W':
1114     case 'w':
1115       CC_ADDRANGE(c, 'a', 'z');
1116       CC_ADDRANGE(c, 'A', 'Z');
1117       CC_ADDRANGE(c, '0', '9');
1118       CC_ADDCHAR(c, '_');
1119       if (ch == 'W') CC_INVERT(c);
1120       break;
1121     case 'S':
1122     case 's':
1123 #if WITH_UNICODE_TABLES
1124       for (i = 0; i < SEE_unicode_Zscodeslen; i++) CC_ADDCHAR(c, SEE_unicode_Zscodes[i]);
1125 #else
1126       CC_ADDCHAR(c, 0x0020);
1127 #endif
1128       if (ch == 'S') CC_INVERT(c);
1129       break;
1130     case 'c':
1131       if (ATEOF) SYNTAX_ERROR;
1132       ch = NEXT; SKIP;
1133       if (('a' <= ch && ch <= 'z') ||
1134           ('A' <= ch && ch <= 'Z')) CC_ADDCHAR(c, ch % 32);
1135       else SYNTAX_ERROR;
1136       break;
1137     case 'u':
1138     case 'x':
1139       i = (ch == 'x' ? 2 : 4);
1140       ch = 0;
1141       while (i--) ch = (ch << 4) | HexDigit_parse(REContext);
1142       CC_ADDCHAR(c, ch);
1143       break;
1144     default:
1145       CC_ADDCHAR(c, ch);
1146       break;
1147   }
1148   return c;
1149 }
1150
1151
1152 /*
1153  * ClassAtom :
1154  *  '\' ClassEscape
1155  *  anychar
1156  */
1157 static CharClass *ClassAtom_parse (REContext *REContext) {
1158   CharClass *c;
1159   if (ATEOF) SYNTAX_ERROR;
1160   if (NEXT == '\\') {
1161     SKIP;
1162     return ClassEscape_parse(REContext);
1163   }
1164   c = CC_NEW();
1165   CC_ADDCHAR(c, NEXT);
1166   SKIP;
1167   return c;
1168 }
1169
1170
1171 /*
1172  * Convert the CharClass into a canonicalised version. (SLOW!)
1173  * !!!Every character in the class has to be individually canonicalized!!!
1174  * !!!OUCH!!!OUCH!!!I!HATE!YOU!ALL!!!
1175  */
1176 static CharClass *CanonicalizeClass (REContext *REContext, CharClass *c) {
1177   CharClass *ccanon;
1178   CharClassRange *r;
1179   SEE_unicode_t ch, uch;
1180   if (cc_count(c) > (uint32_t)~0 / 2) {
1181     CC_INVERT(c);
1182     ccanon = CanonicalizeClass(REContext, c);
1183     CC_INVERT(ccanon);
1184     return ccanon;
1185   }
1186   /*
1187    * Evil hack: if the CharClass range includes ['A'-0xEFFFF], then
1188    * there is no need to canonicalize because every uppercase character
1189    * is already there.
1190    */
1191   for (r = c->ranges; r; r = r->next) if (r->lo <= 'A' && r->hi > 0xf0000) return c;
1192   ccanon = CC_NEW();
1193   for (r = c->ranges; r; r = r->next) {
1194     for (ch = r->lo; ch < r->hi; ch++) {
1195       uch = UNICODE_TOUPPER(ch);
1196       CC_ADDCHAR(ccanon, uch);
1197     }
1198   }
1199   return ccanon;
1200 }
1201
1202
1203 /*
1204  * CharacterClass ::
1205  *  '[' [^] ( ClassAtom | ClassAtom '-' ClassAtom ) * ']'
1206  */
1207 static CharClass *CharacterClass_parse (REContext *REContext) {
1208   CharClass *c = CC_NEW(), *a, *b;
1209   int invertflag = 0;
1210   EXPECT('[');
1211   if (!ATEOF && NEXT == '^') {
1212     invertflag = 1;
1213     SKIP;
1214   }
1215   while (!ATEOF && NEXT != ']') {
1216     a = ClassAtom_parse(REContext);
1217     if (!ATEOF && NEXT == '-') {
1218       SKIP;
1219       // Treat '-' literally if at end of ClassRanges 15.10.2.16
1220       if (!ATEOF && NEXT == ']') {
1221         CC_ADDCHAR(a, '-');
1222         goto out;
1223       }
1224       if (!cc_issingle(a)) SYNTAX_ERROR;
1225       b = ClassAtom_parse(REContext);
1226       if (!cc_issingle(b)) SYNTAX_ERROR;
1227       if (b->ranges->lo < a->ranges->lo) SYNTAX_ERROR;
1228       a->ranges->hi = b->ranges->hi;
1229       // free(b)
1230     }
1231 out:
1232     CC_ADDCC(c, a);
1233     // free(a)
1234   }
1235   //if (c->ranges == NULL) SYNTAX_ERROR;
1236   EXPECT(']');
1237   if (REContext->regex->flags & SEERX_FLAG_IGNORECASE) {
1238     c = CanonicalizeClass(REContext, c);
1239     // free(c); // i.e the old c
1240   }
1241   if (invertflag) CC_INVERT(c);
1242   return c;
1243 }
1244
1245
1246 ////////////////////////////////////////////////////////////////////////////////
1247 // P-code
1248 //
1249 #ifndef NDEBUG
1250 static int dprint_code (SEE_RegExpr *regex, int addr) {
1251   int i;
1252   const char *op="", *opc;
1253   unsigned char *code = regex->code;
1254   dprintf("0x%04x: ", addr);
1255   switch (code[addr]) {
1256     case OP_FAIL: dprintf("FAIL"); op = ""; break;
1257     case OP_SUCCEED: dprintf("SUCCEED"); op = ""; break;
1258     case OP_CHAR: dprintf("CHAR"); op = "i"; break;
1259     case OP_ZERO: dprintf("ZERO"); op = "i"; break;
1260     case OP_REACH: dprintf("REACH"); op = "ii"; break;
1261     case OP_NREACH: dprintf("NREACH"); op = "ii"; break;
1262     case OP_START: dprintf("START"); op = "i"; break;
1263     case OP_END: dprintf("END"); op = "i"; break;
1264     case OP_UNDEF: dprintf("UNDEF"); op = "ii"; break;
1265     case OP_MARK: dprintf("MARK"); op = "i"; break;
1266     case OP_FDIST: dprintf("FDIST"); op = "i"; break;
1267     case OP_RDIST: dprintf("RDIST"); op = "iii"; break;
1268     case OP_MNEXT: dprintf("MNEXT"); op = "iia"; break;
1269     case OP_RNEXT: dprintf("RNEXT"); op = "iia"; break;
1270     case OP_GOTO: dprintf("GOTO"); op = "a"; break;
1271     case OP_GS: dprintf("GS"); op = "a"; break;
1272     case OP_NS: dprintf("NS"); op = "a"; break;
1273     case OP_GF: dprintf("GF"); op = "a"; break;
1274     case OP_NF: dprintf("NF"); op = "a"; break;
1275     case OP_AS: dprintf("AS"); op = "a"; break;
1276     case OP_AF: dprintf("AF"); op = "a"; break;
1277     case OP_BOL: dprintf("BOL"); op = ""; break;
1278     case OP_EOL: dprintf("EOL"); op = ""; break;
1279     case OP_BRK: dprintf("BRK"); op = ""; break;
1280     case OP_NBRK: dprintf("NBRK"); op = ""; break;
1281     case OP_BACKREF: dprintf("BACKREF"); op = "i"; break;
1282     default: dprintf("*** %d", code[addr]); break;
1283   }
1284   addr++;
1285   for (opc = op; *opc; opc++) {
1286     if (opc != op) dprintf(",");
1287     dprintf(" ");
1288     switch (*opc) {
1289       case 'a':
1290         i = CODE_MAKEA(code, addr);
1291         dprintf("0x%04x", i);
1292         i = CODE_MAKEI(code, addr);
1293         dprintf(" [0x%04x]", i);
1294         addr += CODE_SZA;
1295         break;
1296       case 'i':
1297         i = CODE_MAKEI(code, addr); addr += CODE_SZI;
1298         dprintf("%d", i);
1299         break;
1300       case 'c':
1301         i = CODE_MAKEI(code, addr); addr += CODE_SZI;
1302         dprintf("%d=", i);
1303         if (i > regex->cclen) dprintf("**BAD**");
1304         else dprint_cc(regex->cc[i]);
1305         break;
1306     }
1307   }
1308   dprintf("\n");
1309   return addr;
1310 }
1311
1312
1313 static void dprint_regex (SEE_RegExpr *regex) {
1314   int i, addr;
1315   CharClassRange *r;
1316   dprintf("regex %p\n", regex);
1317   dprintf("\tncaptures = %d\n", regex->ncaptures);
1318   dprintf("\tcodelen = %d\n", regex->codelen);
1319   dprintf("\tcclen = %d\n", regex->cclen);
1320   dprintf("\tflags = 0x%x\n", regex->flags);
1321   dprintf("\tcc:\n");
1322   for (i = 0; i < regex->cclen; i++) {
1323     dprintf("\t\t%d = ", i);
1324     dprint_cc(regex->cc[i]);
1325     dprintf("\n\t\t  = { ");
1326     for (r = regex->cc[i]->ranges; r; r = r->next) dprintf("%x:%x ", r->lo, r->hi);
1327     dprintf("}\n");
1328   }
1329   dprintf("\tcode:\n");
1330   addr = 0;
1331   while (addr < regex->codelen) addr = dprint_code(regex, addr);
1332 }
1333 #endif
1334
1335
1336 ////////////////////////////////////////////////////////////////////////////////
1337 // pcode-execution
1338 //
1339 /* 15.10.2.8 */
1340 static SEE_unicode_t Canonicalize (SEE_RegExpr *regex, SEE_unicode_t ch) {
1341   if (regex->flags & SEERX_FLAG_IGNORECASE) return UNICODE_TOUPPER(ch);
1342   return ch;
1343 }
1344
1345
1346 static SEE_boolean_t pcode_run (SEE_RegExpr *regex, unsigned int addr, const char *text, int textLen, char *state) {
1347   SEE_boolean_t result;
1348   int i = 0, i2 = 0, i3 = 0;
1349   unsigned int a = 0;
1350   unsigned char op;
1351   SEE_unicode_t ch;
1352   Capture *capture;
1353   int *counter, *mark, statesz;
1354   unsigned int newaddr;
1355   char *newstate;
1356   //if (SEE_system.periodic) (*SEE_system.periodic)(eng);
1357   // Compute the offsets into the state structure
1358   statesz = 0;
1359   capture = (Capture *)state;
1360   statesz += regex->ncaptures * sizeof (Capture);
1361   counter = (int *)(state + statesz);
1362   statesz += regex->ncounters * sizeof (int);
1363   mark = (int *)(state + statesz);
1364   statesz += regex->nmarks * sizeof (int);
1365   SEE_ASSERT(eng, statesz == regex->statesz);
1366   newstate = calloc(statesz+1, sizeof(char));//SEE_STRING_ALLOCA(eng, char, statesz);
1367 #define index (capture[0].end)
1368   for (;;) {
1369     /* Catch bad branches */
1370     if (addr >= regex->codelen) {
1371       free(newstate);
1372       longjmp(regex->errJP, 666);
1373       //abort();
1374       //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1375     }
1376     /* Read the opcode and its arguments */
1377     op = regex->code[addr];
1378 #ifndef NDEBUG
1379     if (SEE_regex_debug) {
1380       int x;
1381       //TJString mys;
1382       //mys.stringclass = 0;
1383       //mys.eng = 0;
1384       //mys.flags = 0;
1385       dprintf("index=%d captures=[", index);
1386       for (x = 0; x < regex->ncaptures; x++) {
1387         if (x) dprintf(",");
1388         if (capture[x].start == -1)
1389         dprintf("undef");
1390         else if (capture[x].start+capture[x].end > textLen) {
1391           dprintf("bad<%x:%x>",
1392           capture[x].start, capture[x].end);
1393         } else {
1394           int end = capture[x].end;
1395           if (end == -1) end = index;
1396           /*
1397           mys.length = end-capture[x].start;
1398           mys.data = text+capture[x].start;
1399           dprintf("\"");
1400           dprints(&mys);
1401           dprintf("\"");
1402           */
1403           dprints(text+capture[x].start, end-capture[x].start);
1404           if (capture[x].end == -1) dprintf("+");
1405         }
1406       }
1407       dprintf("]");
1408       if (op == OP_ZERO || op == OP_REACH || op == OP_NREACH || op == OP_MNEXT || op == OP_RNEXT) {
1409         dprintf(" counter=[");
1410         for (x = 0; x < regex->ncounters; x++) {
1411           if (x) dprintf(",");
1412           dprintf("%d", counter[x]);
1413         }
1414         dprintf("]");
1415       }
1416       if (op == OP_MARK || op == OP_FDIST || op == OP_RDIST) {
1417         dprintf(" mark=[");
1418         for (x = 0; x < regex->nmarks; x++) {
1419           if (x) dprintf(",");
1420           dprintf("%d", mark[x]);
1421         }
1422         dprintf("]");
1423       }
1424       if (regex->code[addr] == OP_CHAR && index < textLen) dprintf(" ch='%c'", Canonicalize(regex, text[index]));
1425       dprintf("\n");
1426       (void)dprint_code(regex, addr);
1427     }
1428 #endif
1429     addr++;
1430     switch (op) {
1431       case OP_FAIL: case OP_SUCCEED: case OP_BOL:
1432       case OP_EOL: case OP_BRK: case OP_NBRK:
1433         break;
1434       case OP_CHAR: case OP_ZERO: case OP_START:
1435       case OP_END: case OP_MARK: case OP_FDIST:
1436       case OP_BACKREF:
1437         i = CODE_MAKEI(regex->code, addr);  /* integer arg */
1438         addr += CODE_SZI;
1439         break;
1440       case OP_REACH: case OP_NREACH: case OP_UNDEF:
1441         i = CODE_MAKEI(regex->code, addr);  /* integer arg */
1442         addr += CODE_SZI;
1443         i2 = CODE_MAKEI(regex->code, addr); /* integer arg */
1444         addr += CODE_SZI;
1445         break;
1446       case OP_RDIST:
1447         i = CODE_MAKEI(regex->code, addr);  /* integer arg */
1448         addr += CODE_SZI;
1449         i2 = CODE_MAKEI(regex->code, addr); /* integer arg */
1450         addr += CODE_SZI;
1451         i3 = CODE_MAKEI(regex->code, addr); /* integer arg */
1452         addr += CODE_SZI;
1453         break;
1454       case OP_MNEXT: case OP_RNEXT:
1455         i = CODE_MAKEI(regex->code, addr);  /* integer arg */
1456         addr += CODE_SZI;
1457         i2 = CODE_MAKEI(regex->code, addr); /* integer arg */
1458         addr += CODE_SZI;
1459         a = CODE_MAKEA(regex->code, addr);  /* address arg */
1460         addr += CODE_SZA;
1461         break;
1462       case OP_GOTO: case OP_GS: case OP_NS:
1463       case OP_GF: case OP_NF: case OP_AS:
1464       case OP_AF:
1465         a = CODE_MAKEA(regex->code, addr);  /* address arg */
1466         addr += CODE_SZA;
1467         break;
1468       default:
1469         /* error */
1470         //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1471         //abort();
1472         free(newstate);
1473         longjmp(regex->errJP, 666);
1474     }
1475
1476     switch (op) {
1477       case OP_FAIL:
1478         free(newstate);
1479         return 0;
1480       case OP_SUCCEED:
1481         free(newstate);
1482         return 1;
1483       /* succeed if current character matches CharClass. index++ */
1484       case OP_CHAR:
1485         if (index < textLen) {
1486           ch = text[index++];
1487           /* N.B. strings are UTF-16 encoded! */
1488           /*
1489           if ((ch & 0xfc00) == 0xd800 && index < textLen && (text[index] & 0xfc00) == 0xdc00)
1490             ch = (((ch & 0x3ff) << 10) | (text[index++] & 0x3ff))+0x10000;
1491           */
1492           ch = Canonicalize(regex, ch);
1493           if (!cc_contains(regex->cc[i], ch)) {
1494             free(newstate);
1495             return 0;
1496           }
1497         } else {
1498           free(newstate);
1499           return 0;
1500         }
1501         break;
1502       /* reset an iteration counter */
1503       case OP_ZERO: counter[i] = 0;
1504         break;
1505       /* fail if we havent reached a particular count */
1506       case OP_REACH:
1507         if (counter[i] < i2) {
1508           free(newstate);
1509           return 0;
1510         }
1511         break;
1512       /* fail if we reached a particular count */
1513       case OP_NREACH:
1514         if (counter[i] >= i2) {
1515           free(newstate);
1516           return 0;
1517         }
1518         break;
1519       /* start a capture group at current index */
1520       case OP_START:
1521         capture[i].start = index;
1522         capture[i].end = -1;
1523         break;
1524       /* finish a capture group at current index */
1525       case OP_END:
1526         capture[i].end = index;
1527         break;
1528       /* reset the given captures - usually done at a loop start */
1529       case OP_UNDEF:
1530         while (i < i2) {
1531           capture[i].start = -1;
1532           capture[i].end = -1;
1533           i++;
1534         }
1535         break;
1536       /* Set a mark to the current index */
1537       case OP_MARK:
1538         mark[i] = index;
1539         break;
1540       /* fail if we haven't advanced past the mark */
1541       case OP_FDIST:
1542         if (mark[i] == index) {
1543           free(newstate);
1544           return 0;
1545         }
1546         break;
1547       /* fail if haven't advanced past mark AND counter has reached a limit */
1548       case OP_RDIST:
1549         if (mark[i] == index && counter[i2] >= i3) {
1550           free(newstate);
1551           return 0;
1552         }
1553         break;
1554       /* increment counter if it is less than n. always branch */
1555       case OP_MNEXT:
1556         if (counter[i] < i2) counter[i]++;
1557         addr = a;
1558         break;
1559       /* increment counter. if it is less than n, then branch */
1560       case OP_RNEXT:
1561         counter[i]++;
1562         if (counter[i] < i2) addr = a;
1563         break;
1564       case OP_GOTO:
1565         addr = a;
1566         break;
1567       /* operations that push state and branch for backtracking */
1568       case OP_GS: /* greedy success */
1569       case OP_NS: /* non-greedy success */
1570       case OP_GF: /* greedy fail */
1571       case OP_NF: /* greedy fail */
1572       case OP_AS: /* assert success */
1573       case OP_AF: /* assert fail */
1574         newaddr = (op == OP_NS || op == OP_NF) ? a : addr;
1575         memcpy(newstate, state, statesz);
1576         result = pcode_run(regex, newaddr, text, textLen, newstate);
1577         if (result) {
1578           if (op == OP_GF || op == OP_NF) {
1579             memcpy(state, newstate, statesz);
1580             free(newstate);
1581             return 1;
1582           } else if (op == OP_AF) {
1583             free(newstate);
1584             return 0;
1585           } else if (op == OP_AS) {
1586             int index_save = index;
1587             memcpy(state, newstate, statesz);
1588             index = index_save;
1589             addr = a;
1590           } else if (op == OP_GS) {
1591             addr = a;
1592           }
1593         } else {
1594           if (op == OP_GS || op == OP_NS || op == OP_AS) {
1595             free(newstate);
1596             return 0;
1597           } else if (op == OP_GF || op == OP_AF) addr = a;
1598         }
1599         break;
1600       /* succeed if we are at the beginning of a line */
1601       /* See 15.10.2.6 */
1602       case OP_BOL: /* ^ */
1603         if (index == 0) ; /* succeed */
1604         else if ((regex->flags & SEERX_FLAG_MULTILINE) == 0) { free(newstate); return 0; }
1605         else if (text[index-1] == 0x000a || /*LF*/
1606                  text[index-1] == 0x000d) // || /*CR*/
1607                  //text[index-1] == 0x2028 || /*LS*/
1608                  //text[index-1] == 0x2029) /*PS*/
1609             ; /* succeed */
1610         else { free(newstate); return 0; }
1611         break;
1612       /* succeed if we are at the end of a line */
1613       case OP_EOL: /* $ */
1614         if (index == textLen) ; /* succeed */
1615         else if ((regex->flags & SEERX_FLAG_MULTILINE) == 0) { free(newstate); return 0; }
1616         else if (text[index] == 0x000a || /*LF*/
1617                  text[index] == 0x000d) // || /*CR*/
1618                  //text[index] == 0x2028 || /*LS*/
1619                  //text[index] == 0x2029) /*PS*/
1620             ; /* succeed */
1621         else { free(newstate); return 0; }
1622         break;
1623 #define IsWordChar(e) ((e) >= 0 && (e) < textLen && ( \
1624          (text[e] >= 'a' && text[e] <= 'z') \
1625       || (text[e] >= 'A' && text[e] <= 'Z') \
1626       || (text[e] >= '0' && text[e] <= '9') \
1627       || text[e] == '_'))
1628       /* succeed if we are at a word break */
1629       case OP_BRK:
1630       case OP_NBRK: {
1631         SEE_boolean_t a, b;
1632         a = IsWordChar(index - 1);
1633         b = IsWordChar(index);
1634         if (op == OP_BRK) {
1635           if (a == b) { free(newstate); return 0; }
1636         } else {
1637           if (a != b) { free(newstate); return 0; }
1638         }
1639         } break;
1640       /* succeed if we match a backreference */
1641       case OP_BACKREF:
1642         if (!SEERX_IS_CAPTURE_UNDEFINED(capture[i])) {
1643           int x, len, br;
1644           br = capture[i].start;
1645           len = capture[i].end - br;
1646           if (len+index > textLen) { free(newstate); return 0; }
1647           for (x = 0; x < len; x++)
1648             if (Canonicalize(regex, text[br+x]) != Canonicalize(regex, text[index+x])) {
1649               free(newstate);
1650               return 0;
1651             }
1652           index += len;
1653         }
1654         break;
1655       /* catch unexpected instructions */
1656       default:
1657         //SEE_error_throw_string(eng, eng->Error, STR(internal_error));
1658         //abort();
1659         free(newstate);
1660         longjmp(regex->errJP, 666);
1661       }
1662   }
1663 }
1664 #undef index
1665
1666
1667 /*
1668  * Executes the regex on the text beginning at index.
1669  * Returns true of a match was successful.
1670  */
1671 int SEE_regex_match (SEE_RegExpr *regex, const char *text, int textLen, int index, Capture *capture_ret) {
1672   int i, success;
1673   if (textLen < 0) textLen = text?strlen(text):0;
1674   char *state = calloc(regex->statesz+1, sizeof(char));//SEE_STRING_ALLOCA(eng, char, regex->statesz);
1675   Capture *capture = (Capture *)state;
1676 #ifndef NDEBUG
1677   memset(state, 0xd0, regex->statesz); /* catch bugs */
1678 #endif
1679   capture[0].start = index;
1680   capture[0].end = index;
1681   for (i = 1; i < regex->ncaptures; i++) {
1682     capture[i].start = -1;
1683     capture[i].end = -1;
1684   }
1685   if (setjmp(regex->errJP)) {
1686     free(state);
1687     return 0;
1688   }
1689   success = pcode_run(regex, 0, text, textLen, state);
1690 #ifndef NDEBUG
1691   if (SEE_regex_debug) dprintf(". %s\n", success ? "success" : "failure");
1692 #endif
1693   if (success && capture_ret) memcpy(capture_ret, capture, regex->ncaptures*sizeof(Capture));
1694   free(state);
1695   return success;
1696 }
1697
1698
1699 /*------------------------------------------------------------
1700  * optimizer
1701  */
1702 static void optimize_regex (SEE_RegExpr *regex) {
1703   /*
1704    * (nothing here yet)
1705    *
1706    * possible optimisations include branch short-cuts,
1707    * and compiling the p-code to native machine instructions.
1708    */
1709 }
1710
1711
1712 void SEE_regex_free (SEE_RegExpr *regex) {
1713   if (regex) {
1714     CharClassRange *r, *n;
1715     for (n = NULL, r = regex->crList; r; r = n) {
1716       n = r->prev;
1717       free(r);
1718     }
1719     CharClass *cr, *cn;
1720     for (cn = NULL, cr = regex->ccList; cr; cr = cn) {
1721       cn = cr->prev;
1722       free(cr);
1723     }
1724     SEE_grow_deinit(&regex->codegrow);
1725     SEE_grow_deinit(&regex->ccgrow);
1726     free(regex);
1727   }
1728 }
1729
1730
1731 #ifdef __cplusplus
1732 }
1733 #endif