src/hsregexp.c

   1 /*
   2  * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
   3  *
   4  *  Copyright (c) 1986 by University of Toronto.
   5  *  Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *  Permission is granted to anyone to use this software for any
   8  *  purpose on any computer system, and to redistribute it freely,
   9  *  subject to the following restrictions:
  10  *
  11  *  1. The author is not responsible for the consequences of use of
  12  *    this software, no matter how awful, even if they arise
  13  *    from defects in it.
  14  *
  15  *  2. The origin of this software must not be misrepresented, either
  16  *    by explicit claim or by omission.
  17  *
  18  *  3. Altered versions must be plainly marked as such, and must not
  19  *    be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
  47  *
  48  * Beware that some of this code is subtly aware of the way operator
  49  * precedence is structured in regular expressions.  Serious changes in
  50  * regular-expression syntax might require a total rethink.
  51  */
  52 #include <ctype.h>
  53 #include <stdio.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56
  57 #include "hsregexp.h"
  58
  59
  60 #define MAYBE_UNUSED  __attribute__((unused))
  61
  62 /**
  63  * Converts the given unicode codepoint (0 - 0xffff) to utf-8
  64  * and stores the result at 'p'.
  65  *
  66  * Returns the number of utf-8 characters (1-3).
  67  */
  68 #ifdef HSRX_UTF8
  69 static int utf8_fromunicode (char *p, unsigned short uc) MAYBE_UNUSED;
  70
  71 /**
  72  * Returns the length of the utf-8 sequence starting with 'c'.
  73  *
  74  * Returns 1-4, or -1 if this is not a valid start byte.
  75  *
  76  * Note that charlen=4 is not supported by the rest of the API.
  77  */
  78 static int utf8_charlen (int c) MAYBE_UNUSED;
  79
  80 /**
  81  * Returns the number of characters in the utf-8
  82  * string of the given byte length.
  83  *
  84  * Any bytes which are not part of an valid utf-8
  85  * sequence are treated as individual characters.
  86  *
  87  * The string *must* be null terminated.
  88  *
  89  * Does not support unicode code points > \uffff
  90  */
  91 static int utf8_strlen (const char *str, int bytelen) MAYBE_UNUSED;
  92
  93 /**
  94  * Returns the byte index of the given character in the utf-8 string.
  95  *
  96  * The string *must* be null terminated.
  97  *
  98  * This will return the byte length of a utf-8 string
  99  * if given the char length.
 100  */
 101 static int utf8_index (const char *str, int charindex) MAYBE_UNUSED;
 102
 103 /**
 104  * Returns the unicode codepoint corresponding to the
 105  * utf-8 sequence 'str'.
 106  *
 107  * Stores the result in *uc and returns the number of bytes
 108  * consumed.
 109  *
 110  * If 'str' is null terminated, then an invalid utf-8 sequence
 111  * at the end of the string will be returned as individual bytes.
 112  *
 113  * If it is not null terminated, the length *must* be checked first.
 114  *
 115  * Does not support unicode code points > \uffff
 116  */
 117 static int utf8_tounicode (const char *str, int *uc) MAYBE_UNUSED;
 118
 119 /**
 120  * Returns the number of bytes before 'str' that the previous
 121  * utf-8 character sequence starts (which may be the middle of a sequence).
 122  *
 123  * Looks back at most 'len' bytes backwards, which must be > 0.
 124  * If no start char is found, returns -len
 125  */
 126 static int utf8_prev_len (const char *str, int len) MAYBE_UNUSED;
 127
 128 /**
 129  * Returns the upper-case variant of the given unicode codepoint.
 130  *
 131  * Does not support unicode code points > \uffff
 132  */
 133 static int utf8_upper (int uc) MAYBE_UNUSED;
 134
 135 /**
 136  * Returns the lower-case variant of the given unicode codepoint.
 137  *
 138  * NOTE: Use utf8_upper() in preference for case-insensitive matching.
 139  *
 140  * Does not support unicode code points > \uffff
 141  */
 142 static int utf8_lower (int uc) MAYBE_UNUSED;
 143
 144
 145 static int utf8_charequal (const char *s1, const char *s2) MAYBE_UNUSED;
 146
 147
 148 /**
 149  * UTF-8 utility functions
 150  *
 151  * (c) 2010 Steve Bennett <steveb@workware.net.au>
 152  *
 153  * See LICENCE for licence details.
 154  */
 155 /* This one is always implemented */
 156 static int utf8_fromunicode (char *p, unsigned short uc) {
 157   if (uc <= 0x7f) { *p = uc; return 1; }
 158   if (uc <= 0x7ff) { *p++ = 0xc0 | ((uc & 0x7c0) >> 6); *p = 0x80 | (uc & 0x3f); return 2; }
 159   *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3;
 160 }
 161
 162
 163 static int utf8_charlen (int c) {
 164   if ((c&0x80) == 0) return 1;
 165   if ((c&0xe0) == 0xc0) return 2;
 166   if ((c&0xf0) == 0xe0) return 3;
 167   if ((c&0xf8) == 0xf0) return 4;
 168   /* invalid sequence */
 169   return -1;
 170 }
 171
 172
 173 static int utf8_strlen (const char *str, int bytelen) {
 174   int charlen = 0;
 175   //
 176   if (bytelen < 0) bytelen = strlen(str);
 177   while (bytelen) {
 178     int c;
 179     int l = utf8_tounicode(str, &c);
 180     ++charlen;
 181     str += l;
 182     bytelen -= l;
 183   }
 184   return charlen;
 185 }
 186
 187
 188 static int utf8_index (const char *str, int index) {
 189   const char *s = str;
 190   //
 191   while (index--) {
 192     int c;
 193     //
 194     s += utf8_tounicode(s, &c);
 195   }
 196   return s-str;
 197 }
 198
 199
 200 static int utf8_charequal (const char *s1, const char *s2) {
 201   int c1, c2;
 202   //
 203   utf8_tounicode(s1, &c1);
 204   utf8_tounicode(s2, &c2);
 205   return c1==c2;
 206 }
 207
 208
 209 static int utf8_prev_len (const char *str, int len) {
 210   int n = 1;
 211   //
 212   //assert(len > 0);
 213   /* look up to len chars backward for a start-of-char byte */
 214   while (--len) {
 215     if ((str[-n] & 0x80) == 0) break; /* start of a 1-byte char */
 216     if ((str[-n] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
 217     ++n;
 218   }
 219   return n;
 220 }
 221
 222
 223 static int utf8_tounicode (const char *str, int *uc) {
 224   unsigned const char *s = (unsigned const char *)str;
 225   //
 226   if (s[0] < 0xc0) { *uc = s[0]; return 1; }
 227   if (s[0] < 0xe0) {
 228     if ((s[1] & 0xc0) == 0x80) { *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); return 2; }
 229   } else if (s[0] < 0xf0) {
 230     if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
 231       *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
 232       return 3;
 233     }
 234   }
 235   /* invalid sequence, so just return the byte */
 236   *uc = *s;
 237   return 1;
 238 }
 239
 240
 241 struct casemap {
 242   unsigned short code; /* code point */
 243   signed char lowerdelta; /* add for lowercase, or if -128 use the ext table */
 244   signed char upperdelta; /* add for uppercase, or offset into the ext table */
 245 };
 246
 247 /* extended table for codepoints where |delta| > 127 */
 248 struct caseextmap {
 249     unsigned short lower;
 250     unsigned short upper;
 251 };
 252
 253 /* generated mapping tables */
 254 #include "hsregexp_unicode_mapping.c"
 255
 256 #define NUMCASEMAP sizeof(unicode_case_mapping) / sizeof(*unicode_case_mapping)
 257
 258 static inline int cmp_casemap (const void *key, const void *cm) {
 259   return *(int *)key-(int)((const struct casemap *)cm)->code;
 260 }
 261
 262
 263 static int utf8_map_case (int uc, int upper) {
 264   const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
 265   if (cm) {
 266     if (cm->lowerdelta == -128) {
 267       uc = upper?unicode_extmap[cm->upperdelta].upper:unicode_extmap[cm->upperdelta].lower;
 268     } else {
 269       uc += upper?cm->upperdelta:cm->lowerdelta;
 270     }
 271   }
 272   return uc;
 273 }
 274
 275
 276 int utf8_upper (int uc) {
 277   if (isascii(uc)) return toupper(uc);
 278   return utf8_map_case(uc, 1);
 279 }
 280
 281
 282 int utf8_lower (int uc) {
 283   if (isascii(uc)) return tolower(uc);
 284   return utf8_map_case(uc, 0);
 285 }
 286
 287 #else
 288 /* No utf-8 support. 1 byte = 1 char */
 289 # define utf8_strlen(S, B) (B) < 0 ? strlen(S) : (B)
 290 # define utf8_tounicode(S, CP) (*(CP) = *(S), 1)
 291 # define utf8_upper(C) toupper(C)
 292 # define utf8_lower(C) tolower(C)
 293 # define utf8_index(C, I) (I)
 294 # define utf8_charlen(C) 1
 295 # define utf8_prev_len(S, L) 1
 296 #endif
 297
 298
 299 #define UCHAR(c)  ((unsigned char)(c))
 300
 301
 302 /*
 303  * Structure for regexp "program".  This is essentially a linear encoding
 304  * of a nondeterministic finite-state machine (aka syntax charts or
 305  * "railroad normal form" in parsing technology).  Each node is an opcode
 306  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 307  * all nodes except BRANCH implement concatenation; a "next" pointer with
 308  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
 309  * have one of the subtle syntax dependencies:  an individual BRANCH (as
 310  * opposed to a collection of them) is never concatenated with anything
 311  * because of operator precedence.)  The operand of some types of node is
 312  * a literal string; for others, it is a node leading into a sub-FSM.  In
 313  * particular, the operand of a BRANCH node is the first node of the branch.
 314  * (NB this is *not* a tree structure:  the tail of the branch connects
 315  * to the thing following the set of BRANCHes.)  The opcodes are:
 316  */
 317
 318 /* This *MUST* be less than (255-20)/2=117 */
 319 #define HSRX_MAX_PAREN  (100)
 320
 321 /* definition number   opnd?    meaning */
 322 #define END       0 /* no       end of program */
 323 #define BOL       1 /* no       match "" at beginning of line */
 324 #define EOL       2 /* no       match "" at end of line */
 325 #define ANY       3 /* no       match any one character */
 326 #define ANYOF     4 /* str      match any character in this string */
 327 #define ANYBUT    5 /* str      match any character not in this string */
 328 #define BRANCH    6 /* node     match this alternative, or the next... */
 329 #define BACK      7 /* no       match "", "next" ptr points backward */
 330 #define EXACTLY   8 /* str      match this string */
 331 #define NOTHING   9 /* no       match empty string */
 332 #define REP      10 /* max,min  match this (simple) thing [min,max] times */
 333 #define REPMIN   11 /* max,min  match this (simple) thing [min,max] times, mininal match */
 334 #define REPX     12 /* max,min  match this (complex) thing [min,max] times */
 335 #define REPXMIN  13 /* max,min  match this (complex) thing [min,max] times, minimal match */
 336 /* no 14 */
 337 #define WORDA    15 /* no       match "" at wordchar, where prev is nonword */
 338 #define WORDZ    16 /* no       match "" at nonwordchar, where prev is word */
 339 #define OPEN     20 /* no       mark this point in input as start of #n (OPEN+1 is number 1, etc) */
 340 #define CLOSE    (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
 341 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
 342
 343
 344 /*
 345  * The first byte of the regexp internal "program" is actually this magic
 346  * number; the start node begins in the second byte.
 347  */
 348 #define HSRX_MAGIC  (0xFADED00DUL)
 349
 350
 351 /*
 352  * Opcode notes:
 353  *
 354  * BRANCH The set of branches constituting a single choice are hooked
 355  *    together with their "next" pointers, since precedence prevents
 356  *    anything being concatenated to any individual branch.  The
 357  *    "next" pointer of the last BRANCH in a choice points to the
 358  *    thing following the whole choice.  This is also where the
 359  *    final "next" pointer of each individual branch points; each
 360  *    branch starts with the operand node of a BRANCH node.
 361  *
 362  * BACK   Normal "next" pointers all implicitly point forward; BACK
 363  *    exists to make loop structures possible.
 364  *
 365  * STAR,PLUS  '?', and complex '*' and '+', are implemented as circular
 366  *    BRANCH structures using BACK.  Simple cases (one character
 367  *    per match) are implemented with STAR and PLUS for speed
 368  *    and to minimize recursive plunges.
 369  *
 370  * OPEN,CLOSE ...are numbered at compile time.
 371  */
 372
 373
 374 /*
 375  * A node is one char of opcode followed by two chars of "next" pointer.
 376  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 377  * value is a positive offset from the opcode of the node containing it.
 378  * An operand, if any, simply follows the node.  (Note that much of the
 379  * code generation knows about this implicit relationship.)
 380  *
 381  * Using two bytes for the "next" pointer is vast overkill for most things,
 382  * but allows patterns to get big without disasters.
 383  */
 384 #define OP(preg, p)    (preg->program[p])
 385 #define NEXT(preg, p)  (preg->program[p+1])
 386 #define OPERAND(p)     ((p)+2)
 387
 388
 389 /*
 390  * Utility definitions.
 391  */
 392
 393 #define FAIL(R,M)  { (R)->err = (M); return (M); }
 394 #define ISMULT(c)  ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 395 #define META       "^$.[()|?{+*"
 396
 397
 398 /*
 399  * Flags to be passed up and down.
 400  */
 401 #define WORST      0 /* worst case */
 402 #define HASWIDTH  01  /* known never to match null string */
 403 #define SIMPLE    02  /* simple enough to be STAR/PLUS operand */
 404 #define SPSTART   04  /* starts with * or + */
 405
 406
 407 #define MAX_REP_COUNT  (1000000)
 408
 409
 410 /*
 411  * Forward declarations for hsrxCompile()'s friends.
 412  */
 413 static int reg (HSRegExp *preg, int paren /*parenthesized?*/, int *flagp);
 414 static int regpiece (HSRegExp *preg, int *flagp);
 415 static int regbranch (HSRegExp *preg, int *flagp);
 416 static int regatom (HSRegExp *preg, int *flagp);
 417 static int regnode (HSRegExp *preg, int op);
 418 static int regnext (HSRegExp *preg, int p);
 419 static void regc (HSRegExp *preg, int b);
 420 static int reginsert (HSRegExp *preg, int op, int size, int opnd);
 421 static void regtail_ (HSRegExp *preg, int p, int val, int line);
 422 static void regoptail (HSRegExp *preg, int p, int val);
 423 #define regtail(PREG, P, VAL)  regtail_(PREG, P, VAL, __LINE__)
 424
 425 static int reg_range_find (const int *string, int c);
 426 static const char *str_find (const char *string, int c, int nocase);
 427 static int prefix_cmp (const int *prog, int proglen, const char *string, int nocase);
 428
 429 #ifdef HSRX_DEBUG
 430 int hsrxNarrate = 0;
 431 void hsrxDump (HSRegExp *preg);
 432 static const char *regprop (int op);
 433 #endif
 434
 435
 436 /**
 437  * Returns the length of the null-terminated integer sequence.
 438  */
 439 static inline int str_int_len (const int *seq) {
 440   int n = 0;
 441   while (*seq++) ++n;
 442   return n;
 443 }
 444
 445
 446 /*
 447  - hsrxCompile - compile a regular expression into internal code
 448  *
 449  * We can't allocate space until we know how big the compiled form will be,
 450  * but we can't compile it (and thus know how big it is) until we've got a
 451  * place to put the code.  So we cheat:  we compile it twice, once with code
 452  * generation turned off and size counting turned on, and once "for real".
 453  * This also means that we don't allocate space until we are sure that the
 454  * thing really will compile successfully, and we never have to move the
 455  * code and thus invalidate pointers into it.  (Note that it has to be in
 456  * one piece because free() must be able to free it all.)
 457  *
 458  * Beware that the optimization-preparation code in here knows about some
 459  * of the structure of the compiled regexp.
 460  */
 461 int hsrxCompile (HSRegExp *preg, const char *exp, int cflags) {
 462   int scan, longest, flags;
 463   unsigned int len;
 464   //
 465 #ifdef HSRX_DEBUG
 466   fprintf(stderr, "Compiling: '%s'\n", exp);
 467 #endif
 468   memset(preg, 0, sizeof(*preg));
 469   if (exp == NULL) FAIL(preg, HSRX_ERR_NULL_ARGUMENT);
 470   /* first pass: determine size, legality */
 471   preg->cflags = cflags;
 472   preg->regparse = exp;
 473   /* XXX: for now, start unallocated */
 474   preg->program = NULL;
 475   preg->proglen = 0;
 476 #if 1
 477   /* allocate space */
 478   preg->proglen = (strlen(exp)+1)*5;
 479   preg->program = malloc(preg->proglen*sizeof(int));
 480   if (preg->program == NULL) FAIL(preg, HSRX_ERR_NOMEM);
 481 #endif
 482   /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
 483   regc(preg, HSRX_MAGIC);
 484   if (reg(preg, 0, &flags) == 0) return preg->err;
 485   /* small enough for pointer-storage convention? */
 486   if (preg->re_nsub >= HSRX_MAX_PAREN) FAIL(preg, HSRX_ERR_TOO_BIG); /* probably could be 65535L */
 487   /* dig out information for optimizations. */
 488   preg->regstart = 0; /* worst-case defaults */
 489   preg->reganch = 0;
 490   preg->regmust = 0;
 491   preg->regmlen = 0;
 492   scan = 1; /* first BRANCH */
 493   if (OP(preg, regnext(preg, scan)) == END) {
 494     /* only one top-level choice */
 495     scan = OPERAND(scan);
 496     /* starting-point info */
 497     if (OP(preg, scan) == EXACTLY) preg->regstart = preg->program[OPERAND(scan)];
 498     else if (OP(preg, scan) == BOL) ++preg->reganch;
 499     /*
 500      * If there's something expensive in the r.e., find the
 501      * longest literal string that must appear and make it the
 502      * regmust.  Resolve ties in favor of later strings, since
 503      * the regstart check works with the beginning of the r.e.
 504      * and avoiding duplication strengthens checking.  Not a
 505      * strong reason, but sufficient in the absence of others.
 506      */
 507     if (flags&SPSTART) {
 508       longest = 0;
 509       len = 0;
 510       for (; scan != 0; scan = regnext(preg, scan)) {
 511         if (OP(preg, scan) == EXACTLY) {
 512           int plen = str_int_len(preg->program+OPERAND(scan));
 513           if (plen >= len) {
 514             longest = OPERAND(scan);
 515             len = plen;
 516           }
 517         }
 518       }
 519       preg->regmust = longest;
 520       preg->regmlen = len;
 521     }
 522   }
 523 #ifdef HSRX_DEBUG
 524   hsrxDump(preg);
 525 #endif
 526   return 0;
 527 }
 528
 529
 530 /*
 531  - reg - regular expression, i.e. main body or parenthesized thing
 532  *
 533  * Caller must absorb opening parenthesis.
 534  *
 535  * Combining parenthesis handling with the base level of regular expression
 536  * is a trifle forced, but the need to tie the tails of the branches to what
 537  * follows makes it hard to avoid.
 538  */
 539 static int reg (HSRegExp *preg, int paren/*parenthesized?*/, int *flagp) {
 540   int ret, br, ender, flags;
 541   int parno = 0;
 542   //
 543   *flagp = HASWIDTH; /* tentatively */
 544   /* make an OPEN node, if parenthesized */
 545   if (paren) {
 546     parno = ++preg->re_nsub;
 547     ret = regnode(preg, OPEN+parno);
 548   } else {
 549     ret = 0;
 550   }
 551   /* pick up the branches, linking them together */
 552   br = regbranch(preg, &flags);
 553   if (br == 0) return 0;
 554   if (ret != 0) {
 555     regtail(preg, ret, br); /* OPEN -> first */
 556   } else {
 557     ret = br;
 558   }
 559   if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 560   *flagp |= flags&SPSTART;
 561   while (*preg->regparse == '|') {
 562     ++preg->regparse;
 563     br = regbranch(preg, &flags);
 564     if (br == 0) return 0;
 565     regtail(preg, ret, br); /* BRANCH -> BRANCH */
 566     if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 567     *flagp |= flags&SPSTART;
 568   }
 569   /* make a closing node, and hook it on the end */
 570   ender = regnode(preg, (paren)?CLOSE+parno:END);
 571   regtail(preg, ret, ender);
 572   /* hook the tails of the branches to the closing node */
 573   for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender);
 574   /* check for proper termination */
 575   if (paren && *preg->regparse++ != ')') {
 576     preg->err = HSRX_ERR_UNMATCHED_PAREN;
 577     return 0;
 578   }
 579   if (!paren && *preg->regparse != '\0') {
 580     if (*preg->regparse == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 581     preg->err = HSRX_ERR_JUNK_ON_END;
 582     return 0;
 583   }
 584   return ret;
 585 }
 586
 587
 588 /*
 589  - regbranch - one alternative of an | operator
 590  *
 591  * Implements the concatenation operator.
 592  */
 593 static int regbranch (HSRegExp *preg, int *flagp) {
 594   int ret, chain, latest, flags;
 595   //
 596   *flagp = WORST; /* tentatively */
 597   ret = regnode(preg, BRANCH);
 598   chain = 0;
 599   while (*preg->regparse != '\0' && *preg->regparse != ')' && *preg->regparse != '|') {
 600     latest = regpiece(preg, &flags);
 601     if (latest == 0) return 0;
 602     *flagp |= flags&HASWIDTH;
 603     if (chain == 0) {
 604       /* first piece */
 605       *flagp |= flags&SPSTART;
 606     } else {
 607       regtail(preg, chain, latest);
 608     }
 609     chain = latest;
 610   }
 611   if (chain == 0) regnode(preg, NOTHING); /* loop ran zero times */
 612   return ret;
 613 }
 614
 615
 616 /*
 617  - regpiece - something followed by possible [*+?]
 618  *
 619  * Note that the branching code sequences used for ? and the general cases
 620  * of * and + are somewhat optimized:  they use the same NOTHING node as
 621  * both the endmarker for their branch list and the body of the last branch.
 622  * It might seem that this node could be dispensed with entirely, but the
 623  * endmarker role is not redundant.
 624  */
 625 static int regpiece (HSRegExp *preg, int *flagp) {
 626   char op;
 627   int ret, next, flags, min, max;
 628   int chain = 0;
 629   //
 630   ret = regatom(preg, &flags);
 631   if (ret == 0) return 0;
 632   op = *preg->regparse;
 633   if (!ISMULT(op)) { *flagp = flags; return(ret); }
 634   if (!(flags&HASWIDTH) && op != '?') { preg->err = HSRX_ERR_OPERAND_COULD_BE_EMPTY; return 0; }
 635   /* handle braces (counted repetition) by expansion */
 636   if (op == '{') {
 637     char *end;
 638     //
 639     min = strtoul(preg->regparse+1, &end, 10);
 640     if (end == preg->regparse+1) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 641     if (*end == '}') {
 642       max = min;
 643     } else {
 644       preg->regparse = end;
 645       max = strtoul(preg->regparse+1, &end, 10);
 646       if (*end != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
 647     }
 648     if (end == preg->regparse+1) {
 649       max = MAX_REP_COUNT;
 650     } else if (max < min || max >= 100) {
 651       preg->err = HSRX_ERR_BAD_COUNT;
 652       return 0;
 653     }
 654     if (min >= 100) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 655     preg->regparse = strchr(preg->regparse, '}');
 656   } else {
 657     min = (op == '+');
 658     max = (op == '?' ? 1 : MAX_REP_COUNT);
 659   }
 660   if (preg->regparse[1] == '?') {
 661     ++preg->regparse;
 662     next = reginsert(preg, flags&SIMPLE?REPMIN:REPXMIN, 5, ret);
 663   } else {
 664     next = reginsert(preg, flags&SIMPLE?REP:REPX, 5, ret);
 665   }
 666   preg->program[ret+2] = max;
 667   preg->program[ret+3] = min;
 668   preg->program[ret+4] = 0;
 669   *flagp = (min)?(WORST|HASWIDTH):(WORST|SPSTART);
 670   if (!(flags & SIMPLE)) {
 671     int back = regnode(preg, BACK);
 672     regtail(preg, back, ret);
 673     regtail(preg, next, back);
 674   }
 675   ++preg->regparse;
 676   if (ISMULT(*preg->regparse)) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
 677   return chain?chain:ret;
 678 }
 679
 680
 681 /**
 682  * Add all characters in the inclusive range between lower and upper.
 683  *
 684  * Handles a swapped range (upper < lower).
 685  */
 686 static void reg_addrange (HSRegExp *preg, int lower, int upper) {
 687   if (lower > upper) reg_addrange(preg, upper, lower);
 688   /* add a range as length, start */
 689   regc(preg, upper-lower+1);
 690   regc(preg, lower);
 691 }
 692
 693
 694 /**
 695  * Add a null-terminated literal string as a set of ranges.
 696  */
 697 static void reg_addrange_str (HSRegExp *preg, const char *str) {
 698   while (*str) { reg_addrange(preg, *str, *str); ++str; }
 699 }
 700
 701
 702 /**
 703  * Extracts the next unicode char from utf8.
 704  *
 705  * If 'upper' is set, converts the char to uppercase.
 706  */
 707 static inline int reg_utf8_tounicode_case (const char *s, int *uc, int upper) {
 708 #ifdef HSRX_UTF8
 709   int l = utf8_tounicode(s, uc);
 710   if (upper) *uc = utf8_upper(*uc);
 711   return l;
 712 #else
 713   if (upper) *uc = toupper(*s); else *uc = *s;
 714   return 1;
 715 #endif
 716 }
 717
 718
 719
 720
 721 /**
 722  * Converts a hex digit to decimal.
 723  *
 724  * Returns -1 for an invalid hex digit.
 725  */
 726 static inline int hexdigitval (int c) {
 727   if (c >= '0' && c <= '9') return c-'0';
 728   if (c >= 'a' && c <= 'f') return c-'a'+10;
 729   if (c >= 'A' && c <= 'F') return c-'A'+10;
 730   return -1;
 731 }
 732
 733
 734 /**
 735  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 736  *
 737  * Returns the number of hex digits parsed.
 738  * If there are no hex digits, returns 0 and stores nothing.
 739  */
 740 static int parse_hex (const char *s, int n, int *uc) {
 741   int k, val = 0;
 742   //
 743   for (k = 0; k < n; ++k) {
 744     int c = hexdigitval(*s++);
 745     if (c == -1) break;
 746     val = (val<<4)|c;
 747   }
 748   if (k) *uc = val;
 749   return k;
 750 }
 751
 752
 753 /**
 754  * Call for chars after a backlash to decode the escape sequence.
 755  *
 756  * Stores the result in *ch.
 757  *
 758  * Returns the number of bytes consumed.
 759  */
 760 static int reg_decode_escape (const char *s, int *ch) {
 761   int n;
 762   const char *s0 = s;
 763   //
 764   *ch = *s++;
 765   switch (*ch) {
 766     case 'a': *ch = '\a'; break;
 767     case 'b': *ch = '\b'; break;
 768     case 'e': *ch = 27; break;
 769     case 'f': *ch = '\f'; break;
 770     case 'n': *ch = '\n'; break;
 771     case 'r': *ch = '\r'; break;
 772     case 't': *ch = '\t'; break;
 773     case 'v': *ch = '\v'; break;
 774     case 'u':
 775       if ((n = parse_hex(s, 4, ch)) > 0) s += n;
 776       break;
 777     case 'x':
 778       if ((n = parse_hex(s, 2, ch)) > 0) s += n;
 779       break;
 780     case '\0':
 781       --s;
 782       *ch = '\\';
 783       break;
 784   }
 785   return s-s0;
 786 }
 787
 788
 789 /*
 790  - regatom - the lowest level
 791  *
 792  * Optimization:  gobbles an entire sequence of ordinary characters so that
 793  * it can turn them into a single node, which is smaller to store and
 794  * faster to run.  Backslashed characters are exceptions, each becoming a
 795  * separate node; the code is simpler that way and it's not worth fixing.
 796  */
 797 static int regatom (HSRegExp *preg, int *flagp) {
 798   int ret, flags, ch;
 799   int nocase = (preg->cflags & HSRX_ICASE);
 800   int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 801   //
 802   *flagp = WORST; /* tentatively */
 803   preg->regparse += n;
 804   switch (ch) {
 805     /* FIXME: these chars only have meaning at beg/end of pat? */
 806     case '^': ret = regnode(preg, BOL); break;
 807     case '$': ret = regnode(preg, EOL); break;
 808     case '.':
 809       ret = regnode(preg, ANY);
 810       *flagp |= HASWIDTH|SIMPLE;
 811       break;
 812     case '[': {
 813       const char *pattern = preg->regparse;
 814       //
 815       if (*pattern == '^') {
 816         /* complement of range */
 817         ret = regnode(preg, ANYBUT);
 818         ++pattern;
 819       } else {
 820         ret = regnode(preg, ANYOF);
 821       }
 822       /* special case: if the first char is ']' or '-', it is part of the set */
 823       if (*pattern == ']' || *pattern == '-') {
 824         reg_addrange(preg, *pattern, *pattern);
 825         ++pattern;
 826       }
 827       while (*pattern && *pattern != ']') {
 828         /* is this a range? a-z */
 829         int start, end;
 830         //
 831         pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 832         if (start == '\\') {
 833           pattern += reg_decode_escape(pattern, &start);
 834           if (start == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 835         }
 836         if (pattern[0] == '-' && pattern[1]) {
 837           /* skip '-' */
 838           pattern += utf8_tounicode(pattern, &end);
 839           pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 840           if (end == '\\') {
 841             pattern += reg_decode_escape(pattern, &end);
 842             if (end == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 843           }
 844           reg_addrange(preg, start, end);
 845           continue;
 846         }
 847         if (start == '[') {
 848           if (strncmp(pattern, ":alpha:]", 8) == 0) {
 849             if ((preg->cflags & HSRX_ICASE) == 0) reg_addrange(preg, 'a', 'z');
 850             reg_addrange(preg, 'A', 'Z');
 851             pattern += 8;
 852             continue;
 853           }
 854           if (strncmp(pattern, ":alnum:]", 8) == 0) {
 855             if ((preg->cflags & HSRX_ICASE) == 0) reg_addrange(preg, 'a', 'z');
 856             reg_addrange(preg, 'A', 'Z');
 857             reg_addrange(preg, '0', '9');
 858             pattern += 8;
 859             continue;
 860           }
 861           if (strncmp(pattern, ":space:]", 8) == 0) {
 862             reg_addrange_str(preg, " \t\r\n\f\v");
 863             pattern += 8;
 864             continue;
 865           }
 866         }
 867         /* Not a range, so just add the char */
 868         reg_addrange(preg, start, start);
 869       }
 870       regc(preg, '\0');
 871       if (*pattern) ++pattern;
 872       preg->regparse = pattern;
 873       *flagp |= HASWIDTH|SIMPLE;
 874       } break;
 875     case '(':
 876       ret = reg(preg, 1, &flags);
 877       if (ret == 0) return 0;
 878       *flagp |= flags&(HASWIDTH|SPSTART);
 879       break;
 880     case '\0':
 881     case '|':
 882     case ')':
 883       preg->err = HSRX_ERR_INTERNAL;
 884       return 0; /* supposed to be caught earlier */
 885     case '?':
 886     case '+':
 887     case '*':
 888     case '{':
 889       preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING;
 890       return 0;
 891     case '\\':
 892       switch (*preg->regparse++) {
 893         case '\0': preg->err = HSRX_ERR_TRAILING_BACKSLASH; return 0;
 894         case '<': case 'm': ret = regnode(preg, WORDA); break;
 895         case '>': case 'M': ret = regnode(preg, WORDZ); break;
 896         case 'd':
 897           ret = regnode(preg, ANYOF);
 898           reg_addrange(preg, '0', '9');
 899           regc(preg, '\0');
 900           *flagp |= HASWIDTH|SIMPLE;
 901           break;
 902         case 'w':
 903           ret = regnode(preg, ANYOF);
 904           if ((preg->cflags & HSRX_ICASE) == 0) reg_addrange(preg, 'a', 'z');
 905           reg_addrange(preg, 'A', 'Z');
 906           reg_addrange(preg, '0', '9');
 907           reg_addrange(preg, '_', '_');
 908           regc(preg, '\0');
 909           *flagp |= HASWIDTH|SIMPLE;
 910           break;
 911         case 's':
 912           ret = regnode(preg, ANYOF);
 913           reg_addrange_str(preg," \t\r\n\f\v");
 914           regc(preg, '\0');
 915           *flagp |= HASWIDTH|SIMPLE;
 916           break;
 917         /* FIXME: Someday handle \1, \2, ... */
 918         default:
 919           /* handle general quoted chars in exact-match routine */
 920           /* back up to include the backslash */
 921           --preg->regparse;
 922           goto de_fault;
 923       }
 924       break;
 925 de_fault:
 926     default: {
 927       /* encode a string of characters to be matched exactly */
 928       int added = 0;
 929       /* back up to pick up the first char of interest */
 930       preg->regparse -= n;
 931       ret = regnode(preg, EXACTLY);
 932       /* note that a META operator such as ? or * consumes the preceding char */
 933       /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
 934       /* until end of string or a META char is reached */
 935       while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 936         n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & HSRX_ICASE));
 937         if (ch == '\\' && preg->regparse[n]) {
 938           /* non-trailing backslash: is this a special escape, or a regular escape? */
 939           if (strchr("<>mMwds", preg->regparse[n])) {
 940             /* a special escape; all done with EXACTLY */
 941             break;
 942           }
 943           /* decode it; note that we add the length for the escape
 944            * sequence to the length for the backlash so we can skip
 945            * the entire sequence, or not as required
 946            */
 947           n += reg_decode_escape(preg->regparse+n, &ch);
 948           if (ch == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 949         }
 950         /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
 951         if (ISMULT(preg->regparse[n])) {
 952           /* yes, but do we already have some EXACTLY chars? */
 953           if (added) {
 954             /* yes, so return what we have and pick up the current char next time around */
 955             break;
 956           }
 957           /* no, so add this single char and finish */
 958           regc(preg, ch);
 959           ++added;
 960           preg->regparse += n;
 961           break;
 962         }
 963         /* no, so just add this char normally */
 964         regc(preg, ch);
 965         ++added;
 966         preg->regparse += n;
 967       }
 968       regc(preg, '\0');
 969       *flagp |= HASWIDTH;
 970       if (added == 1) *flagp |= SIMPLE;
 971       break;
 972     }
 973     break;
 974   }
 975   return ret;
 976 }
 977
 978
 979 static void reg_grow (HSRegExp *preg, int n) {
 980   if (preg->p+n >= preg->proglen) {
 981     preg->proglen = (preg->p+n)*2;
 982     preg->program = realloc(preg->program, preg->proglen*sizeof(int));
 983   }
 984 }
 985
 986
 987 /*
 988  - regnode - emit a node
 989  */
 990 /* location */
 991 static int regnode (HSRegExp *preg, int op) {
 992   reg_grow(preg, 2);
 993   preg->program[preg->p++] = op;
 994   preg->program[preg->p++] = 0;
 995   /* return the start of the node */
 996   return preg->p-2;
 997 }
 998
 999
1000 /*
1001  - regc - emit (if appropriate) a byte of code
1002  */
1003 static void regc (HSRegExp *preg, int b) {
1004   reg_grow(preg, 1);
1005   preg->program[preg->p++] = b;
1006 }
1007
1008
1009 /*
1010  - reginsert - insert an operator in front of already-emitted operand
1011  *
1012  * Means relocating the operand.
1013  * Returns the new location of the original operand.
1014  */
1015 static int reginsert (HSRegExp *preg, int op, int size, int opnd) {
1016   reg_grow(preg, size);
1017   /* move everything from opnd up */
1018   memmove(preg->program+opnd+size, preg->program+opnd, sizeof(int)*(preg->p-opnd));
1019   /* zero out the new space */
1020   memset(preg->program+opnd, 0, sizeof(int)*size);
1021   preg->program[opnd] = op;
1022   preg->p += size;
1023   return opnd+size;
1024 }
1025
1026
1027 /*
1028  - regtail - set the next-pointer at the end of a node chain
1029  */
1030 static void regtail_ (HSRegExp *preg, int p, int val, int line) {
1031   int scan, temp, offset;
1032   /* find last node */
1033   scan = p;
1034   for (;;) {
1035     temp = regnext(preg, scan);
1036     if (temp == 0) break;
1037     scan = temp;
1038   }
1039   offset = OP(preg, scan)==BACK?scan-val:val-scan;
1040   preg->program[scan+1] = offset;
1041 }
1042
1043
1044 /*
1045  - regoptail - regtail on operand of first argument; nop if operandless
1046  */
1047 static void regoptail (HSRegExp *preg, int p, int val) {
1048   /* "operandless" and "op != BRANCH" are synonymous in practice */
1049   if (p != 0 && OP(preg, p) == BRANCH) regtail(preg, OPERAND(p), val);
1050 }
1051
1052
1053 /*
1054  * hsrxExec and friends
1055  */
1056
1057 /*
1058  * forwards
1059  */
1060 static int regtry (HSRegExp *preg, const char *string);
1061 static int regmatch (HSRegExp *preg, int prog);
1062 static int regrepeat (HSRegExp *preg, int p, int max);
1063
1064
1065 /*
1066  - hsrxExec - match a regexp against a string
1067  */
1068 int hsrxExec (HSRegExp  *preg, const char *string, size_t nmatch, HSRxMatch pmatch[], int eflags) {
1069   const char *s;
1070   int scan;
1071   /* be paranoid... */
1072   if (preg == NULL || preg->program == NULL || string == NULL) return HSRX_ERR_NULL_ARGUMENT;
1073   /* check validity of program */
1074   if (*preg->program != HSRX_MAGIC) return HSRX_ERR_CORRUPTED;
1075 #ifdef HSRX_DEBUG
1076   fprintf(stderr, "hsrxExec: %s\n", string);
1077   hsrxDump(preg);
1078 #endif
1079   preg->eflags = eflags;
1080   preg->pmatch = pmatch;
1081   preg->nmatch = nmatch;
1082   preg->start = string; /* all offsets are computed from here */
1083   /* must clear out the embedded repeat counts */
1084   for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1085     switch (OP(preg, scan)) {
1086     case REP:
1087     case REPMIN:
1088     case REPX:
1089     case REPXMIN:
1090       preg->program[scan+4] = 0;
1091       break;
1092     }
1093   }
1094   /* if there is a "must appear" string, look for it */
1095   if (preg->regmust != 0) {
1096     s = string;
1097     while ((s = str_find(s, preg->program[preg->regmust], preg->cflags&HSRX_ICASE)) != NULL) {
1098       if (prefix_cmp(preg->program+preg->regmust, preg->regmlen, s, preg->cflags&HSRX_ICASE) >= 0) {
1099         break;
1100       }
1101       ++s;
1102     }
1103     if (s == NULL) return HSRX_NOMATCH; /* not present */
1104   }
1105   /* mark beginning of line for ^ */
1106   preg->regbol = string;
1107   /* simplest case: anchored match need be tried only once (maybe per line) */
1108   if (preg->reganch) {
1109     if (eflags & HSRX_NOTBOL) {
1110       /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1111       goto nextline;
1112     }
1113     for (;;) {
1114       int ret = regtry(preg, string);
1115       if (ret) return HSRX_NOERROR;
1116       if (*string) {
1117 nextline:
1118         if (preg->cflags&HSRX_NEWLINE) {
1119           /* try the next anchor? */
1120           string = strchr(string, '\n');
1121           if (string) { preg->regbol = ++string; continue; }
1122         }
1123       }
1124       return HSRX_NOMATCH;
1125     }
1126   }
1127   /* messy cases: unanchored match */
1128   s = string;
1129   if (preg->regstart != '\0') {
1130     /* we know what char it must start with */
1131     while ((s = str_find(s, preg->regstart, preg->cflags & HSRX_ICASE)) != NULL) {
1132       if (regtry(preg, s)) return HSRX_NOERROR;
1133       ++s;
1134     }
1135   } else {
1136     /* we don't -- general case */
1137     for (;;) {
1138       if (regtry(preg, s)) return HSRX_NOERROR;
1139       if (*s == '\0') break;
1140       s += utf8_charlen(*s);
1141     }
1142   }
1143   /* failure */
1144   return HSRX_NOMATCH;
1145 }
1146
1147
1148 /*
1149  - regtry - try match at specific point
1150  */
1151 /* 0 failure, 1 success */
1152 static int regtry (HSRegExp *preg, const char *string) {
1153   int i;
1154   //
1155   preg->reginput = string;
1156   for (i = 0; i < preg->nmatch; ++i) preg->pmatch[i].rm_so = preg->pmatch[i].rm_eo = -1;
1157   if (regmatch(preg, 1)) {
1158     if (preg->nmatch > 0) {
1159       preg->pmatch[0].rm_so = string-preg->start;
1160       preg->pmatch[0].rm_eo = preg->reginput-preg->start;
1161     }
1162     return 1;
1163   }
1164   return 0;
1165 }
1166
1167
1168 /**
1169  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1170  *
1171  * If 'nocase' is non-zero, does a case-insensitive match.
1172  *
1173  * Returns -1 on not found.
1174  */
1175 static int prefix_cmp (const int *prog, int proglen, const char *string, int nocase) {
1176   const char *s = string;
1177   //
1178   while (proglen && *s) {
1179     int ch;
1180     int n = reg_utf8_tounicode_case(s, &ch, nocase);
1181     //
1182     if (ch != *prog) return -1;
1183     ++prog;
1184     s += n;
1185     --proglen;
1186   }
1187   if (proglen == 0) return s-string;
1188   return -1;
1189 }
1190
1191
1192 /**
1193  * Searchs for 'c' in the range 'range'.
1194  *
1195  * Returns 1 if found, or 0 if not.
1196  */
1197 static int reg_range_find (const int *range, int c) {
1198   while (*range) {
1199     /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1200     if (c >= range[1] && c <= (range[0]+range[1]-1)) return 1;
1201     range += 2;
1202   }
1203   return 0;
1204 }
1205
1206 /**
1207  * Search for the character 'c' in the utf-8 string 'string'.
1208  *
1209  * If 'nocase' is set, the 'string' is assumed to be uppercase
1210  * and 'c' is converted to uppercase before matching.
1211  *
1212  * Returns the byte position in the string where the 'c' was found, or
1213  * NULL if not found.
1214  */
1215 static const char *str_find (const char *string, int c, int nocase) {
1216   if (nocase) {
1217     /* the "string" should already be converted to uppercase */
1218     c = utf8_upper(c);
1219   }
1220   while (*string) {
1221     int ch;
1222     int n = reg_utf8_tounicode_case(string, &ch, nocase);
1223     //
1224     if (c == ch) return string;
1225     string += n;
1226   }
1227   return NULL;
1228 }
1229
1230
1231 /**
1232  * Returns true if 'ch' is an end-of-line char.
1233  *
1234  * In HSRX_NEWLINE mode, \n is considered EOL in
1235  * addition to \0
1236  */
1237 static int reg_iseol (HSRegExp *preg, int ch) {
1238   if (preg->cflags&HSRX_NEWLINE) return ch=='\0'||ch=='\n';
1239   return ch=='\0';
1240 }
1241
1242
1243 static int regmatchsimplerepeat (HSRegExp *preg, int scan, int matchmin) {
1244   int no, c;
1245   const char *save;
1246   int nextch = '\0';
1247   int max = preg->program[scan+2];
1248   int min = preg->program[scan+3];
1249   int next = regnext(preg, scan);
1250   /* lookahead to avoid useless match attempts when we know what character comes next */
1251   if (OP(preg, next) == EXACTLY) nextch = preg->program[OPERAND(next)];
1252   save = preg->reginput;
1253   no = regrepeat(preg, scan+5, max);
1254   if (no < min) return 0;
1255   if (matchmin) {
1256     /* from min up to no */
1257     max = no;
1258     no = min;
1259   }
1260   /* else from no down to min */
1261   for (;;) {
1262     if (matchmin) {
1263       if (no > max) break;
1264     } else {
1265       if (no < min) break;
1266     }
1267     preg->reginput = save+utf8_index(save, no);
1268     reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags&HSRX_ICASE));
1269     /* If it could work, try it. */
1270     if (reg_iseol(preg, nextch) || c == nextch) {
1271       if (regmatch(preg, next)) return 1;
1272     }
1273     if (matchmin) {
1274       /* couldn't or didn't, add one more */
1275       ++no;
1276     } else {
1277       /* couldn't or didn't -- back up */
1278       --no;
1279     }
1280   }
1281   return 0;
1282 }
1283
1284
1285 static int regmatchrepeat (HSRegExp *preg, int scan, int matchmin) {
1286   int *scanpt = preg->program+scan;
1287   int max = scanpt[2];
1288   int min = scanpt[3];
1289   //
1290   /* have we reached min? */
1291   if (scanpt[4] < min) {
1292     /* no, so get another one */
1293     ++(scanpt[4]);
1294     if (regmatch(preg, scan+5)) return 1;
1295     --(scanpt[4]);
1296     return 0;
1297   }
1298   if (scanpt[4] > max) return 0;
1299   if (matchmin) {
1300     /* minimal, so try other branch first */
1301     if (regmatch(preg, regnext(preg, scan))) return 1;
1302     /* no, so try one more */
1303     ++(scanpt[4]);
1304     if (regmatch(preg, scan+5)) return 1;
1305     --(scanpt[4]);
1306     return 0;
1307   }
1308   /* maximal, so try this branch again */
1309   if (scanpt[4] < max) {
1310     ++(scanpt[4]);
1311     if (regmatch(preg, scan+5)) return 1;
1312     --(scanpt[4]);
1313   }
1314   /* at this point we are at max with no match: try the other branch */
1315   return regmatch(preg, regnext(preg, scan));
1316 }
1317
1318
1319 /*
1320  - regmatch - main matching routine
1321  *
1322  * Conceptually the strategy is simple:  check to see whether the current
1323  * node matches, call self recursively to see whether the rest matches,
1324  * and then act accordingly.  In practice we make some effort to avoid
1325  * recursion, in particular by going through "ordinary" nodes (that don't
1326  * need to know whether the rest of the match failed) by a loop instead of
1327  * by recursion.
1328  */
1329 /* 0 failure, 1 success */
1330 static int regmatch (HSRegExp *preg, int prog) {
1331   int scan; /* current node */
1332   int next; /* next node */
1333   //
1334   scan = prog;
1335 #ifdef HSRX_DEBUG
1336   if (scan != 0 && hsrxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
1337 #endif
1338   while (scan != 0) {
1339     int n, c;
1340 #ifdef HSRX_DEBUG
1341     if (hsrxNarrate) {
1342       //fprintf(stderr, "%s...\n", regprop(scan));
1343       fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* where, what */
1344     }
1345 #endif
1346     next = regnext(preg, scan);
1347     n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags&HSRX_ICASE));
1348     switch (OP(preg, scan)) {
1349       case BOL:
1350         if (preg->reginput != preg->regbol) return 0;
1351         break;
1352       case EOL:
1353         if (!reg_iseol(preg, c)) return 0;
1354         break;
1355       case WORDA:
1356         /* must be looking at a letter, digit, or _ */
1357         if ((!isalnum(UCHAR(c))) && c != '_') return(0);
1358         /* prev must be BOL or nonword */
1359         if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return 0;
1360         break;
1361       case WORDZ:
1362         /* can't match at BOL */
1363         if (preg->reginput > preg->regbol) {
1364           /* current must be EOL or nonword */
1365           if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1366             c = preg->reginput[-1];
1367             /* previous must be word */
1368             if (isalnum(UCHAR(c)) || c == '_') break;
1369           }
1370         }
1371         /* no */
1372         return 0;
1373       case ANY:
1374         if (reg_iseol(preg, c)) return 0;
1375         preg->reginput += n;
1376         break;
1377       case EXACTLY: {
1378         int opnd = OPERAND(scan);
1379         int len = str_int_len(preg->program+opnd);
1380         int slen = prefix_cmp(preg->program+opnd, len, preg->reginput, preg->cflags&HSRX_ICASE);
1381         //
1382         if (slen < 0) return 0;
1383         preg->reginput += slen;
1384         } break;
1385       case ANYOF:
1386         if (reg_iseol(preg, c) || reg_range_find(preg->program+OPERAND(scan), c) == 0) return 0;
1387         preg->reginput += n;
1388         break;
1389       case ANYBUT:
1390         if (reg_iseol(preg, c) || reg_range_find(preg->program+OPERAND(scan), c) != 0) return 0;
1391         preg->reginput += n;
1392         break;
1393       case NOTHING: break;
1394       case BACK: break;
1395       case BRANCH:
1396         if (OP(preg, next) != BRANCH) {
1397           /* no choice */
1398           next = OPERAND(scan); /* avoid recursion */
1399         } else {
1400           const char *save;
1401           //
1402           do {
1403             save = preg->reginput;
1404             if (regmatch(preg, OPERAND(scan))) return 1;
1405             preg->reginput = save;
1406             scan = regnext(preg, scan);
1407           } while (scan != 0 && OP(preg, scan) == BRANCH);
1408           return 0;
1409           /* NOTREACHED */
1410         }
1411         break;
1412       case REP:
1413       case REPMIN:
1414         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1415       case REPX:
1416       case REPXMIN:
1417         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1418       case END:
1419         return 1; /* success! */
1420       default:
1421         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSEEND) {
1422           const char *save = preg->reginput;
1423           //
1424           if (regmatch(preg, next)) {
1425             int no;
1426             /* don't set startp if some later invocation of the same parentheses already has */
1427             if (OP(preg, scan) < CLOSE) {
1428               no = OP(preg, scan)-OPEN;
1429               if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) preg->pmatch[no].rm_so = save-preg->start;
1430             } else {
1431               no = OP(preg, scan)-CLOSE;
1432               if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) preg->pmatch[no].rm_eo = save-preg->start;
1433             }
1434             return 1;
1435           }
1436           return 0;
1437         }
1438         return HSRX_ERR_INTERNAL;
1439     }
1440     scan = next;
1441   }
1442   /* we get here only if there's trouble -- normally "case END" is the terminating point */
1443   return HSRX_ERR_INTERNAL;
1444 }
1445
1446
1447 /*
1448  - regrepeat - repeatedly match something simple, report how many
1449  */
1450 static int regrepeat (HSRegExp *preg, int p, int max) {
1451   const char *scan;
1452   int opnd, ch, n;
1453   int count = 0;
1454   //
1455   scan = preg->reginput;
1456   opnd = OPERAND(p);
1457   switch (OP(preg, p)) {
1458     case ANY:
1459       /* no need to handle utf8 specially here */
1460       while (!reg_iseol(preg, *scan) && count < max) { ++count; ++scan; }
1461       break;
1462     case EXACTLY:
1463       while (count < max) {
1464         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags&HSRX_ICASE);
1465         if (preg->program[opnd] != ch) break;
1466         ++count;
1467         scan += n;
1468       }
1469       break;
1470     case ANYOF:
1471       while (count < max) {
1472         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & HSRX_ICASE);
1473         if (reg_iseol(preg, ch) || reg_range_find(preg->program+opnd, ch) == 0) break;
1474         ++count;
1475         scan += n;
1476       }
1477       break;
1478     case ANYBUT:
1479       while (count < max) {
1480         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & HSRX_ICASE);
1481         if (reg_iseol(preg, ch) || reg_range_find(preg->program+opnd, ch) != 0) break;
1482         ++count;
1483         scan += n;
1484       }
1485       break;
1486     default: /* ih dear! called inappropriately */
1487       preg->err = HSRX_ERR_INTERNAL;
1488       count = 0; /* best compromise */
1489       break;
1490   }
1491   preg->reginput = scan;
1492   return count;
1493 }
1494
1495
1496 /*
1497  - regnext - dig the "next" pointer out of a node
1498  */
1499 static int regnext (HSRegExp *preg, int p) {
1500   int offset = NEXT(preg, p);
1501   //
1502   if (offset == 0) return 0;
1503   if (OP(preg, p) == BACK) return p-offset;
1504   return p+offset;
1505 }
1506
1507
1508 #ifdef HSRX_DEBUG
1509 /*
1510  - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1511  */
1512 void hsrxDump (HSRegExp *preg) {
1513   int s, next, i;
1514   char buf[4];
1515   int op = EXACTLY; /* arbitrary non-END op */
1516   //
1517   for (i = 1; i < preg->p; ++i) {
1518     printf("%02x ", preg->program[i]);
1519     if (i%16 == 15) printf("\n");
1520   }
1521   printf("\n");
1522   //
1523   s = 1;
1524   /* while that wasn't END last time... */
1525   while (op != END && s < preg->p) {
1526     op = OP(preg, s);
1527     printf("%3d: %s", s, regprop(op)); /* where, what */
1528     next = regnext(preg, s);
1529     if (next == 0) {
1530       /* next ptr */
1531       printf("(0)");
1532     } else {
1533       printf("(%d)", next);
1534     }
1535     s += 2;
1536     if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1537       int max = preg->program[s];
1538       int min = preg->program[s+1];
1539       //
1540       if (max == 65535) printf("{%d,*}", min); else printf("{%d,%d}", min, max);
1541       printf(" %d", preg->program[s+2]);
1542       s += 3;
1543     } else if (op == ANYOF || op == ANYBUT) {
1544       /* set of ranges */
1545       while (preg->program[s]) {
1546         int len = preg->program[s++];
1547         int first = preg->program[s++];
1548         buf[utf8_fromunicode(buf, first)] = 0;
1549         printf("%s", buf);
1550         if (len > 1) {
1551           buf[utf8_fromunicode(buf, first+len-1)] = 0;
1552           printf("-%s", buf);
1553         }
1554       }
1555       ++s;
1556     } else if (op == EXACTLY) {
1557       /* literal string, where present */
1558       while (preg->program[s]) {
1559         buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1560         printf("%s", buf);
1561         ++s;
1562       }
1563       ++s;
1564     }
1565     putchar('\n');
1566   }
1567   if (op == END) {
1568     /* header fields of interest */
1569     if (preg->regstart) {
1570       buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1571       printf("start '%s' ", buf);
1572     }
1573     if (preg->reganch) printf("anchored ");
1574     if (preg->regmust != 0) {
1575       int i;
1576       //
1577       printf("must have:");
1578       for (i = 0; i < preg->regmlen; ++i) putchar(preg->program[preg->regmust+i]);
1579       putchar('\n');
1580     }
1581   }
1582   printf("\n");
1583 }
1584
1585
1586 /*
1587  - regprop - printable representation of opcode
1588  */
1589 static const char *regprop (int op) {
1590   static char buf[50];
1591   //
1592   switch (op) {
1593     case BOL: return "BOL";
1594     case EOL: return "EOL";
1595     case ANY: return "ANY";
1596     case ANYOF: return "ANYOF";
1597     case ANYBUT: return "ANYBUT";
1598     case BRANCH: return "BRANCH";
1599     case EXACTLY: return "EXACTLY";
1600     case NOTHING: return "NOTHING";
1601     case BACK: return "BACK";
1602     case END: return "END";
1603     case REP: return "REP";
1604     case REPMIN: return "REPMIN";
1605     case REPX: return "REPX";
1606     case REPXMIN: return "REPXMIN";
1607     case WORDA: return "WORDA";
1608     case WORDZ: return "WORDZ";
1609     default:
1610       if (op >= OPEN && op < CLOSE) snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1611       else if (op >= CLOSE && op < CLOSEEND) snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1612       else snprintf(buf, sizeof(buf), "?%d?\n", op);
1613       return buf;
1614   }
1615 }
1616 #endif
1617
1618
1619 size_t hsrxError (int errcode, const HSRegExp *preg, char *errbuf,  size_t errbuf_size) {
1620   static const char *error_strings[] = {
1621     "success",
1622     "no match",
1623     "bad pattern",
1624     "null argument",
1625     "unknown error",
1626     "too big",
1627     "out of memory",
1628     "too many ()",
1629     "parentheses () not balanced",
1630     "braces {} not balanced",
1631     "invalid repetition count(s)",
1632     "extra characters",
1633     "*+ of empty atom",
1634     "nested count",
1635     "internal error",
1636     "count follows nothing",
1637     "trailing backslash",
1638     "corrupted program",
1639     "contains null char",
1640   };
1641   const char *err;
1642   //
1643   if (errcode < 0 || errcode >= HSRX_ERR_NUM) {
1644     err = "Bad error code";
1645   } else {
1646     err = error_strings[errcode];
1647   }
1648   return snprintf(errbuf, errbuf_size, "%s", err);
1649 }
1650
1651
1652 void hsrxFree (HSRegExp *preg) {
1653   if (preg != NULL && preg->program != NULL) free(preg->program);
1654 }