src/hsregexp.c

   1 /*
   2  * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
   3  *
   4  *  Copyright (c) 1986 by University of Toronto.
   5  *  Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *  Permission is granted to anyone to use this software for any
   8  *  purpose on any computer system, and to redistribute it freely,
   9  *  subject to the following restrictions:
  10  *
  11  *  1. The author is not responsible for the consequences of use of
  12  *    this software, no matter how awful, even if they arise
  13  *    from defects in it.
  14  *
  15  *  2. The origin of this software must not be misrepresented, either
  16  *    by explicit claim or by omission.
  17  *
  18  *  3. Altered versions must be plainly marked as such, and must not
  19  *    be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
  47  *
  48  *   THIS IS AN ALTERED VERSION.  It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
  49  *   in Sep, 2011, to add non-capturing brackets (?:...) and backreferences.
  50  *
  51  *   THIS IS AN ALTERED VERSION.  It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
  52  *   in Nov, 2011, to add \b and \B
  53  *
  54  * Beware that some of this code is subtly aware of the way operator
  55  * precedence is structured in regular expressions.  Serious changes in
  56  * regular-expression syntax might require a total rethink.
  57  */
  58 #include <ctype.h>
  59 #include <stdio.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "hsregexp.h"
  64
  65
  66 #define MAYBE_UNUSED  __attribute__((unused))
  67
  68 /* obey POSIX bug with the meaning of ')' */
  69 #define OBEY_POSIX_BUGS  0
  70
  71
  72 /**
  73  * UTF-8 utility functions
  74  *
  75  * (c) 2010 Steve Bennett <steveb@workware.net.au>
  76  *
  77  * See LICENCE for licence details.
  78  */
  79 struct casemap {
  80   unsigned short code; /* code point */
  81   unsigned short lower; /* code */
  82   unsigned short upper; /* code */
  83 };
  84
  85
  86 /* generated mapping tables */
  87 #include "hsregexp_unicode_mapping.c"
  88
  89
  90 #define NUMCASEMAP  (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
  91
  92 static int cmp_casemap (const void *key, const void *cm) {
  93   return *(int *)key-(int)((const struct casemap *)cm)->code;
  94 }
  95
  96
  97 static int utf8_map_case (int uc, int upper) {
  98   const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
  99   if (cm) uc = upper?cm->upper:cm->lower;
 100   return uc;
 101 }
 102
 103
 104 /**
 105  * Converts the given unicode codepoint (0 - 0xffff) to utf-8
 106  * and stores the result at 'p'.
 107  *
 108  * Returns the number of utf-8 characters (1-3).
 109  */
 110 /* This one is always implemented */
 111 static inline int utf8_fromunicode (char *p, unsigned short uc) {
 112   if (uc <= 0x7f) { *p = uc; return 1; }
 113   if (uc <= 0x7ff) { *p++ = 0xc0 | ((uc & 0x7c0) >> 6); *p = 0x80 | (uc & 0x3f); return 2; }
 114   *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3;
 115 }
 116
 117
 118 /**
 119  * Returns the unicode codepoint corresponding to the
 120  * utf-8 sequence 'str'.
 121  *
 122  * Stores the result in *uc and returns the number of bytes
 123  * consumed.
 124  *
 125  * If 'str' is null terminated, then an invalid utf-8 sequence
 126  * at the end of the string will be returned as individual bytes.
 127  *
 128  * If it is not null terminated, the length *must* be checked first.
 129  *
 130  * Does not support unicode code points > \uffff
 131  */
 132 static inline int utf8_tounicode (const char *str, int *uc) {
 133   unsigned const char *s = (unsigned const char *)str;
 134   //
 135   if (s[0] < 0xc0) { *uc = s[0]; return 1; }
 136   if (s[0] < 0xe0) {
 137     if ((s[1] & 0xc0) == 0x80) { *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); return 2; }
 138   } else if (s[0] < 0xf0) {
 139     if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
 140       *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
 141       return 3;
 142     }
 143   }
 144   /* invalid sequence, so just return the byte */
 145   *uc = *s;
 146   return 1;
 147 }
 148
 149
 150 /**
 151  * Returns the length of the utf-8 sequence starting with 'c'.
 152  *
 153  * Returns 1-4, or -1 if this is not a valid start byte.
 154  *
 155  * Note that charlen=4 is not supported by the rest of the API.
 156  */
 157 static MAYBE_UNUSED inline int utf8_charlen (int c) {
 158   if ((c&0x80) == 0) return 1;
 159   if ((c&0xe0) == 0xc0) return 2;
 160   if ((c&0xf0) == 0xe0) return 3;
 161   if ((c&0xf8) == 0xf0) return 4;
 162   /* invalid sequence */
 163   return -1;
 164 }
 165
 166
 167 /**
 168  * Returns the number of characters in the utf-8
 169  * string of the given byte length.
 170  *
 171  * Any bytes which are not part of an valid utf-8
 172  * sequence are treated as individual characters.
 173  *
 174  * The string *must* be null terminated.
 175  *
 176  * Does not support unicode code points > \uffff
 177  */
 178 static MAYBE_UNUSED int utf8_strlen (const char *str, int bytelen) {
 179   int charlen = 0;
 180   //
 181   if (bytelen < 0) bytelen = strlen(str);
 182   while (bytelen) {
 183     int c, l = utf8_tounicode(str, &c);
 184     ++charlen;
 185     str += l;
 186     bytelen -= l;
 187   }
 188   return charlen;
 189 }
 190
 191
 192 /**
 193  * Returns the byte index of the given character in the utf-8 string.
 194  *
 195  * The string *must* be null terminated.
 196  *
 197  * This will return the byte length of a utf-8 string
 198  * if given the char length.
 199  */
 200 static MAYBE_UNUSED inline int utf8_index (const char *str, int index) {
 201   const char *s = str;
 202   //
 203   while (index--) {
 204     int c;
 205     s += utf8_tounicode(s, &c);
 206   }
 207   return s-str;
 208 }
 209
 210
 211 /**
 212  * Returns the number of bytes before 'str' that the previous
 213  * utf-8 character sequence starts (which may be the middle of a sequence).
 214  *
 215  * Looks back at most 'len' bytes backwards, which must be > 0.
 216  * If no start char is found, returns -len
 217  */
 218 static MAYBE_UNUSED inline int utf8_prev_len (const char *str, int len) {
 219   int n = 1;
 220   //
 221   //assert(len > 0);
 222   /* look up to len chars backward for a start-of-char byte */
 223   while (--len) {
 224     if ((str[-n] & 0x80) == 0) break; /* start of a 1-byte char */
 225     if ((str[-n] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
 226     ++n;
 227   }
 228   return n;
 229 }
 230
 231
 232 /**
 233  * Returns the upper-case variant of the given unicode codepoint.
 234  *
 235  * Does not support unicode code points > \uffff
 236  */
 237 static MAYBE_UNUSED int utf8_upper (int uc) {
 238   if (isascii(uc)) return toupper(uc);
 239   return utf8_map_case(uc, 1);
 240 }
 241
 242
 243 /**
 244  * Returns the lower-case variant of the given unicode codepoint.
 245  *
 246  * NOTE: Use utf8_upper() in preference for case-insensitive matching.
 247  *
 248  * Does not support unicode code points > \uffff
 249  */
 250 static MAYBE_UNUSED int utf8_lower (int uc) {
 251   if (isascii(uc)) return tolower(uc);
 252   return utf8_map_case(uc, 0);
 253 }
 254
 255
 256 static MAYBE_UNUSED inline int utf8_charequal (const char *s1, const char *s2) {
 257   int c1, c2;
 258   //
 259   utf8_tounicode(s1, &c1);
 260   utf8_tounicode(s2, &c2);
 261   return c1==c2;
 262 }
 263
 264
 265 #define UCHAR(c)  ((unsigned char)(c))
 266
 267
 268 /* This *MUST* be less than (255-20-2)/2=116 */
 269 #define HSRX_MAX_PAREN  (100)
 270
 271 /*
 272  * Structure for regexp "program". This is essentially a linear encoding
 273  * of a nondeterministic finite-state machine (aka syntax charts or
 274  * "railroad normal form" in parsing technology). Each node is an opcode
 275  * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 276  * all nodes except BRANCH implement concatenation; a "next" pointer with
 277  * a BRANCH on both ends of it is connecting two alternatives. Here we
 278  * have one of the subtle syntax dependencies: an individual BRANCH (as
 279  * opposed to a collection of them) is never concatenated with anything
 280  * because of operator precedence. The operand of some types of node is
 281  * a literal string; for others, it is a node leading into a sub-FSM.
 282  * In particular, the operand of a BRANCH node is the first node of the
 283  * branch.
 284  * (NB this is *not* a tree structure: the tail of the branch connects
 285  * to the thing following the set of BRANCHes.)
 286  * The opcodes are:
 287  */
 288 /* definition_num   opnd?       meaning */
 289 #define END       0 /* no       end of program */
 290 #define BOL       1 /* no       match "" at beginning of line */
 291 #define EOL       2 /* no       match "" at end of line */
 292 #define ANY       3 /* no       match any one character */
 293 #define ANYOF     4 /* str      match any character in this string */
 294 #define ANYBUT    5 /* str      match any character not in this string */
 295 #define BRANCH    6 /* node     match this alternative, or the next... */
 296 #define BACK      7 /* no       match "", "next" ptr points backward */
 297 #define EXACTLY   8 /* str      match this string */
 298 #define NOTHING   9 /* no       match empty string */
 299 #define REP      10 /* max,min  match this (simple) thing [min,max] times */
 300 #define REPMIN   11 /* max,min  match this (simple) thing [min,max] times, mininal match */
 301 #define REPX     12 /* max,min  match this (complex) thing [min,max] times */
 302 #define REPXMIN  13 /* max,min  match this (complex) thing [min,max] times, minimal match */
 303 /* no 14 */
 304 #define WORDA    15 /* no       match "" at wordchar, where prev is nonword */
 305 #define WORDZ    16 /* no       match "" at nonwordchar, where prev is word */
 306 #define BACKREF  17 /* captnum  match previous capture here */
 307 #define WBRK     18 /* no       match word break -- \b */
 308 #define WNBRK    19 /* no       match non-word-break -- \B */
 309 /* */
 310 #define OPEN     20 /* no       mark this point in input as start of #n (OPEN+1 is number 1, etc) */
 311 #define CLOSE    (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
 312 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
 313
 314 #define NCOPEN    (CLOSEEND+0)
 315 #define NCCLOSE   (CLOSEEND+1)
 316
 317
 318 /*
 319  * The first byte of the regexp internal "program" is actually this magic
 320  * number; the start node begins in the second byte.
 321  */
 322 #define HSRX_MAGIC  (0xFADED00DUL)
 323
 324
 325 /*
 326  * Opcode notes:
 327  *
 328  * BRANCH The set of branches constituting a single choice are hooked
 329  *    together with their "next" pointers, since precedence prevents
 330  *    anything being concatenated to any individual branch.  The
 331  *    "next" pointer of the last BRANCH in a choice points to the
 332  *    thing following the whole choice.  This is also where the
 333  *    final "next" pointer of each individual branch points; each
 334  *    branch starts with the operand node of a BRANCH node.
 335  *
 336  * BACK   Normal "next" pointers all implicitly point forward; BACK
 337  *    exists to make loop structures possible.
 338  *
 339  * STAR,PLUS  '?', and complex '*' and '+', are implemented as circular
 340  *    BRANCH structures using BACK.  Simple cases (one character
 341  *    per match) are implemented with STAR and PLUS for speed
 342  *    and to minimize recursive plunges.
 343  *
 344  * OPEN,CLOSE ...are numbered at compile time.
 345  */
 346
 347
 348 /*
 349  * A node is one char of opcode followed by two chars of "next" pointer.
 350  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 351  * value is a positive offset from the opcode of the node containing it.
 352  * An operand, if any, simply follows the node.  (Note that much of the
 353  * code generation knows about this implicit relationship.)
 354  *
 355  * Using two bytes for the "next" pointer is vast overkill for most things,
 356  * but allows patterns to get big without disasters.
 357  */
 358 #define OP(preg, p)    (preg->program[p])
 359 #define NEXT(preg, p)  (preg->program[p+1])
 360 #define OPERAND(p)     ((p)+2)
 361
 362
 363 /*
 364  * Utility definitions.
 365  */
 366
 367 #define FAIL(R,M)   {(R)->err = (M); return (M);}
 368 #define ISMULT(c)   ((c)=='*'||(c)=='+'||(c)=='?'||(c) == '{')
 369 #define META        "^$.[()|?{+*"
 370
 371
 372 /*
 373  * Flags to be passed up and down.
 374  */
 375 #define WORST     0 /* worst case */
 376 #define HASWIDTH  1 /* known never to match null string */
 377 #define SIMPLE    2 /* simple enough to be STAR/PLUS operand */
 378 #define SPSTART   4 /* starts with * or + */
 379
 380
 381 #define MAX_REP_COUNT  (1000000)
 382
 383
 384 /*
 385  * Forward declarations for hsrxCompile()'s friends.
 386  */
 387 static int reg (HSRegExp *preg, int paren /*parenthesized?*/, int *flagp);
 388 static int regpiece (HSRegExp *preg, int *flagp);
 389 static int regbranch (HSRegExp *preg, int *flagp);
 390 static int regatom (HSRegExp *preg, int *flagp);
 391 static int regnode (HSRegExp *preg, int op);
 392 static int regnext (HSRegExp *preg, int p);
 393 static void regc (HSRegExp *preg, int b);
 394 static int reginsert (HSRegExp *preg, int op, int size, int opnd);
 395 static void regtail_ (HSRegExp *preg, int p, int val, int line);
 396 static void regoptail (HSRegExp *preg, int p, int val);
 397 #define regtail(PREG, P, VAL)  regtail_(PREG, P, VAL, __LINE__)
 398
 399 static int reg_range_find (HSRegExp *preg, const int *string, int c);
 400 static const char *str_find (HSRegExp *preg, const char *string, int c);
 401 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string);
 402
 403 #ifdef HSRX_DEBUG
 404 int hsrxNarrate = 0;
 405 void hsrxDump (HSRegExp *preg);
 406 static const char *regprop (int op);
 407 #endif
 408
 409
 410 /**
 411  * Returns the length of the null-terminated integer sequence.
 412  */
 413 static
 414 #if __GNUC__ > 2
 415 __attribute__((pure))
 416 #endif
 417 inline int str_int_len (const int *seq) {
 418   int n = 0;
 419   while (*seq++) ++n;
 420   return n;
 421 }
 422
 423
 424 /*
 425  - hsrxCompile - compile a regular expression into internal code
 426  *
 427  * We can't allocate space until we know how big the compiled form will be,
 428  * but we can't compile it (and thus know how big it is) until we've got a
 429  * place to put the code.  So we cheat:  we compile it twice, once with code
 430  * generation turned off and size counting turned on, and once "for real".
 431  * This also means that we don't allocate space until we are sure that the
 432  * thing really will compile successfully, and we never have to move the
 433  * code and thus invalidate pointers into it.  (Note that it has to be in
 434  * one piece because free() must be able to free it all.)
 435  *
 436  * Beware that the optimization-preparation code in here knows about some
 437  * of the structure of the compiled regexp.
 438  */
 439 int hsrxCompile (HSRegExp *preg, const char *exp, int cflags) {
 440   int scan, longest, flags;
 441   unsigned int len;
 442   //
 443 #ifdef HSRX_DEBUG
 444   fprintf(stderr, "Compiling: '%s'\n", exp);
 445 #endif
 446   memset(preg, 0, sizeof(*preg));
 447   if (exp == NULL) FAIL(preg, HSRX_ERR_NULL_ARGUMENT);
 448   /* first pass: determine size, legality */
 449   preg->start = exp; /* for '^' special */
 450   preg->cflags = cflags;
 451   preg->regparse = exp;
 452   /* XXX: for now, start unallocated */
 453   preg->program = NULL;
 454   preg->proglen = 0;
 455 #if 1
 456   /* allocate space */
 457   preg->proglen = (strlen(exp)+1)*5;
 458   preg->program = malloc(preg->proglen*sizeof(int));
 459   if (preg->program == NULL) FAIL(preg, HSRX_ERR_NOMEM);
 460 #endif
 461   /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
 462   regc(preg, HSRX_MAGIC);
 463   if (reg(preg, 0, &flags) == 0) return preg->err;
 464   /* small enough for pointer-storage convention? */
 465   if (preg->re_nsub >= HSRX_MAX_PAREN) FAIL(preg, HSRX_ERR_TOO_BIG); /* probably could be 65535L */
 466   preg->bmatch = malloc(sizeof(HSRxMatch)*(preg->re_nsub+1));
 467   if (preg->bmatch == NULL) FAIL(preg, HSRX_ERR_NOMEM);
 468   /* dig out information for optimizations. */
 469   preg->regstart = 0; /* worst-case defaults */
 470   preg->reganch = 0;
 471   preg->regmust = 0;
 472   preg->regmlen = 0;
 473   scan = 1; /* first BRANCH */
 474   if (OP(preg, regnext(preg, scan)) == END) {
 475     /* only one top-level choice */
 476     scan = OPERAND(scan);
 477     /* starting-point info */
 478     if (OP(preg, scan) == EXACTLY) preg->regstart = preg->program[OPERAND(scan)];
 479     else if (OP(preg, scan) == BOL) ++preg->reganch;
 480     /*
 481      * If there's something expensive in the r.e., find the
 482      * longest literal string that must appear and make it the
 483      * regmust.  Resolve ties in favor of later strings, since
 484      * the regstart check works with the beginning of the r.e.
 485      * and avoiding duplication strengthens checking.  Not a
 486      * strong reason, but sufficient in the absence of others.
 487      */
 488     if (flags&SPSTART) {
 489       longest = 0;
 490       len = 0;
 491       for (; scan != 0; scan = regnext(preg, scan)) {
 492         if (OP(preg, scan) == EXACTLY) {
 493           int plen = str_int_len(preg->program+OPERAND(scan));
 494           if (plen >= len) {
 495             longest = OPERAND(scan);
 496             len = plen;
 497           }
 498         }
 499       }
 500       preg->regmust = longest;
 501       preg->regmlen = len;
 502     }
 503   }
 504 #ifdef HSRX_DEBUG
 505   hsrxDump(preg);
 506 #endif
 507   return 0;
 508 }
 509
 510
 511 /*
 512  - reg - regular expression, i.e. main body or parenthesized thing
 513  *
 514  * Caller must absorb opening parenthesis.
 515  *
 516  * Combining parenthesis handling with the base level of regular expression
 517  * is a trifle forced, but the need to tie the tails of the branches to what
 518  * follows makes it hard to avoid.
 519  *
 520  * paren:
 521  *   0: not parentesized
 522  *   1: normal parens
 523  *   2: non-capturing parens
 524  */
 525 static int reg (HSRegExp *preg, int paren, int *flagp) {
 526   int ret, br, ender, flags;
 527   int parno = 0;
 528   //
 529   *flagp = HASWIDTH; /* tentatively */
 530   /* make an OPEN node, if parenthesized */
 531   if (paren) {
 532     // new (capturing) paren
 533     parno = paren==1?++preg->re_nsub:-666;
 534     ret = regnode(preg, paren==1?OPEN+parno:NCOPEN);
 535   } else {
 536     ret = 0;
 537   }
 538   /* pick up the branches, linking them together */
 539   br = regbranch(preg, &flags);
 540   if (br == 0) return 0;
 541   if (ret != 0) regtail(preg, ret, br); /* OPEN -> first */ else ret = br;
 542   if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 543   *flagp |= flags&SPSTART;
 544   while (*preg->regparse == '|') {
 545     ++preg->regparse;
 546     br = regbranch(preg, &flags);
 547     if (br == 0) return 0;
 548     regtail(preg, ret, br); /* BRANCH -> BRANCH */
 549     if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 550     *flagp |= flags&SPSTART;
 551   }
 552   /* make a closing node, and hook it on the end */
 553   ender = regnode(preg, (paren)?(paren==1?CLOSE+parno:NCCLOSE):END);
 554   regtail(preg, ret, ender);
 555   /* hook the tails of the branches to the closing node */
 556   for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender);
 557   /* check for proper termination */
 558   if (paren) {
 559     if (preg->cflags&HSRX_EXTENDED) {
 560       if (*preg->regparse++ != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 561     } else {
 562       if (preg->regparse[0] != '\\' || preg->regparse[1] != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 563       preg->regparse += 2;
 564     }
 565   }
 566   if (!paren && *preg->regparse != '\0') {
 567     if (preg->cflags&HSRX_EXTENDED) {
 568       if (*preg->regparse == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 569     } else {
 570       if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 571     }
 572     preg->err = HSRX_ERR_JUNK_ON_END;
 573     return 0;
 574   }
 575   preg->start = NULL;
 576   return ret;
 577 }
 578
 579
 580 /*
 581  - regbranch - one alternative of an | operator
 582  *
 583  * Implements the concatenation operator.
 584  */
 585 static int regbranch (HSRegExp *preg, int *flagp) {
 586   int ret, chain, latest, flags;
 587   //
 588   *flagp = WORST; /* tentatively */
 589   ret = regnode(preg, BRANCH);
 590   chain = 0;
 591   while (*preg->regparse != '\0') {
 592     if (preg->cflags&HSRX_EXTENDED) {
 593       if (*preg->regparse == ')' || *preg->regparse == '|') break;
 594     } else {
 595       if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') break;
 596     }
 597     latest = regpiece(preg, &flags);
 598     if (latest == 0) return 0;
 599     *flagp |= flags&HASWIDTH;
 600     if (chain == 0) *flagp |= flags&SPSTART; /* first piece */ else regtail(preg, chain, latest);
 601     chain = latest;
 602   }
 603   if (chain == 0) regnode(preg, NOTHING); /* loop ran zero times */
 604   return ret;
 605 }
 606
 607
 608 /*
 609  - regpiece - something followed by possible [*+?]
 610  *
 611  * Note that the branching code sequences used for ? and the general cases
 612  * of * and + are somewhat optimized:  they use the same NOTHING node as
 613  * both the endmarker for their branch list and the body of the last branch.
 614  * It might seem that this node could be dispensed with entirely, but the
 615  * endmarker role is not redundant.
 616  */
 617 static int regpiece (HSRegExp *preg, int *flagp) {
 618   char op;
 619   int ret, next, flags, min, max;
 620   int chain = 0;
 621   //
 622   ret = regatom(preg, &flags);
 623   if (ret == 0) return 0;
 624   op = *preg->regparse;
 625   if (preg->cflags&HSRX_EXTENDED) {
 626     if (!ISMULT(op)) { *flagp = flags; return ret; }
 627     if (!(flags&HASWIDTH) && op != '?') { preg->err = HSRX_ERR_OPERAND_COULD_BE_EMPTY; return 0; }
 628   } else {
 629     if (op != '*') {
 630       if (op == '\\' && preg->regparse[1] == '{') {
 631         op = '{';
 632         ++preg->regparse; /* skip slash */
 633       } else {
 634         *flagp = flags;
 635         return ret;
 636       }
 637     }
 638   }
 639   /* handle braces (counted repetition) by expansion */
 640   if (op == '{') {
 641     /* for non-extended regexps this is the only possible path */
 642     char *end;
 643     //
 644     ++preg->regparse; /* skip paren */
 645     while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 646     /* first number */
 647     if (*preg->regparse == ',') {
 648       ++preg->regparse;
 649       min = 0;
 650     } else {
 651       min = strtoul(preg->regparse, &end, 10);
 652       if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 653       preg->regparse = end;
 654     }
 655     /* second number */
 656     while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 657     if (*preg->regparse == ',') {
 658       ++preg->regparse;
 659       while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 660       if (!(isdigit(*preg->regparse))) {
 661         max = MAX_REP_COUNT;
 662       } else {
 663         max = strtoul(preg->regparse, &end, 10);
 664         if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 665         preg->regparse = end;
 666       }
 667     } else {
 668       max = min;
 669     }
 670     if (max > min || min < 0 || max < 0 || min > MAX_REP_COUNT || max > MAX_REP_COUNT) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 671     /* check for proper termination */
 672     while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 673     if (preg->cflags&HSRX_EXTENDED) {
 674       if (*preg->regparse != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
 675       ++preg->regparse;
 676     } else {
 677       if (preg->regparse[0] != '\\' || preg->regparse[1] != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
 678       preg->regparse += 2;
 679     }
 680   } else {
 681     min = op=='+'?1:0;
 682     max = op=='?'?1:MAX_REP_COUNT;
 683     ++preg->regparse; /* skip special */
 684   }
 685   /* insert 'repeat' opcode */
 686   if ((preg->cflags&HSRX_EXTENDED) && *preg->regparse == '?') {
 687     ++preg->regparse;
 688     next = reginsert(preg, flags&SIMPLE?REPMIN:REPXMIN, 5, ret);
 689   } else {
 690     next = reginsert(preg, flags&SIMPLE?REP:REPX, 5, ret);
 691   }
 692   preg->program[ret+2] = max;
 693   preg->program[ret+3] = min;
 694   preg->program[ret+4] = 0;
 695   *flagp = (min)?(WORST|HASWIDTH):(WORST|SPSTART);
 696   if (!(flags&SIMPLE)) {
 697     int back = regnode(preg, BACK);
 698     regtail(preg, back, ret);
 699     regtail(preg, next, back);
 700   }
 701   if (preg->cflags&HSRX_EXTENDED) {
 702     if (ISMULT(*preg->regparse)) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
 703   } else {
 704     if (*preg->regparse == '*' || (preg->regparse[0] == '\\' && preg->regparse[1] == '{')) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
 705   }
 706   return chain?chain:ret;
 707 }
 708
 709
 710 /**
 711  * Add all characters in the inclusive range between lower and upper.
 712  *
 713  * Handles a swapped range (upper < lower).
 714  */
 715 static void reg_addrange (HSRegExp *preg, int lower, int upper) {
 716   if (lower > upper) reg_addrange(preg, upper, lower);
 717   /* add a range as length, start */
 718   regc(preg, upper-lower+1);
 719   regc(preg, lower);
 720 }
 721
 722
 723 /**
 724  * Add a null-terminated literal string as a set of ranges.
 725  */
 726 /*
 727 static void reg_addrange_str (HSRegExp *preg, const char *str) {
 728   while (*str) { reg_addrange(preg, *str, *str); ++str; }
 729 }
 730 */
 731
 732
 733 /**
 734  * Extracts the next unicode char from utf8.
 735  *
 736  * If 'upper' is set, converts the char to uppercase.
 737  */
 738 static inline int reg_utf8_tounicode_case (HSRegExp *preg, const char *s, int *uc, int upper) {
 739   if (preg->cflags&HSRX_UTF8) {
 740     int l = utf8_tounicode(s, uc);
 741     if (upper) *uc = utf8_upper(*uc);
 742     return l;
 743   }
 744   *uc = upper?toupper(*s):(unsigned char)(*s);
 745   return 1;
 746 }
 747
 748
 749 /**
 750  * Extracts the next unicode char from utf8.
 751  *
 752  * If 'upper' is set, converts the char to uppercase.
 753  */
 754 static inline int reg_utf8_tounicode_caseR (HSRegExp *preg, const char *s, int *uc, int upper) {
 755   if (preg->eflags&HSRX_UTF8) {
 756     int l = utf8_tounicode(s, uc);
 757     if (upper) *uc = utf8_upper(*uc);
 758     return l;
 759   }
 760   *uc = upper?toupper(*s):(unsigned char)(*s);
 761   return 1;
 762 }
 763
 764
 765 /**
 766  * Converts a hex digit to decimal.
 767  *
 768  * Returns -1 for an invalid hex digit.
 769  */
 770 static inline int hexdigitval (int c) {
 771   if (c >= '0' && c <= '9') return c-'0';
 772   if (c >= 'a' && c <= 'f') return c-'a'+10;
 773   if (c >= 'A' && c <= 'F') return c-'A'+10;
 774   return -1;
 775 }
 776
 777
 778 /**
 779  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 780  *
 781  * Returns the number of hex digits parsed.
 782  * If there are no hex digits, returns 0 and stores nothing.
 783  */
 784 static int parse_hex (const char *s, int n, int *uc) {
 785   int k, val = 0;
 786   //
 787   for (k = 0; k < n; ++k) {
 788     int c = hexdigitval(*s++);
 789     if (c == -1) break;
 790     val = (val<<4)|c;
 791   }
 792   if (k) *uc = val;
 793   return k;
 794 }
 795
 796
 797 /**
 798  * Call for chars after a backlash to decode the escape sequence.
 799  *
 800  * Stores the result in *ch.
 801  *
 802  * Returns the number of bytes consumed.
 803  */
 804 static int reg_decode_escape (const char *s, int *ch) {
 805   int n;
 806   const char *s0 = s;
 807   //
 808   *ch = *s++;
 809   switch (*ch) {
 810     case 'a': *ch = '\a'; break;
 811     case 'b': *ch = '\b'; break;
 812     case 'e': *ch = 27; break;
 813     case 'f': *ch = '\f'; break;
 814     case 'n': *ch = '\n'; break;
 815     case 'r': *ch = '\r'; break;
 816     case 't': *ch = '\t'; break;
 817     case 'v': *ch = '\v'; break;
 818     case 'u':
 819       if ((n = parse_hex(s, 4, ch)) > 0) s += n;
 820       break;
 821     case 'x':
 822       if ((n = parse_hex(s, 2, ch)) > 0) s += n;
 823       break;
 824     case '\0':
 825       --s;
 826       *ch = '\\';
 827       break;
 828   }
 829   return s-s0;
 830 }
 831
 832
 833 /*
 834  - regatom - the lowest level
 835  *
 836  * Optimization:  gobbles an entire sequence of ordinary characters so that
 837  * it can turn them into a single node, which is smaller to store and
 838  * faster to run.  Backslashed characters are exceptions, each becoming a
 839  * separate node; the code is simpler that way and it's not worth fixing.
 840  */
 841 static int regatom (HSRegExp *preg, int *flagp) {
 842   int ret, flags, ch;
 843   int nocase = preg->cflags&HSRX_ICASE;
 844   int n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, nocase);
 845   //
 846   *flagp = WORST; /* tentatively */
 847   preg->regparse += n;
 848   switch (ch) {
 849     case '^': /* this char only have special meaning at beging of pat */
 850       if (preg->regparse-n != preg->start) {
 851         switch (*(preg->regparse-n)) {
 852           case '(': break;
 853           case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
 854           default: goto de_fault;
 855         }
 856       }
 857       ret = regnode(preg, BOL);
 858       if (!(preg->cflags&HSRX_EXTENDED) && *preg->regparse == '*') goto de_fault;
 859       break;
 860     case '$': /* this char only have special meaning at end of pat */
 861       switch (*preg->regparse) {
 862         case 0: case ')': break;
 863         case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
 864         default: goto de_fault;
 865       }
 866       ret = regnode(preg, EOL);
 867       break;
 868     case '.':
 869       ret = regnode(preg, ANY);
 870       *flagp |= HASWIDTH|SIMPLE;
 871       break;
 872     case '[': {
 873       const char *pattern = preg->regparse;
 874       //
 875       if (*pattern == '^') {
 876         /* complement of range */
 877         ret = regnode(preg, ANYBUT);
 878         ++pattern;
 879       } else {
 880         ret = regnode(preg, ANYOF);
 881       }
 882       /* special case: if the first char is ']' or '-', it is part of the set */
 883       if (*pattern == ']' || *pattern == '-') {
 884         reg_addrange(preg, *pattern, *pattern);
 885         ++pattern;
 886       }
 887       while (*pattern && *pattern != ']') {
 888         /* is this a range? a-z */
 889         int start, end;
 890         //
 891         pattern += reg_utf8_tounicode_case(preg, pattern, &start, nocase);
 892         if (start == '\\') {
 893           pattern += reg_decode_escape(pattern, &start);
 894           if (start == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 895         }
 896         if (pattern[0] == '-' && pattern[1]) {
 897           ++pattern; /* skip '-' */
 898           pattern += reg_utf8_tounicode_case(preg, pattern, &end, nocase);
 899           if (end == '\\') {
 900             pattern += reg_decode_escape(pattern, &end);
 901             if (end == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 902           }
 903           reg_addrange(preg, start, end);
 904           continue;
 905         }
 906         if ((preg->cflags&HSRX_EXTENDED) && start == '[') {
 907           if (strncmp(pattern, ":alpha:]", 8) == 0) {
 908             pattern += 8;
 909             if (!nocase) reg_addrange(preg, 'a', 'z');
 910             reg_addrange(preg, 'A', 'Z');
 911             continue;
 912           }
 913           if (strncmp(pattern, ":alnum:]", 8) == 0) {
 914             pattern += 8;
 915             if (!nocase) reg_addrange(preg, 'a', 'z');
 916             reg_addrange(preg, 'A', 'Z');
 917             reg_addrange(preg, '0', '9');
 918             continue;
 919           }
 920           if (strncmp(pattern, ":space:]", 8) == 0) {
 921             pattern += 8;
 922             //reg_addrange_str(preg, " \t\r\n\f\v");
 923             reg_addrange(preg, 9, 13);
 924             reg_addrange(preg, 32, 32);
 925             continue;
 926           }
 927           if (strncmp(pattern, ":blank:]", 8) == 0) {
 928             pattern += 8;
 929             //reg_addrange_str(preg, " \t\f\v");
 930             reg_addrange(preg, 9, 9);
 931             reg_addrange(preg, 11, 12);
 932             reg_addrange(preg, 32, 32);
 933             continue;
 934           }
 935           if (strncmp(pattern, ":digit:]", 8) == 0) {
 936             pattern += 8;
 937             reg_addrange(preg, '0', '9');
 938             continue;
 939           }
 940           if (strncmp(pattern, ":lower:]", 8) == 0) {
 941             pattern += 8;
 942             if (nocase) reg_addrange(preg, 'A', 'Z'); else reg_addrange(preg, 'a', 'z');
 943             continue;
 944           }
 945           if (strncmp(pattern, ":upper:]", 8) == 0) {
 946             pattern += 8;
 947             reg_addrange(preg, 'A', 'Z');
 948             continue;
 949           }
 950           if (strncmp(pattern, ":xdigit:]", 9) == 0) {
 951             pattern += 9;
 952             reg_addrange(preg, '0', '9');
 953             if (!nocase) reg_addrange(preg, 'a', 'f');
 954             reg_addrange(preg, 'A', 'F');
 955             continue;
 956           }
 957         }
 958         /* not a range, so just add the char */
 959         reg_addrange(preg, start, start);
 960       }
 961       regc(preg, 0);
 962       if (*pattern) ++pattern;
 963       preg->regparse = pattern;
 964       *flagp |= HASWIDTH|SIMPLE;
 965       } break;
 966     case '(':
 967       // only for exteded mode
 968       if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 969         // non-capturing bracket
 970         preg->regparse += 2;
 971         ret = reg(preg, 2, &flags);
 972         goto bracecont;
 973       }
 974 bracestart:
 975       ret = reg(preg, 1, &flags);
 976 bracecont:
 977       if (ret == 0) return 0;
 978       *flagp |= flags&(HASWIDTH|SPSTART);
 979       break;
 980     case '\0': preg->err = HSRX_ERR_INTERNAL; return 0; /* supposed to be caught earlier */
 981     case ')':
 982 #if OBEY_POSIX_BUGS
 983       goto de_fault;
 984 #endif
 985     case '|':
 986       if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
 987       goto de_fault;
 988     case '?':
 989     case '+':
 990     case '{':
 991     case '*':
 992       if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
 993       goto de_fault;
 994     case '\\':
 995       switch (*preg->regparse++) {
 996         case '\0': preg->err = HSRX_ERR_TRAILING_BACKSLASH; return 0;
 997         case '<': case 'm': ret = regnode(preg, WORDA); break;
 998         case '>': case 'M': ret = regnode(preg, WORDZ); break;
 999         case 'd': case 'D':
1000           ret = regnode(preg, preg->regparse[-1]=='d'?ANYOF:ANYBUT);
1001           reg_addrange(preg, '0', '9');
1002           regc(preg, 0);
1003           *flagp |= HASWIDTH|SIMPLE;
1004           break;
1005         case 'b': ret = regnode(preg, WBRK); break;
1006         case 'B': ret = regnode(preg, WNBRK); break;
1007         case 'w': case 'W':
1008           ret = regnode(preg, preg->regparse[-1]=='w'?ANYOF:ANYBUT);
1009           if (!(preg->cflags&HSRX_ICASE)) reg_addrange(preg, 'a', 'z');
1010           reg_addrange(preg, 'A', 'Z');
1011           reg_addrange(preg, '0', '9');
1012           reg_addrange(preg, '_', '_');
1013           regc(preg, 0);
1014           *flagp |= HASWIDTH|SIMPLE;
1015           break;
1016         case 's': case 'S':
1017           ret = regnode(preg, preg->regparse[-1]=='s'?ANYOF:ANYBUT);
1018           //reg_addrange_str(preg," \t\r\n\f\v");
1019           reg_addrange(preg, 9, 13);
1020           reg_addrange(preg, 32, 32);
1021           regc(preg, 0);
1022           *flagp |= HASWIDTH|SIMPLE;
1023           break;
1024         case '(':
1025           if (!(preg->cflags&HSRX_EXTENDED)) goto bracestart;
1026           goto escnotspecial;
1027         case '{':
1028           if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
1029           goto escnotspecial;
1030         case ')':
1031         case '}':
1032           if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
1033           goto escnotspecial;
1034         case '0' ... '9':
1035           n = 0; ch = 0; --(preg->regparse);
1036           while (isdigit(preg->regparse[0])) {
1037             n = (n*10)+preg->regparse[0]-'0';
1038             if (n > HSRX_MAX_PAREN) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1039             ++(preg->regparse);
1040             ++ch;
1041           }
1042           if (ch > 1) {
1043             /* eat possible ';' */
1044             if (preg->regparse[0] == ';') { ++(preg->regparse); ++ch; }
1045           }
1046           if (n < 1 || n > preg->re_nsub) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1047           ret = regnode(preg, BACKREF);
1048           regc(preg, n);
1049           break;
1050         default:
1051           /* handle general quoted chars in exact-match routine */
1052           /* back up to include the backslash */
1053 escnotspecial:
1054           --preg->regparse;
1055           goto de_fault;
1056       }
1057       break;
1058 de_fault:
1059     default: {
1060       /* encode a string of characters to be matched exactly */
1061       int added = 0;
1062       /* back up to pick up the first char of interest */
1063       preg->regparse -= n;
1064       ret = regnode(preg, EXACTLY);
1065       /* note that a META operator such as ? or * consumes the preceding char */
1066       /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
1067       /* until end of string or a META char is reached */
1068       while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
1069         n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, preg->cflags&HSRX_ICASE);
1070         if (ch == '\\' && preg->regparse[n]) {
1071           /* non-trailing backslash: is this a special escape, or a regular escape? */
1072           if (strchr("<>mMwds", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1073           if (!(preg->cflags&HSRX_EXTENDED) && strchr("(){}", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1074           /* decode it; note that we add the length for the escape
1075            * sequence to the length for the backlash so we can skip
1076            * the entire sequence, or not as required
1077            */
1078           n += reg_decode_escape(preg->regparse+n, &ch);
1079           if (ch == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
1080         }
1081         /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
1082         if (((preg->cflags&HSRX_EXTENDED) && ISMULT(preg->regparse[n])) ||
1083             (!(preg->cflags&HSRX_EXTENDED) && (preg->regparse[n] == '*' || (preg->regparse[n] == '\\' && preg->regparse[n+1] == '{')))) {
1084           /* yes, but do we already have some EXACTLY chars? */
1085           if (added) break; /* yes, so return what we have and pick up the current char next time around */
1086           /* no, so add this single char and finish */
1087           regc(preg, ch);
1088           ++added;
1089           preg->regparse += n;
1090           break;
1091         }
1092         /* no, so just add this char normally */
1093         regc(preg, ch);
1094         ++added;
1095         preg->regparse += n;
1096       }
1097       regc(preg, 0);
1098       *flagp |= HASWIDTH;
1099       if (added == 1) *flagp |= SIMPLE;
1100       break;
1101     }
1102     break;
1103   }
1104   return ret;
1105 }
1106
1107
1108 static void reg_grow (HSRegExp *preg, int n) {
1109   if (preg->p+n >= preg->proglen) {
1110     preg->proglen = (preg->p+n)*2;
1111     preg->program = realloc(preg->program, preg->proglen*sizeof(int));
1112   }
1113 }
1114
1115
1116 /*
1117  - regnode - emit a node
1118  */
1119 /* location */
1120 static int regnode (HSRegExp *preg, int op) {
1121   reg_grow(preg, 2);
1122   preg->program[preg->p++] = op;
1123   preg->program[preg->p++] = 0;
1124   /* return the start of the node */
1125   return preg->p-2;
1126 }
1127
1128
1129 /*
1130  - regc - emit (if appropriate) a byte of code
1131  */
1132 static void regc (HSRegExp *preg, int b) {
1133   reg_grow(preg, 1);
1134   preg->program[preg->p++] = b;
1135 }
1136
1137
1138 /*
1139  - reginsert - insert an operator in front of already-emitted operand
1140  *
1141  * Means relocating the operand.
1142  * Returns the new location of the original operand.
1143  */
1144 static int reginsert (HSRegExp *preg, int op, int size, int opnd) {
1145   reg_grow(preg, size);
1146   /* move everything from opnd up */
1147   memmove(preg->program+opnd+size, preg->program+opnd, sizeof(int)*(preg->p-opnd));
1148   /* zero out the new space */
1149   memset(preg->program+opnd, 0, sizeof(int)*size);
1150   preg->program[opnd] = op;
1151   preg->p += size;
1152   return opnd+size;
1153 }
1154
1155
1156 /*
1157  - regtail - set the next-pointer at the end of a node chain
1158  */
1159 static void regtail_ (HSRegExp *preg, int p, int val, int line) {
1160   int scan, temp, offset;
1161   /* find last node */
1162   scan = p;
1163   for (;;) {
1164     temp = regnext(preg, scan);
1165     if (temp == 0) break;
1166     scan = temp;
1167   }
1168   offset = OP(preg, scan)==BACK?scan-val:val-scan;
1169   preg->program[scan+1] = offset;
1170 }
1171
1172
1173 /*
1174  - regoptail - regtail on operand of first argument; nop if operandless
1175  */
1176 static void regoptail (HSRegExp *preg, int p, int val) {
1177   /* "operandless" and "op != BRANCH" are synonymous in practice */
1178   if (p != 0 && OP(preg, p) == BRANCH) regtail(preg, OPERAND(p), val);
1179 }
1180
1181
1182 /*
1183  * hsrxExec and friends
1184  */
1185
1186 /*
1187  * forwards
1188  */
1189 static int regtry (HSRegExp *preg, const char *string);
1190 static int regmatch (HSRegExp *preg, int prog);
1191 static int regrepeat (HSRegExp *preg, int p, int max);
1192
1193
1194 /*
1195  - hsrxExec - match a regexp against a string
1196  */
1197 int hsrxExec (HSRegExp *preg, const char *string, size_t nmatch, HSRxMatch pmatch[], int eflags) {
1198   const char *s;
1199   int scan;
1200   //int parens[HSRX_MAX_PAREN+1];
1201   /* be paranoid... */
1202   if (preg == NULL || preg->program == NULL || string == NULL) return HSRX_ERR_NULL_ARGUMENT;
1203   /* check validity of program */
1204   if (*preg->program != HSRX_MAGIC) return HSRX_ERR_CORRUPTED;
1205 #ifdef HSRX_DEBUG
1206   fprintf(stderr, "hsrxExec: %s\n", string);
1207   hsrxDump(preg);
1208 #endif
1209   preg->eflags = eflags;
1210   preg->pmatch = pmatch;
1211   preg->nmatch = nmatch;
1212   preg->start = string; /* all offsets are computed from here */
1213   /* must clear out the embedded repeat counts */
1214   for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1215     switch (OP(preg, scan)) {
1216       case REP: case REPMIN: case REPX: case REPXMIN: preg->program[scan+4] = 0; break;
1217     }
1218   }
1219   /* clear matches for backrefing */
1220   for (scan = 0; scan <= preg->re_nsub; ++scan) preg->bmatch[scan].rm_so = preg->bmatch[scan].rm_eo = -1;
1221   /* if there is a "must appear" string, look for it */
1222   if (preg->regmust != 0) {
1223     s = string;
1224     while ((s = str_find(preg, s, preg->program[preg->regmust])) != NULL) {
1225       if (prefix_cmp(preg, preg->program+preg->regmust, preg->regmlen, s) >= 0) {
1226         break;
1227       }
1228       ++s;
1229     }
1230     if (s == NULL) return HSRX_NOMATCH; /* not present */
1231   }
1232   /* mark beginning of line for ^ */
1233   preg->regbol = string;
1234   /* simplest case: anchored match need be tried only once (maybe per line) */
1235   if (preg->reganch) {
1236     if (!(preg->cflags&HSRX_NEWLINE) && (eflags&HSRX_NOTBOL)) {
1237       /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1238       goto nextline;
1239     }
1240     for (;;) {
1241       int ret = regtry(preg, string);
1242       if (ret) return HSRX_NOERROR;
1243       if (*string) {
1244 nextline:
1245         if (preg->cflags&HSRX_NEWLINE) {
1246           /* try the next anchor? */
1247           string = strchr(string, '\n');
1248           if (string == NULL) { preg->regbol = ++string; continue; }
1249         }
1250       }
1251       return HSRX_NOMATCH;
1252     }
1253   }
1254   /* messy cases: unanchored match */
1255   s = string;
1256   if (preg->regstart != '\0') {
1257     /* we know what char it must start with */
1258     while ((s = str_find(preg, s, preg->regstart)) != NULL) {
1259       if (regtry(preg, s)) return HSRX_NOERROR;
1260       ++s;
1261     }
1262   } else {
1263     /* we don't -- general case */
1264     while (*string) {
1265       int l;
1266       if (regtry(preg, s)) return HSRX_NOERROR;
1267       if (*s == '\0') break;
1268       l = eflags&HSRX_UTF8?utf8_charlen(*s):1;
1269       s += l;
1270     }
1271   }
1272   /* failure */
1273   return HSRX_NOMATCH;
1274 }
1275
1276
1277 /*
1278  - regtry - try match at specific point
1279  */
1280 /* 0 failure, 1 success */
1281 static int regtry (HSRegExp *preg, const char *string) {
1282   int i;
1283   //
1284   preg->reginput = string;
1285   for (i = 0; i < preg->nmatch; ++i) preg->pmatch[i].rm_so = preg->pmatch[i].rm_eo = -1;
1286   if (regmatch(preg, 1)) {
1287     if (preg->nmatch > 0 && preg->pmatch != NULL) {
1288       preg->pmatch[0].rm_so = string-preg->start;
1289       preg->pmatch[0].rm_eo = preg->reginput-preg->start;
1290     }
1291     return 1;
1292   }
1293   return 0;
1294 }
1295
1296
1297 /**
1298  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1299  *
1300  * If 'nocase' is non-zero, does a case-insensitive match.
1301  *
1302  * Returns -1 on not found.
1303  */
1304 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string) {
1305   const char *s = string;
1306   int nocase = preg->cflags&HSRX_ICASE;
1307   //
1308   while (proglen && *s) {
1309     int ch, n = reg_utf8_tounicode_caseR(preg, s, &ch, nocase);
1310     //
1311     if (ch != *prog) return -1;
1312     ++prog;
1313     s += n;
1314     --proglen;
1315   }
1316   if (proglen == 0) return s-string;
1317   return -1;
1318 }
1319
1320
1321 /**
1322  * Search for the character 'c' in the utf-8 string 'string'.
1323  *
1324  * If 'nocase' is set, the 'string' is assumed to be uppercase
1325  * and 'c' is converted to uppercase before matching.
1326  *
1327  * Returns the byte position in the string where the 'c' was found, or
1328  * NULL if not found.
1329  */
1330 static const char *str_find (HSRegExp *preg, const char *string, int c) {
1331   int nocase = preg->cflags&HSRX_ICASE;
1332   //
1333   if (nocase) c = utf8_upper(c);
1334   while (*string) {
1335     int ch, n = reg_utf8_tounicode_caseR(preg, string, &ch, nocase);
1336     //
1337     if (c == ch) return string;
1338     string += n;
1339   }
1340   return NULL;
1341 }
1342
1343
1344 /**
1345  * Searchs for 'c' in the range 'range'.
1346  *
1347  * Returns 1 if found, or 0 if not.
1348  */
1349 static int reg_range_find (HSRegExp *preg, const int *range, int c) {
1350   if (preg->cflags&HSRX_ICASE) c = utf8_upper(c);
1351   while (*range) {
1352     /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1353     if (c >= range[1] && c <= range[0]+range[1]-1) return 1;
1354     range += 2;
1355   }
1356   return 0;
1357 }
1358
1359
1360 /**
1361  * Returns true if 'ch' is an end-of-line char.
1362  *
1363  * In HSRX_NEWLINE mode, \n is considered EOL in
1364  * addition to \0
1365  */
1366 static int reg_iseol (HSRegExp *preg, int ch) {
1367   if (preg->cflags&HSRX_NEWLINE) return ch=='\0'||ch=='\n';
1368   return ch=='\0';
1369 }
1370
1371
1372 static int regmatchsimplerepeat (HSRegExp *preg, int scan, int matchmin) {
1373   int no, c;
1374   const char *save;
1375   int nextch = '\0';
1376   int max = preg->program[scan+2];
1377   int min = preg->program[scan+3];
1378   int next = regnext(preg, scan);
1379   /* lookahead to avoid useless match attempts when we know what character comes next */
1380   if (OP(preg, next) == EXACTLY) nextch = preg->program[OPERAND(next)];
1381   save = preg->reginput;
1382   no = regrepeat(preg, scan+5, max);
1383   if (no < min) return 0;
1384   if (matchmin) {
1385     /* from min up to no */
1386     max = no;
1387     no = min;
1388   }
1389   /* else from no down to min */
1390   for (;;) {
1391     if (matchmin) {
1392       if (no > max) break;
1393     } else {
1394       if (no < min) break;
1395     }
1396     preg->reginput = save+utf8_index(save, no);
1397     reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1398     /* if it could work, try it */
1399     if (reg_iseol(preg, nextch) || c == nextch) {
1400       if (regmatch(preg, next)) return 1;
1401     }
1402     if (matchmin) {
1403       /* couldn't or didn't, add one more */
1404       ++no;
1405     } else {
1406       /* couldn't or didn't, back up */
1407       --no;
1408     }
1409   }
1410   return 0;
1411 }
1412
1413
1414 static int regmatchrepeat (HSRegExp *preg, int scan, int matchmin) {
1415   int *scanpt = preg->program+scan;
1416   int max = scanpt[2];
1417   int min = scanpt[3];
1418   //
1419   /* have we reached min? */
1420   if (scanpt[4] < min) {
1421     /* no, so get another one */
1422     ++(scanpt[4]);
1423     if (regmatch(preg, scan+5)) return 1;
1424     --(scanpt[4]);
1425     return 0;
1426   }
1427   if (scanpt[4] > max) return 0;
1428   if (matchmin) {
1429     /* minimal, so try other branch first */
1430     if (regmatch(preg, regnext(preg, scan))) return 1;
1431     /* no, so try one more */
1432     ++(scanpt[4]);
1433     if (regmatch(preg, scan+5)) return 1;
1434     --(scanpt[4]);
1435     return 0;
1436   }
1437   /* maximal, so try this branch again */
1438   if (scanpt[4] < max) {
1439     ++(scanpt[4]);
1440     if (regmatch(preg, scan+5)) return 1;
1441     --(scanpt[4]);
1442   }
1443   /* at this point we are at max with no match: try the other branch */
1444   return regmatch(preg, regnext(preg, scan));
1445 }
1446
1447
1448 static int prevChar (HSRegExp *preg) {
1449   if (preg->reginput == preg->start) return 0;
1450   if (preg->eflags&HSRX_UTF8) {
1451     int c;
1452     const char *pcp = preg->reginput-utf8_prev_len(preg->reginput, preg->reginput-preg->start);
1453     reg_utf8_tounicode_caseR(preg, pcp, &c, preg->cflags&HSRX_ICASE);
1454     return c;
1455   }
1456   return (unsigned char)(preg->reginput[-1]);
1457 }
1458
1459
1460 /*TODO: unicode word breaks?*/
1461 static inline int isWordChar (int ch) {
1462   return
1463     (ch >= 'a' && ch <= 'z') ||
1464     (ch >= 'A' && ch <= 'Z') ||
1465     (ch >= '0' && ch <= '9') ||
1466     (ch == '_');
1467 }
1468
1469
1470 /*
1471  - regmatch - main matching routine
1472  *
1473  * Conceptually the strategy is simple:  check to see whether the current
1474  * node matches, call self recursively to see whether the rest matches,
1475  * and then act accordingly.  In practice we make some effort to avoid
1476  * recursion, in particular by going through "ordinary" nodes (that don't
1477  * need to know whether the rest of the match failed) by a loop instead of
1478  * by recursion.
1479  */
1480 /* 0 failure, 1 success */
1481 static int regmatch (HSRegExp *preg, int prog) {
1482   int scan; /* current node */
1483   int next; /* next node */
1484   //
1485   scan = prog;
1486 #ifdef HSRX_DEBUG
1487   if (scan != 0 && hsrxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
1488 #endif
1489   while (scan != 0) {
1490     int n, c, s;
1491 #ifdef HSRX_DEBUG
1492     if (hsrxNarrate) {
1493       //fprintf(stderr, "%s...\n", regprop(scan));
1494       fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* where, what */
1495     }
1496 #endif
1497     next = regnext(preg, scan);
1498     n = reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1499     switch (OP(preg, scan)) {
1500       case BOL:
1501         if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTBOL)) {
1502           if (preg->reginput != preg->regbol) return 0;
1503         }
1504         if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTBOL)) return 0;
1505         break;
1506       case EOL:
1507         if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTEOL)) {
1508           if (!reg_iseol(preg, c)) return 0;
1509         }
1510         if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTEOL)) return 0;
1511         break;
1512       case WORDA:
1513         /* must be looking at a letter, digit, or _ */
1514         if (!isalnum(UCHAR(c)) && c != '_') return 0;
1515         /* prev must be BOL or nonword */
1516         if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return 0;
1517         break;
1518       case WORDZ:
1519         /* can't match at BOL */
1520         if (preg->reginput > preg->regbol) {
1521           /* current must be EOL or nonword */
1522           if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1523             c = preg->reginput[-1];
1524             /* previous must be word */
1525             if (isalnum(UCHAR(c)) || c == '_') break;
1526           }
1527         }
1528         /* no */
1529         return 0;
1530       case WBRK:
1531         if (isWordChar(prevChar(preg)) == isWordChar(c)) return 0; /* failed */
1532         break;
1533       case WNBRK:
1534         if (isWordChar(prevChar(preg)) != isWordChar(c)) return 0; /* failed */
1535         break;
1536       case ANY:
1537         if (reg_iseol(preg, c)) return 0;
1538         preg->reginput += n;
1539         break;
1540       case EXACTLY: {
1541         int opnd = OPERAND(scan);
1542         int len = str_int_len(preg->program+opnd);
1543         int slen = prefix_cmp(preg, preg->program+opnd, len, preg->reginput);
1544         //
1545         if (slen < 0) return 0;
1546         preg->reginput += slen;
1547         } break;
1548       case ANYOF:
1549         if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) == 0) return 0;
1550         preg->reginput += n;
1551         break;
1552       case ANYBUT:
1553         if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) != 0) return 0;
1554         preg->reginput += n;
1555         break;
1556       case NOTHING: break;
1557       case BACK: break;
1558       case BRANCH:
1559         if (OP(preg, next) != BRANCH) {
1560           /* no choice */
1561           next = OPERAND(scan); /* avoid recursion */
1562         } else {
1563           const char *save;
1564           //
1565           do {
1566             save = preg->reginput;
1567             if (regmatch(preg, OPERAND(scan))) return 1;
1568             preg->reginput = save;
1569             scan = regnext(preg, scan);
1570           } while (scan != 0 && OP(preg, scan) == BRANCH);
1571           return 0;
1572           /* NOTREACHED */
1573         }
1574         break;
1575       case REP:
1576       case REPMIN:
1577         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1578       case REPX:
1579       case REPXMIN:
1580         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1581       case END:
1582         return 1; /* success! */
1583       case BACKREF: /* backreference */
1584         c = preg->program[OPERAND(scan)];
1585 #ifdef HSRX_DEBUG
1586         printf("BACKREF: %d (%d)\n", c, preg->re_nsub);
1587 #endif
1588         if (c < 1 || c > preg->re_nsub) return 0; /* invalid backref can't be matched */
1589         s = preg->bmatch[c].rm_so;
1590 #ifdef HSRX_DEBUG
1591         printf("  %d;%d\n", s, preg->bmatch[c].rm_eo);
1592 #endif
1593         if (s < 0 || preg->bmatch[c].rm_eo < s) return 0; /* capture not found yet */
1594         /* check string */
1595         n = preg->bmatch[c].rm_eo-s; /* capture length */
1596         /* dumb compare */
1597         for (c = 0; c < n; ++c) {
1598 #ifdef HSRX_DEBUG
1599           printf(" [%c] [%c]\n", preg->reginput[c], preg->start[s+c]);
1600 #endif
1601           if (!preg->reginput[c] || preg->reginput[c] != preg->start[s+c]) {
1602 #ifdef HSRX_DEBUG
1603             printf("  FAIL!\n");
1604 #endif
1605             return 0; /* alas */
1606           }
1607         }
1608 #ifdef HSRX_DEBUG
1609         printf("  HIT!\n");
1610 #endif
1611         /* move position and return success */
1612         preg->reginput += n;
1613         break;
1614       default:
1615         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) <= /*CLOSEEND*/NCCLOSE) {
1616           const char *save = preg->reginput;
1617           //int saveso = 0, saveeo = 0;
1618           //
1619           if (OP(preg, scan) < CLOSEEND) {
1620             if (OP(preg, scan) < CLOSE) {
1621               c = OP(preg, scan)-OPEN;
1622               preg->bmatch[c].rm_so = save-preg->start;
1623             } else {
1624               c = OP(preg, scan)-CLOSE;
1625               preg->bmatch[c].rm_eo = save-preg->start;
1626             }
1627           }
1628           //
1629           if (regmatch(preg, next)) {
1630             /* don't set startp if some later invocation of the same parentheses already has */
1631             if (OP(preg, scan) < CLOSEEND) {
1632               /* capturing parens */
1633               if (OP(preg, scan) < CLOSE) {
1634                 /* open */
1635                 c = OP(preg, scan)-OPEN;
1636                 if (c < preg->nmatch && preg->pmatch[c].rm_so == -1) preg->pmatch[c].rm_so = save-preg->start;
1637               } else {
1638                 /* close */
1639                 c = OP(preg, scan)-CLOSE;
1640                 if (c < preg->nmatch && preg->pmatch[c].rm_eo == -1) preg->pmatch[c].rm_eo = save-preg->start;
1641               }
1642             }
1643             return 1;
1644           }
1645           //
1646           if (OP(preg, scan) < CLOSEEND) {
1647             if (OP(preg, scan) < CLOSE) {
1648               c = OP(preg, scan)-OPEN;
1649               preg->bmatch[c].rm_so = -1;
1650             } else {
1651               c = OP(preg, scan)-CLOSE;
1652               preg->bmatch[c].rm_eo = -1;
1653             }
1654           }
1655           //
1656           return 0;
1657         }
1658         return HSRX_ERR_INTERNAL;
1659     }
1660     scan = next;
1661   }
1662   /* we get here only if there's trouble -- normally "case END" is the terminating point */
1663   return HSRX_ERR_INTERNAL;
1664 }
1665
1666
1667 /*
1668  - regrepeat - repeatedly match something simple, report how many
1669  */
1670 static int regrepeat (HSRegExp *preg, int p, int max) {
1671   const char *scan;
1672   int opnd, ch, n;
1673   int count = 0;
1674   //
1675   scan = preg->reginput;
1676   opnd = OPERAND(p);
1677   switch (OP(preg, p)) {
1678     case ANY:
1679       /* no need to handle utf8 specially here */
1680       while (!reg_iseol(preg, *scan) && count < max) { ++count; ++scan; }
1681       break;
1682     case EXACTLY:
1683       while (count < max) {
1684         n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1685         if (preg->program[opnd] != ch) break;
1686         ++count;
1687         scan += n;
1688       }
1689       break;
1690     case ANYOF:
1691       while (count < max) {
1692         n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1693         if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) == 0) break;
1694         ++count;
1695         scan += n;
1696       }
1697       break;
1698     case ANYBUT:
1699       while (count < max) {
1700         n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1701         if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) != 0) break;
1702         ++count;
1703         scan += n;
1704       }
1705       break;
1706     default: /* ih dear! called inappropriately */
1707       preg->err = HSRX_ERR_INTERNAL;
1708       count = 0; /* best compromise */
1709       break;
1710   }
1711   preg->reginput = scan;
1712   return count;
1713 }
1714
1715
1716 /*
1717  - regnext - dig the "next" pointer out of a node
1718  */
1719 static int regnext (HSRegExp *preg, int p) {
1720   int offset = NEXT(preg, p);
1721   //
1722   if (offset == 0) return 0;
1723   if (OP(preg, p) == BACK) return p-offset;
1724   return p+offset;
1725 }
1726
1727
1728 #ifdef HSRX_DEBUG
1729 /*
1730  - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1731  */
1732 static void dumpChar (int ch) {
1733   if (ch > 255) {
1734     printf("{0x%x}", (unsigned int)ch);
1735   } else if (ch < 32 || ch >= 127) {
1736     printf("{%d}", ch);
1737   } else {
1738     putchar(ch);
1739   }
1740 }
1741
1742
1743 void hsrxDump (HSRegExp *preg) {
1744   int s, next, i;
1745   char buf[4];
1746   int op = EXACTLY; /* arbitrary non-END op */
1747   //
1748   for (i = 1; i < preg->p; ++i) {
1749     printf("%02x ", preg->program[i]);
1750     if (i%16 == 15) printf("\n");
1751   }
1752   printf("\n");
1753   //
1754   s = 1;
1755   /* while that wasn't END last time... */
1756   while (op != END && s < preg->p) {
1757     op = OP(preg, s);
1758     printf("%3d: %-12s", s, regprop(op)); /* where, what */
1759     next = regnext(preg, s);
1760     if (next == 0) {
1761       /* next ptr */
1762       printf("(0)");
1763     } else {
1764       printf("(%d)", next);
1765     }
1766     s += 2;
1767     if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1768       int max = preg->program[s];
1769       int min = preg->program[s+1];
1770       //
1771       if (max == 65535) printf(" {%d,*}", min); else printf(" {%d,%d}", min, max);
1772       printf(" %d", preg->program[s+2]);
1773       s += 3;
1774     } else if (op == ANYOF || op == ANYBUT) {
1775       /* set of ranges */
1776       printf(" r: [");
1777       while (preg->program[s]) {
1778         int len = preg->program[s++];
1779         int first = preg->program[s++];
1780         /*
1781         buf[utf8_fromunicode(buf, first)] = 0;
1782         printf("%s", buf);
1783         if (len > 1) {
1784           buf[utf8_fromunicode(buf, first+len-1)] = 0;
1785           printf("-%s", buf);
1786         }
1787         */
1788         dumpChar(first);
1789         printf("-");
1790         if (len > 1) {
1791           dumpChar(first+len-1);
1792         } else {
1793           dumpChar(first);
1794         }
1795       }
1796       printf("]");
1797       ++s;
1798     } else if (op == EXACTLY) {
1799       /* literal string, where present */
1800       printf(" l: '");
1801       while (preg->program[s]) {
1802         /*
1803         buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1804         printf("%s", buf);
1805         */
1806         dumpChar(preg->program[s]);
1807         ++s;
1808       }
1809       printf("'");
1810       ++s;
1811     } else if (op == BACKREF) {
1812       printf(" refno: %d", preg->program[s++]);
1813     }
1814     putchar('\n');
1815   }
1816   if (op == END) {
1817     /* header fields of interest */
1818     if (preg->regstart) {
1819       buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1820       printf("start '%s' ", buf);
1821     }
1822     if (preg->reganch) printf("anchored ");
1823     if (preg->regmust != 0) {
1824       int i;
1825       //
1826       printf("must have:");
1827       for (i = 0; i < preg->regmlen; ++i) putchar(preg->program[preg->regmust+i]);
1828       putchar('\n');
1829     }
1830   }
1831   printf("\n");
1832 }
1833
1834
1835 /*
1836  - regprop - printable representation of opcode
1837  */
1838 static const char *regprop (int op) {
1839   static char buf[50];
1840   //
1841   switch (op) {
1842     case BOL: return "BOL";
1843     case EOL: return "EOL";
1844     case ANY: return "ANY";
1845     case ANYOF: return "ANYOF";
1846     case ANYBUT: return "ANYBUT";
1847     case BRANCH: return "BRANCH";
1848     case EXACTLY: return "EXACTLY";
1849     case NOTHING: return "NOTHING";
1850     case BACK: return "BACK";
1851     case END: return "END";
1852     case REP: return "REP";
1853     case REPMIN: return "REPMIN";
1854     case REPX: return "REPX";
1855     case REPXMIN: return "REPXMIN";
1856     case WORDA: return "WORDA";
1857     case WORDZ: return "WORDZ";
1858     case BACKREF: return "BACKREF";
1859     case NCOPEN: return "NCOPEN";
1860     case NCCLOSE: return "NCCLOSE";
1861     default:
1862       if (op >= OPEN && op < CLOSE) snprintf(buf, sizeof(buf), "OPEN:%d", op-OPEN);
1863       else if (op >= CLOSE && op < CLOSEEND) snprintf(buf, sizeof(buf), "CLOSE:%d", op-CLOSE);
1864       else snprintf(buf, sizeof(buf), "?%d?\n", op);
1865       return buf;
1866   }
1867 }
1868 #endif
1869
1870
1871 size_t hsrxError (int errcode, const HSRegExp *preg, char *errbuf,  size_t errbuf_size) {
1872   static const char *error_strings[] = {
1873     "success",
1874     "no match",
1875     "bad pattern",
1876     "null argument",
1877     "unknown error",
1878     "too big",
1879     "out of memory",
1880     "too many ()",
1881     "parentheses () not balanced",
1882     "braces {} not balanced",
1883     "invalid repetition count(s)",
1884     "extra characters",
1885     "*+ of empty atom",
1886     "nested count",
1887     "internal error",
1888     "count follows nothing",
1889     "trailing backslash",
1890     "corrupted program",
1891     "contains null char",
1892     "invalid backreference number",
1893   };
1894   const char *err;
1895   //
1896   if (errcode < 0 || errcode >= HSRX_ERR_NUM) {
1897     err = "Bad error code";
1898   } else {
1899     err = error_strings[errcode];
1900   }
1901   return snprintf(errbuf, errbuf_size, "%s", err);
1902 }
1903
1904
1905 void hsrxFree (HSRegExp *preg) {
1906   if (preg != NULL) {
1907     if (preg->bmatch != NULL) free(preg->bmatch); preg->bmatch = NULL;
1908     if (preg->program != NULL) free(preg->program); preg->program = NULL;
1909   }
1910 }