src/hsregexp.c

   1 /*
   2  * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
   3  *
   4  *  Copyright (c) 1986 by University of Toronto.
   5  *  Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *  Permission is granted to anyone to use this software for any
   8  *  purpose on any computer system, and to redistribute it freely,
   9  *  subject to the following restrictions:
  10  *
  11  *  1. The author is not responsible for the consequences of use of
  12  *    this software, no matter how awful, even if they arise
  13  *    from defects in it.
  14  *
  15  *  2. The origin of this software must not be misrepresented, either
  16  *    by explicit claim or by omission.
  17  *
  18  *  3. Altered versions must be plainly marked as such, and must not
  19  *    be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
  47  *
  48  *   THIS IS AN ALTERED VERSION.  It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
  49  *   in Sep, 2011, to add non-capturing brackets (?:...) and backreferences.
  50  *
  51  *   THIS IS AN ALTERED VERSION.  It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
  52  *   in Nov, 2011, to add \b and \B
  53  *
  54  * Beware that some of this code is subtly aware of the way operator
  55  * precedence is structured in regular expressions.  Serious changes in
  56  * regular-expression syntax might require a total rethink.
  57  */
  58 #include <ctype.h>
  59 #include <stdio.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "hsregexp.h"
  64
  65
  66 #define MAYBE_UNUSED  __attribute__((unused))
  67
  68 /* obey POSIX bug with the meaning of ')' */
  69 #define OBEY_POSIX_BUGS  0
  70
  71
  72 /**
  73  * UTF-8 utility functions
  74  *
  75  * (c) 2010 Steve Bennett <steveb@workware.net.au>
  76  *
  77  * See LICENCE for licence details.
  78  */
  79 struct casemap {
  80   unsigned short code; /* code point */
  81   unsigned short lower; /* code */
  82   unsigned short upper; /* code */
  83 };
  84
  85
  86 /* generated mapping tables */
  87 #include "hsregexp_unicode_mapping.c"
  88
  89
  90 #define NUMCASEMAP  (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
  91
  92 static int cmp_casemap (const void *key, const void *cm) {
  93   return *(int *)key-(int)((const struct casemap *)cm)->code;
  94 }
  95
  96
  97 static int utf8_map_case (int uc, int upper) {
  98   const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
  99   if (cm) uc = upper?cm->upper:cm->lower;
 100   return uc;
 101 }
 102
 103
 104 /**
 105  * Converts the given unicode codepoint (0 - 0xffff) to utf-8
 106  * and stores the result at 'p'.
 107  *
 108  * Returns the number of utf-8 characters (1-3).
 109  */
 110 /* This one is always implemented */
 111 static inline int utf8_fromunicode (char *p, unsigned short uc) {
 112   if (uc <= 0x7f) { *p = uc; return 1; }
 113   if (uc <= 0x7ff) { *p++ = 0xc0 | ((uc & 0x7c0) >> 6); *p = 0x80 | (uc & 0x3f); return 2; }
 114   *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3;
 115 }
 116
 117
 118 /**
 119  * Returns the unicode codepoint corresponding to the
 120  * utf-8 sequence 'str'.
 121  *
 122  * Stores the result in *uc and returns the number of bytes
 123  * consumed.
 124  *
 125  * If 'str' is null terminated, then an invalid utf-8 sequence
 126  * at the end of the string will be returned as individual bytes.
 127  *
 128  * If it is not null terminated, the length *must* be checked first.
 129  *
 130  * Does not support unicode code points > \uffff
 131  */
 132 static inline int utf8_tounicode (const char *str, int *uc) {
 133   unsigned const char *s = (unsigned const char *)str;
 134   //
 135   if (s[0] < 0xc0) { *uc = s[0]; return 1; }
 136   if (s[0] < 0xe0) {
 137     if ((s[1] & 0xc0) == 0x80) { *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); return 2; }
 138   } else if (s[0] < 0xf0) {
 139     if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
 140       *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
 141       return 3;
 142     }
 143   }
 144   /* invalid sequence, so just return the byte */
 145   *uc = *s;
 146   return 1;
 147 }
 148
 149
 150 /**
 151  * Returns the length of the utf-8 sequence starting with 'c'.
 152  *
 153  * Returns 1-4, or -1 if this is not a valid start byte.
 154  *
 155  * Note that charlen=4 is not supported by the rest of the API.
 156  */
 157 static MAYBE_UNUSED inline int utf8_charlen (int c) {
 158   if ((c&0x80) == 0) return 1;
 159   if ((c&0xe0) == 0xc0) return 2;
 160   if ((c&0xf0) == 0xe0) return 3;
 161   if ((c&0xf8) == 0xf0) return 4;
 162   /* invalid sequence */
 163   return -1;
 164 }
 165
 166
 167 /**
 168  * Returns the number of characters in the utf-8
 169  * string of the given byte length.
 170  *
 171  * Any bytes which are not part of an valid utf-8
 172  * sequence are treated as individual characters.
 173  *
 174  * The string *must* be null terminated.
 175  *
 176  * Does not support unicode code points > \uffff
 177  */
 178 static MAYBE_UNUSED int utf8_strlen (const char *str, int bytelen) {
 179   int charlen = 0;
 180   //
 181   if (bytelen < 0) bytelen = strlen(str);
 182   while (bytelen) {
 183     int c, l = utf8_tounicode(str, &c);
 184     ++charlen;
 185     str += l;
 186     bytelen -= l;
 187   }
 188   return charlen;
 189 }
 190
 191
 192 /**
 193  * Returns the byte index of the given character in the utf-8 string.
 194  *
 195  * The string *must* be null terminated.
 196  *
 197  * This will return the byte length of a utf-8 string
 198  * if given the char length.
 199  */
 200 static MAYBE_UNUSED inline int utf8_index (const char *str, int index) {
 201   const char *s = str;
 202   //
 203   while (index--) {
 204     int c;
 205     s += utf8_tounicode(s, &c);
 206   }
 207   return s-str;
 208 }
 209
 210
 211 /**
 212  * Returns the number of bytes before 'str' that the previous
 213  * utf-8 character sequence starts (which may be the middle of a sequence).
 214  *
 215  * Looks back at most 'len' bytes backwards, which must be > 0.
 216  * If no start char is found, returns -len
 217  */
 218 static MAYBE_UNUSED inline int utf8_prev_len (const char *str, int len) {
 219   int n = 1;
 220   //
 221   //assert(len > 0);
 222   /* look up to len chars backward for a start-of-char byte */
 223   while (--len) {
 224     if ((str[-n] & 0x80) == 0) break; /* start of a 1-byte char */
 225     if ((str[-n] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
 226     ++n;
 227   }
 228   return n;
 229 }
 230
 231
 232 /**
 233  * Returns the upper-case variant of the given unicode codepoint.
 234  *
 235  * Does not support unicode code points > \uffff
 236  */
 237 static MAYBE_UNUSED int utf8_upper (int uc) {
 238   if (isascii(uc)) return toupper(uc);
 239   return utf8_map_case(uc, 1);
 240 }
 241
 242
 243 /**
 244  * Returns the lower-case variant of the given unicode codepoint.
 245  *
 246  * NOTE: Use utf8_upper() in preference for case-insensitive matching.
 247  *
 248  * Does not support unicode code points > \uffff
 249  */
 250 static MAYBE_UNUSED int utf8_lower (int uc) {
 251   if (isascii(uc)) return tolower(uc);
 252   return utf8_map_case(uc, 0);
 253 }
 254
 255
 256 static MAYBE_UNUSED inline int utf8_charequal (const char *s1, const char *s2) {
 257   int c1, c2;
 258   //
 259   utf8_tounicode(s1, &c1);
 260   utf8_tounicode(s2, &c2);
 261   return c1==c2;
 262 }
 263
 264
 265 #define UCHAR(c)  ((unsigned char)(c))
 266
 267
 268 /* This *MUST* be less than (255-20-2)/2=116 */
 269 #define HSRX_MAX_PAREN  (100)
 270
 271 /*
 272  * Structure for regexp "program". This is essentially a linear encoding
 273  * of a nondeterministic finite-state machine (aka syntax charts or
 274  * "railroad normal form" in parsing technology). Each node is an opcode
 275  * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 276  * all nodes except BRANCH implement concatenation; a "next" pointer with
 277  * a BRANCH on both ends of it is connecting two alternatives. Here we
 278  * have one of the subtle syntax dependencies: an individual BRANCH (as
 279  * opposed to a collection of them) is never concatenated with anything
 280  * because of operator precedence. The operand of some types of node is
 281  * a literal string; for others, it is a node leading into a sub-FSM.
 282  * In particular, the operand of a BRANCH node is the first node of the
 283  * branch.
 284  * (NB this is *not* a tree structure: the tail of the branch connects
 285  * to the thing following the set of BRANCHes.)
 286  * The opcodes are:
 287  */
 288 /* definition_num   opnd?       meaning */
 289 #define END       0 /* no       end of program */
 290 #define BOL       1 /* no       match "" at beginning of line */
 291 #define EOL       2 /* no       match "" at end of line */
 292 #define ANY       3 /* no       match any one character */
 293 #define ANYOF     4 /* str      match any character in this string */
 294 #define ANYBUT    5 /* str      match any character not in this string */
 295 #define BRANCH    6 /* node     match this alternative, or the next... */
 296 #define BACK      7 /* no       match "", "next" ptr points backward */
 297 #define EXACTLY   8 /* str      match this string */
 298 #define NOTHING   9 /* no       match empty string */
 299 #define REP      10 /* max,min  match this (simple) thing [min,max] times */
 300 #define REPMIN   11 /* max,min  match this (simple) thing [min,max] times, mininal match */
 301 #define REPX     12 /* max,min  match this (complex) thing [min,max] times */
 302 #define REPXMIN  13 /* max,min  match this (complex) thing [min,max] times, minimal match */
 303 /* no 14 */
 304 #define WORDA    15 /* no       match "" at wordchar, where prev is nonword */
 305 #define WORDZ    16 /* no       match "" at nonwordchar, where prev is word */
 306 #define BACKREF  17 /* captnum  match previous capture here */
 307 #define WBRK     18 /* no       match word break -- \b */
 308 #define WNBRK    19 /* no       match non-word-break -- \B */
 309 /* */
 310 #define OPEN     20 /* no       mark this point in input as start of #n (OPEN+1 is number 1, etc) */
 311 #define CLOSE    (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
 312 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
 313
 314 #define NCOPEN    (CLOSEEND+0)
 315 #define NCCLOSE   (CLOSEEND+1)
 316
 317
 318 /*
 319  * The first byte of the regexp internal "program" is actually this magic
 320  * number; the start node begins in the second byte.
 321  */
 322 #define HSRX_MAGIC  (0xFADED00DUL)
 323
 324
 325 /*
 326  * Opcode notes:
 327  *
 328  * BRANCH The set of branches constituting a single choice are hooked
 329  *    together with their "next" pointers, since precedence prevents
 330  *    anything being concatenated to any individual branch.  The
 331  *    "next" pointer of the last BRANCH in a choice points to the
 332  *    thing following the whole choice.  This is also where the
 333  *    final "next" pointer of each individual branch points; each
 334  *    branch starts with the operand node of a BRANCH node.
 335  *
 336  * BACK   Normal "next" pointers all implicitly point forward; BACK
 337  *    exists to make loop structures possible.
 338  *
 339  * STAR,PLUS  '?', and complex '*' and '+', are implemented as circular
 340  *    BRANCH structures using BACK.  Simple cases (one character
 341  *    per match) are implemented with STAR and PLUS for speed
 342  *    and to minimize recursive plunges.
 343  *
 344  * OPEN,CLOSE ...are numbered at compile time.
 345  */
 346
 347
 348 /*
 349  * A node is one char of opcode followed by two chars of "next" pointer.
 350  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 351  * value is a positive offset from the opcode of the node containing it.
 352  * An operand, if any, simply follows the node.  (Note that much of the
 353  * code generation knows about this implicit relationship.)
 354  *
 355  * Using two bytes for the "next" pointer is vast overkill for most things,
 356  * but allows patterns to get big without disasters.
 357  */
 358 #define OP(preg, p)    (preg->program[p])
 359 #define NEXT(preg, p)  (preg->program[p+1])
 360 #define OPERAND(p)     ((p)+2)
 361
 362
 363 /*
 364  * Utility definitions.
 365  */
 366
 367 #define FAIL(R,M)   {(R)->err = (M); return (M);}
 368 #define ISMULT(c)   ((c)=='*'||(c)=='+'||(c)=='?'||(c) == '{')
 369 #define META        "$.[()|?{+*"
 370
 371
 372 /*
 373  * Flags to be passed up and down.
 374  */
 375 #define WORST     0 /* worst case */
 376 #define HASWIDTH  1 /* known never to match null string */
 377 #define SIMPLE    2 /* simple enough to be STAR/PLUS operand */
 378 #define SPSTART   4 /* starts with * or + */
 379
 380
 381 #define MAX_REP_COUNT  (1000000)
 382
 383
 384 /*
 385  * Forward declarations for hsrxCompile()'s friends.
 386  */
 387 static int reg (HSRegExp *preg, int paren /*parenthesized?*/, int *flagp);
 388 static int regpiece (HSRegExp *preg, int *flagp);
 389 static int regbranch (HSRegExp *preg, int *flagp);
 390 static int regatom (HSRegExp *preg, int *flagp);
 391 static int regnode (HSRegExp *preg, int op);
 392 static int regnext (HSRegExp *preg, int p);
 393 static void regc (HSRegExp *preg, int b);
 394 static int reginsert (HSRegExp *preg, int op, int size, int opnd);
 395 static void regtail_ (HSRegExp *preg, int p, int val, int line);
 396 static void regoptail (HSRegExp *preg, int p, int val);
 397 #define regtail(PREG, P, VAL)  regtail_(PREG, P, VAL, __LINE__)
 398
 399 static int reg_range_find (HSRegExp *preg, const int *string, int c);
 400 static const char *str_find (HSRegExp *preg, const char *string, int c);
 401 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string);
 402
 403 #ifdef HSRX_DEBUG
 404 int hsrxNarrate = 0;
 405 void hsrxDump (HSRegExp *preg);
 406 static const char *regprop (int op);
 407 #endif
 408
 409
 410 /**
 411  * Returns the length of the null-terminated integer sequence.
 412  */
 413 static
 414 #if __GNUC__ > 2
 415 __attribute__((pure))
 416 #endif
 417 inline int str_int_len (const int *seq) {
 418   int n = 0;
 419   while (*seq++) ++n;
 420   return n;
 421 }
 422
 423
 424 /*
 425  - hsrxCompile - compile a regular expression into internal code
 426  *
 427  * We can't allocate space until we know how big the compiled form will be,
 428  * but we can't compile it (and thus know how big it is) until we've got a
 429  * place to put the code.  So we cheat:  we compile it twice, once with code
 430  * generation turned off and size counting turned on, and once "for real".
 431  * This also means that we don't allocate space until we are sure that the
 432  * thing really will compile successfully, and we never have to move the
 433  * code and thus invalidate pointers into it.  (Note that it has to be in
 434  * one piece because free() must be able to free it all.)
 435  *
 436  * Beware that the optimization-preparation code in here knows about some
 437  * of the structure of the compiled regexp.
 438  */
 439 int hsrxCompile (HSRegExp *preg, const char *exp, int cflags) {
 440   int scan, longest, flags;
 441   unsigned int len;
 442   //
 443 #ifdef HSRX_DEBUG
 444   fprintf(stderr, "Compiling: '%s'\n", exp);
 445 #endif
 446   memset(preg, 0, sizeof(*preg));
 447   if (exp == NULL) FAIL(preg, HSRX_ERR_NULL_ARGUMENT);
 448   /* first pass: determine size, legality */
 449   preg->start = exp; /* for '^' special */
 450   preg->cflags = cflags;
 451   preg->regparse = exp;
 452   /* XXX: for now, start unallocated */
 453   preg->program = NULL;
 454   preg->proglen = 0;
 455 #if 1
 456   /* allocate space */
 457   preg->proglen = (strlen(exp)+1)*5;
 458   preg->program = malloc(preg->proglen*sizeof(int));
 459   if (preg->program == NULL) FAIL(preg, HSRX_ERR_NOMEM);
 460 #endif
 461   /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
 462   regc(preg, HSRX_MAGIC);
 463   if (reg(preg, 0, &flags) == 0) return preg->err;
 464   /* small enough for pointer-storage convention? */
 465   if (preg->re_nsub >= HSRX_MAX_PAREN) FAIL(preg, HSRX_ERR_TOO_BIG); /* probably could be 65535L */
 466   preg->bmatch = malloc(sizeof(HSRxMatch)*(preg->re_nsub+1));
 467   if (preg->bmatch == NULL) FAIL(preg, HSRX_ERR_NOMEM);
 468   /* dig out information for optimizations. */
 469   preg->regstart = 0; /* worst-case defaults */
 470   preg->reganch = 0;
 471   preg->regmust = 0;
 472   preg->regmlen = 0;
 473   scan = 1; /* first BRANCH */
 474   if (OP(preg, regnext(preg, scan)) == END) {
 475     /* only one top-level choice */
 476     scan = OPERAND(scan);
 477     /* starting-point info */
 478     if (OP(preg, scan) == EXACTLY) preg->regstart = preg->program[OPERAND(scan)];
 479     else if (OP(preg, scan) == BOL) ++preg->reganch;
 480     /*
 481      * If there's something expensive in the r.e., find the
 482      * longest literal string that must appear and make it the
 483      * regmust.  Resolve ties in favor of later strings, since
 484      * the regstart check works with the beginning of the r.e.
 485      * and avoiding duplication strengthens checking.  Not a
 486      * strong reason, but sufficient in the absence of others.
 487      */
 488     if (flags&SPSTART) {
 489       longest = 0;
 490       len = 0;
 491       for (; scan != 0; scan = regnext(preg, scan)) {
 492         if (OP(preg, scan) == EXACTLY) {
 493           int plen = str_int_len(preg->program+OPERAND(scan));
 494           if (plen >= len) {
 495             longest = OPERAND(scan);
 496             len = plen;
 497           }
 498         }
 499       }
 500       preg->regmust = longest;
 501       preg->regmlen = len;
 502     }
 503   }
 504 #ifdef HSRX_DEBUG
 505   hsrxDump(preg);
 506 #endif
 507   return 0;
 508 }
 509
 510
 511 /*
 512  - reg - regular expression, i.e. main body or parenthesized thing
 513  *
 514  * Caller must absorb opening parenthesis.
 515  *
 516  * Combining parenthesis handling with the base level of regular expression
 517  * is a trifle forced, but the need to tie the tails of the branches to what
 518  * follows makes it hard to avoid.
 519  *
 520  * paren:
 521  *   0: not parentesized
 522  *   1: normal parens
 523  *   2: non-capturing parens
 524  */
 525 static int reg (HSRegExp *preg, int paren, int *flagp) {
 526   int ret, br, ender, flags;
 527   int parno = 0;
 528   //
 529   *flagp = HASWIDTH; /* tentatively */
 530   /* make an OPEN node, if parenthesized */
 531   if (paren) {
 532     // new (capturing) paren
 533     parno = paren==1?++preg->re_nsub:-666;
 534     ret = regnode(preg, paren==1?OPEN+parno:NCOPEN);
 535   } else {
 536     ret = 0;
 537   }
 538   /* pick up the branches, linking them together */
 539   br = regbranch(preg, &flags);
 540   if (br == 0) return 0;
 541   if (ret != 0) regtail(preg, ret, br); /* OPEN -> first */ else ret = br;
 542   if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 543   *flagp |= flags&SPSTART;
 544   while (*preg->regparse == '|') {
 545     ++preg->regparse;
 546     br = regbranch(preg, &flags);
 547     if (br == 0) return 0;
 548     regtail(preg, ret, br); /* BRANCH -> BRANCH */
 549     if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 550     *flagp |= flags&SPSTART;
 551   }
 552   /* make a closing node, and hook it on the end */
 553   ender = regnode(preg, (paren)?(paren==1?CLOSE+parno:NCCLOSE):END);
 554   regtail(preg, ret, ender);
 555   /* hook the tails of the branches to the closing node */
 556   for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender);
 557   /* check for proper termination */
 558   if (paren) {
 559     if (preg->cflags&HSRX_EXTENDED) {
 560       if (*preg->regparse++ != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 561     } else {
 562       if (preg->regparse[0] != '\\' || preg->regparse[1] != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 563       preg->regparse += 2;
 564     }
 565   }
 566   if (!paren && *preg->regparse != '\0') {
 567     if (preg->cflags&HSRX_EXTENDED) {
 568       if (*preg->regparse == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 569     } else {
 570       if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
 571     }
 572     preg->err = HSRX_ERR_JUNK_ON_END;
 573     return 0;
 574   }
 575   preg->start = NULL;
 576   return ret;
 577 }
 578
 579
 580 /*
 581  - regbranch - one alternative of an | operator
 582  *
 583  * Implements the concatenation operator.
 584  */
 585 static int regbranch (HSRegExp *preg, int *flagp) {
 586   int ret, chain, latest, flags;
 587   //
 588   *flagp = WORST; /* tentatively */
 589   ret = regnode(preg, BRANCH);
 590   chain = 0;
 591   while (*preg->regparse != '\0') {
 592     if (preg->cflags&HSRX_EXTENDED) {
 593       if (*preg->regparse == ')' || *preg->regparse == '|') break;
 594     } else {
 595       if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') break;
 596     }
 597     latest = regpiece(preg, &flags);
 598     if (latest == 0) return 0;
 599     *flagp |= flags&HASWIDTH;
 600     if (chain == 0) *flagp |= flags&SPSTART; /* first piece */ else regtail(preg, chain, latest);
 601     chain = latest;
 602   }
 603   if (chain == 0) regnode(preg, NOTHING); /* loop ran zero times */
 604   return ret;
 605 }
 606
 607
 608 /*
 609  - regpiece - something followed by possible [*+?]
 610  *
 611  * Note that the branching code sequences used for ? and the general cases
 612  * of * and + are somewhat optimized:  they use the same NOTHING node as
 613  * both the endmarker for their branch list and the body of the last branch.
 614  * It might seem that this node could be dispensed with entirely, but the
 615  * endmarker role is not redundant.
 616  */
 617 static int regpiece (HSRegExp *preg, int *flagp) {
 618   char op;
 619   int ret, next, flags, min, max;
 620   int chain = 0;
 621   //
 622   ret = regatom(preg, &flags);
 623   if (ret == 0) return 0;
 624   op = *preg->regparse;
 625   if (preg->cflags&HSRX_EXTENDED) {
 626     if (!ISMULT(op)) { *flagp = flags; return ret; }
 627     if (!(flags&HASWIDTH) && op != '?') { preg->err = HSRX_ERR_OPERAND_COULD_BE_EMPTY; return 0; }
 628   } else {
 629     if (op != '*') {
 630       if (op == '\\' && preg->regparse[1] == '{') {
 631         op = '{';
 632         ++preg->regparse; /* skip slash */
 633       } else {
 634         *flagp = flags;
 635         return ret;
 636       }
 637     }
 638   }
 639   /* handle braces (counted repetition) by expansion */
 640   if (op == '{') {
 641     /* for non-extended regexps this is the only possible path */
 642     char *end;
 643     //
 644     ++preg->regparse; /* skip paren */
 645     while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 646     /* first number */
 647     if (*preg->regparse == ',') {
 648       ++preg->regparse;
 649       min = 0;
 650     } else {
 651       min = strtoul(preg->regparse, &end, 10);
 652       if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 653       preg->regparse = end;
 654     }
 655     /* second number */
 656     while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 657     if (*preg->regparse == ',') {
 658       ++preg->regparse;
 659       while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 660       if (!(isdigit(*preg->regparse))) {
 661         max = MAX_REP_COUNT;
 662       } else {
 663         max = strtoul(preg->regparse, &end, 10);
 664         if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 665         preg->regparse = end;
 666       }
 667     } else {
 668       max = min;
 669     }
 670     if (max > min || min < 0 || max < 0 || min > MAX_REP_COUNT || max > MAX_REP_COUNT) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
 671     /* check for proper termination */
 672     while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
 673     if (preg->cflags&HSRX_EXTENDED) {
 674       if (*preg->regparse != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
 675       ++preg->regparse;
 676     } else {
 677       if (preg->regparse[0] != '\\' || preg->regparse[1] != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
 678       preg->regparse += 2;
 679     }
 680   } else {
 681     min = op=='+'?1:0;
 682     max = op=='?'?1:MAX_REP_COUNT;
 683     ++preg->regparse; /* skip special */
 684   }
 685   /* insert 'repeat' opcode */
 686   if ((preg->cflags&HSRX_EXTENDED) && *preg->regparse == '?') {
 687     ++preg->regparse;
 688     next = reginsert(preg, flags&SIMPLE?REPMIN:REPXMIN, 5, ret);
 689   } else {
 690     next = reginsert(preg, flags&SIMPLE?REP:REPX, 5, ret);
 691   }
 692   preg->program[ret+2] = max;
 693   preg->program[ret+3] = min;
 694   preg->program[ret+4] = 0;
 695   *flagp = (min)?(WORST|HASWIDTH):(WORST|SPSTART);
 696   if (!(flags&SIMPLE)) {
 697     int back = regnode(preg, BACK);
 698     regtail(preg, back, ret);
 699     regtail(preg, next, back);
 700   }
 701   if (preg->cflags&HSRX_EXTENDED) {
 702     if (ISMULT(*preg->regparse)) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
 703   } else {
 704     if (*preg->regparse == '*' || (preg->regparse[0] == '\\' && preg->regparse[1] == '{')) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
 705   }
 706   return chain?chain:ret;
 707 }
 708
 709
 710 /**
 711  * Add all characters in the inclusive range between lower and upper.
 712  *
 713  * Handles a swapped range (upper < lower).
 714  */
 715 static void reg_addrange (HSRegExp *preg, int lower, int upper) {
 716   if (lower > upper) reg_addrange(preg, upper, lower);
 717   /* add a range as length, start */
 718   regc(preg, upper-lower+1);
 719   regc(preg, lower);
 720 }
 721
 722
 723 /**
 724  * Add a null-terminated literal string as a set of ranges.
 725  */
 726 /*
 727 static void reg_addrange_str (HSRegExp *preg, const char *str) {
 728   while (*str) { reg_addrange(preg, *str, *str); ++str; }
 729 }
 730 */
 731
 732
 733 /**
 734  * Extracts the next unicode char from utf8.
 735  *
 736  * If 'upper' is set, converts the char to uppercase.
 737  */
 738 static inline int reg_utf8_tounicode_case (HSRegExp *preg, const char *s, int *uc, int upper) {
 739   if (preg->cflags&HSRX_UTF8) {
 740     int l = utf8_tounicode(s, uc);
 741     if (upper) *uc = utf8_upper(*uc);
 742     return l;
 743   }
 744   *uc = upper?toupper(*s):(unsigned char)(*s);
 745   return 1;
 746 }
 747
 748
 749 /**
 750  * Extracts the next unicode char from utf8.
 751  *
 752  * If 'upper' is set, converts the char to uppercase.
 753  */
 754 static inline int reg_utf8_tounicode_caseR (HSRegExp *preg, const char *s, int *uc, int upper) {
 755   if (preg->eflags&HSRX_UTF8) {
 756     int l = utf8_tounicode(s, uc);
 757     if (upper) *uc = utf8_upper(*uc);
 758     return l;
 759   }
 760   *uc = upper?toupper(*s):(unsigned char)(*s);
 761   return 1;
 762 }
 763
 764
 765 /**
 766  * Converts a hex digit to decimal.
 767  *
 768  * Returns -1 for an invalid hex digit.
 769  */
 770 static inline int hexdigitval (int c) {
 771   if (c >= '0' && c <= '9') return c-'0';
 772   if (c >= 'a' && c <= 'f') return c-'a'+10;
 773   if (c >= 'A' && c <= 'F') return c-'A'+10;
 774   return -1;
 775 }
 776
 777
 778 /**
 779  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 780  *
 781  * Returns the number of hex digits parsed.
 782  * If there are no hex digits, returns 0 and stores nothing.
 783  */
 784 static int parse_hex (const char *s, int n, int *uc) {
 785   int k, val = 0;
 786   //
 787   for (k = 0; k < n; ++k) {
 788     int c = hexdigitval(*s++);
 789     if (c == -1) break;
 790     val = (val<<4)|c;
 791   }
 792   if (k) *uc = val;
 793   return k;
 794 }
 795
 796
 797 /**
 798  * Call for chars after a backlash to decode the escape sequence.
 799  *
 800  * Stores the result in *ch.
 801  *
 802  * Returns the number of bytes consumed.
 803  */
 804 static int reg_decode_escape (const char *s, int *ch) {
 805   int n;
 806   const char *s0 = s;
 807   //
 808   *ch = *s++;
 809   switch (*ch) {
 810     case 'a': *ch = '\a'; break;
 811     case 'b': *ch = '\b'; break;
 812     case 'e': *ch = 27; break;
 813     case 'f': *ch = '\f'; break;
 814     case 'n': *ch = '\n'; break;
 815     case 'r': *ch = '\r'; break;
 816     case 't': *ch = '\t'; break;
 817     case 'v': *ch = '\v'; break;
 818     case 'u':
 819       if ((n = parse_hex(s, 4, ch)) > 0) s += n;
 820       break;
 821     case 'x':
 822       if ((n = parse_hex(s, 2, ch)) > 0) s += n;
 823       break;
 824     case '\0':
 825       --s;
 826       *ch = '\\';
 827       break;
 828   }
 829   return s-s0;
 830 }
 831
 832
 833 /*
 834  - regatom - the lowest level
 835  *
 836  * Optimization:  gobbles an entire sequence of ordinary characters so that
 837  * it can turn them into a single node, which is smaller to store and
 838  * faster to run.  Backslashed characters are exceptions, each becoming a
 839  * separate node; the code is simpler that way and it's not worth fixing.
 840  */
 841 static int regatom (HSRegExp *preg, int *flagp) {
 842   int ret, flags, ch;
 843   int nocase = preg->cflags&HSRX_ICASE;
 844   int n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, nocase);
 845   int forced = 0;
 846   //
 847   *flagp = WORST; /* tentatively */
 848   preg->regparse += n;
 849   switch (ch) {
 850     case '^': /* this char only have special meaning at beging of pat */
 851       if (preg->regparse-n != preg->start) {
 852         switch (*(preg->regparse-n-1)) {
 853           case '(': break;
 854           case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
 855           default: forced = 1; goto de_fault;
 856         }
 857       }
 858       ret = regnode(preg, BOL);
 859       if (!(preg->cflags&HSRX_EXTENDED) && *preg->regparse == '*') goto de_fault;
 860       break;
 861     case '$': /* this char only have special meaning at end of pat */
 862       switch (*preg->regparse) {
 863         case 0: case ')': break;
 864         case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
 865         default: forced = 1; goto de_fault;
 866       }
 867       ret = regnode(preg, EOL);
 868       break;
 869     case '.':
 870       ret = regnode(preg, ANY);
 871       *flagp |= HASWIDTH|SIMPLE;
 872       break;
 873     case '[': {
 874       const char *pattern = preg->regparse;
 875       //
 876       if (*pattern == '^') {
 877         /* complement of range */
 878         ret = regnode(preg, ANYBUT);
 879         ++pattern;
 880       } else {
 881         ret = regnode(preg, ANYOF);
 882       }
 883       /* special case: if the first char is ']' or '-', it is part of the set */
 884       if (*pattern == ']' || *pattern == '-') {
 885         reg_addrange(preg, *pattern, *pattern);
 886         ++pattern;
 887       }
 888       while (*pattern && *pattern != ']') {
 889         /* is this a range? a-z */
 890         int start, end;
 891         //
 892         pattern += reg_utf8_tounicode_case(preg, pattern, &start, nocase);
 893         if (start == '\\') {
 894           pattern += reg_decode_escape(pattern, &start);
 895           if (start == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 896         }
 897         if (pattern[0] == '-' && pattern[1]) {
 898           ++pattern; /* skip '-' */
 899           pattern += reg_utf8_tounicode_case(preg, pattern, &end, nocase);
 900           if (end == '\\') {
 901             pattern += reg_decode_escape(pattern, &end);
 902             if (end == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
 903           }
 904           reg_addrange(preg, start, end);
 905           continue;
 906         }
 907         if ((preg->cflags&HSRX_EXTENDED) && start == '[') {
 908           if (strncmp(pattern, ":alpha:]", 8) == 0) {
 909             pattern += 8;
 910             if (!nocase) reg_addrange(preg, 'a', 'z');
 911             reg_addrange(preg, 'A', 'Z');
 912             continue;
 913           }
 914           if (strncmp(pattern, ":alnum:]", 8) == 0) {
 915             pattern += 8;
 916             if (!nocase) reg_addrange(preg, 'a', 'z');
 917             reg_addrange(preg, 'A', 'Z');
 918             reg_addrange(preg, '0', '9');
 919             continue;
 920           }
 921           if (strncmp(pattern, ":space:]", 8) == 0) {
 922             pattern += 8;
 923             //reg_addrange_str(preg, " \t\r\n\f\v");
 924             reg_addrange(preg, 9, 13);
 925             reg_addrange(preg, 32, 32);
 926             continue;
 927           }
 928           if (strncmp(pattern, ":blank:]", 8) == 0) {
 929             pattern += 8;
 930             //reg_addrange_str(preg, " \t\f\v");
 931             reg_addrange(preg, 9, 9);
 932             reg_addrange(preg, 11, 12);
 933             reg_addrange(preg, 32, 32);
 934             continue;
 935           }
 936           if (strncmp(pattern, ":digit:]", 8) == 0) {
 937             pattern += 8;
 938             reg_addrange(preg, '0', '9');
 939             continue;
 940           }
 941           if (strncmp(pattern, ":lower:]", 8) == 0) {
 942             pattern += 8;
 943             if (nocase) reg_addrange(preg, 'A', 'Z'); else reg_addrange(preg, 'a', 'z');
 944             continue;
 945           }
 946           if (strncmp(pattern, ":upper:]", 8) == 0) {
 947             pattern += 8;
 948             reg_addrange(preg, 'A', 'Z');
 949             continue;
 950           }
 951           if (strncmp(pattern, ":xdigit:]", 9) == 0) {
 952             pattern += 9;
 953             reg_addrange(preg, '0', '9');
 954             if (!nocase) reg_addrange(preg, 'a', 'f');
 955             reg_addrange(preg, 'A', 'F');
 956             continue;
 957           }
 958         }
 959         /* not a range, so just add the char */
 960         reg_addrange(preg, start, start);
 961       }
 962       regc(preg, 0);
 963       if (*pattern) ++pattern;
 964       preg->regparse = pattern;
 965       *flagp |= HASWIDTH|SIMPLE;
 966       } break;
 967     case '(':
 968       // only for exteded mode
 969       if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 970         // non-capturing bracket
 971         preg->regparse += 2;
 972         ret = reg(preg, 2, &flags);
 973         goto bracecont;
 974       }
 975 bracestart:
 976       ret = reg(preg, 1, &flags);
 977 bracecont:
 978       if (ret == 0) return 0;
 979       *flagp |= flags&(HASWIDTH|SPSTART);
 980       break;
 981     case '\0': preg->err = HSRX_ERR_INTERNAL; return 0; /* supposed to be caught earlier */
 982     case ')':
 983 #if OBEY_POSIX_BUGS
 984       forced = 1;
 985       goto de_fault;
 986 #endif
 987     case '|':
 988       if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
 989       forced = 1;
 990       goto de_fault;
 991     case '?':
 992     case '+':
 993     case '{':
 994     case '*':
 995       if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
 996       forced = 1;
 997       goto de_fault;
 998     case '\\':
 999       switch (*preg->regparse++) {
1000         case '\0': preg->err = HSRX_ERR_TRAILING_BACKSLASH; return 0;
1001         case '<': case 'm': ret = regnode(preg, WORDA); break;
1002         case '>': case 'M': ret = regnode(preg, WORDZ); break;
1003         case 'd': case 'D':
1004           ret = regnode(preg, preg->regparse[-1]=='d'?ANYOF:ANYBUT);
1005           reg_addrange(preg, '0', '9');
1006           regc(preg, 0);
1007           *flagp |= HASWIDTH|SIMPLE;
1008           break;
1009         case 'b': ret = regnode(preg, WBRK); break;
1010         case 'B': ret = regnode(preg, WNBRK); break;
1011         case 'w': case 'W':
1012           ret = regnode(preg, preg->regparse[-1]=='w'?ANYOF:ANYBUT);
1013           if (!(preg->cflags&HSRX_ICASE)) reg_addrange(preg, 'a', 'z');
1014           reg_addrange(preg, 'A', 'Z');
1015           reg_addrange(preg, '0', '9');
1016           reg_addrange(preg, '_', '_');
1017           regc(preg, 0);
1018           *flagp |= HASWIDTH|SIMPLE;
1019           break;
1020         case 's': case 'S':
1021           ret = regnode(preg, preg->regparse[-1]=='s'?ANYOF:ANYBUT);
1022           //reg_addrange_str(preg," \t\r\n\f\v");
1023           reg_addrange(preg, 9, 13);
1024           reg_addrange(preg, 32, 32);
1025           regc(preg, 0);
1026           *flagp |= HASWIDTH|SIMPLE;
1027           break;
1028         case '(':
1029           if (!(preg->cflags&HSRX_EXTENDED)) goto bracestart;
1030           goto escnotspecial;
1031         case '{':
1032           if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
1033           goto escnotspecial;
1034         case ')':
1035         case '}':
1036           if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
1037           goto escnotspecial;
1038         case '0' ... '9':
1039           n = 0; ch = 0; --(preg->regparse);
1040           while (isdigit(preg->regparse[0])) {
1041             n = (n*10)+preg->regparse[0]-'0';
1042             if (n > HSRX_MAX_PAREN) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1043             ++(preg->regparse);
1044             ++ch;
1045           }
1046           if (ch > 1) {
1047             /* eat possible ';' */
1048             if (preg->regparse[0] == ';') { ++(preg->regparse); ++ch; }
1049           }
1050           if (n < 1 || n > preg->re_nsub) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1051           ret = regnode(preg, BACKREF);
1052           regc(preg, n);
1053           break;
1054         default:
1055           /* handle general quoted chars in exact-match routine */
1056           /* back up to include the backslash */
1057 escnotspecial:
1058           --preg->regparse;
1059           forced = 1;
1060           goto de_fault;
1061       }
1062       break;
1063 de_fault:
1064     default: {
1065       /* encode a string of characters to be matched exactly */
1066       int added = 0;
1067       /* back up to pick up the first char of interest */
1068       preg->regparse -= n;
1069       ret = regnode(preg, EXACTLY);
1070       /* note that a META operator such as ? or * consumes the preceding char */
1071       /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
1072       /* until end of string or a META char is reached */
1073       while (*preg->regparse && (forced || strchr(META, *preg->regparse) == NULL)) {
1074         forced = 0;
1075         n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, preg->cflags&HSRX_ICASE);
1076         if (ch == '\\' && preg->regparse[n]) {
1077           /* non-trailing backslash: is this a special escape, or a regular escape? */
1078           if (strchr("<>mMwds", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1079           if (!(preg->cflags&HSRX_EXTENDED) && strchr("(){}", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1080           /* decode it; note that we add the length for the escape
1081            * sequence to the length for the backlash so we can skip
1082            * the entire sequence, or not as required
1083            */
1084           n += reg_decode_escape(preg->regparse+n, &ch);
1085           if (ch == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
1086         }
1087         /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
1088         if (((preg->cflags&HSRX_EXTENDED) && ISMULT(preg->regparse[n])) ||
1089             (!(preg->cflags&HSRX_EXTENDED) && (preg->regparse[n] == '*' || (preg->regparse[n] == '\\' && preg->regparse[n+1] == '{')))) {
1090           /* yes, but do we already have some EXACTLY chars? */
1091           if (added) break; /* yes, so return what we have and pick up the current char next time around */
1092           /* no, so add this single char and finish */
1093           regc(preg, ch);
1094           ++added;
1095           preg->regparse += n;
1096           break;
1097         }
1098         /* no, so just add this char normally */
1099         regc(preg, ch);
1100         ++added;
1101         preg->regparse += n;
1102       }
1103       regc(preg, 0);
1104       *flagp |= HASWIDTH;
1105       if (added == 1) *flagp |= SIMPLE;
1106       break;
1107     }
1108     break;
1109   }
1110   return ret;
1111 }
1112
1113
1114 static void reg_grow (HSRegExp *preg, int n) {
1115   if (preg->p+n >= preg->proglen) {
1116     preg->proglen = (preg->p+n)*2;
1117     preg->program = realloc(preg->program, preg->proglen*sizeof(int));
1118   }
1119 }
1120
1121
1122 /*
1123  - regnode - emit a node
1124  */
1125 /* location */
1126 static int regnode (HSRegExp *preg, int op) {
1127   reg_grow(preg, 2);
1128   preg->program[preg->p++] = op;
1129   preg->program[preg->p++] = 0;
1130   /* return the start of the node */
1131   return preg->p-2;
1132 }
1133
1134
1135 /*
1136  - regc - emit (if appropriate) a byte of code
1137  */
1138 static void regc (HSRegExp *preg, int b) {
1139   reg_grow(preg, 1);
1140   preg->program[preg->p++] = b;
1141 }
1142
1143
1144 /*
1145  - reginsert - insert an operator in front of already-emitted operand
1146  *
1147  * Means relocating the operand.
1148  * Returns the new location of the original operand.
1149  */
1150 static int reginsert (HSRegExp *preg, int op, int size, int opnd) {
1151   reg_grow(preg, size);
1152   /* move everything from opnd up */
1153   memmove(preg->program+opnd+size, preg->program+opnd, sizeof(int)*(preg->p-opnd));
1154   /* zero out the new space */
1155   memset(preg->program+opnd, 0, sizeof(int)*size);
1156   preg->program[opnd] = op;
1157   preg->p += size;
1158   return opnd+size;
1159 }
1160
1161
1162 /*
1163  - regtail - set the next-pointer at the end of a node chain
1164  */
1165 static void regtail_ (HSRegExp *preg, int p, int val, int line) {
1166   int scan, temp, offset;
1167   /* find last node */
1168   scan = p;
1169   for (;;) {
1170     temp = regnext(preg, scan);
1171     if (temp == 0) break;
1172     scan = temp;
1173   }
1174   offset = OP(preg, scan)==BACK?scan-val:val-scan;
1175   preg->program[scan+1] = offset;
1176 }
1177
1178
1179 /*
1180  - regoptail - regtail on operand of first argument; nop if operandless
1181  */
1182 static void regoptail (HSRegExp *preg, int p, int val) {
1183   /* "operandless" and "op != BRANCH" are synonymous in practice */
1184   if (p != 0 && OP(preg, p) == BRANCH) regtail(preg, OPERAND(p), val);
1185 }
1186
1187
1188 /*
1189  * hsrxExec and friends
1190  */
1191
1192 /*
1193  * forwards
1194  */
1195 static int regtry (HSRegExp *preg, const char *string);
1196 static int regmatch (HSRegExp *preg, int prog);
1197 static int regrepeat (HSRegExp *preg, int p, int max);
1198
1199
1200 /*
1201  - hsrxExec - match a regexp against a string
1202  */
1203 int hsrxExec (HSRegExp *preg, const char *string, size_t nmatch, HSRxMatch pmatch[], int eflags) {
1204   const char *s;
1205   int scan;
1206   //int parens[HSRX_MAX_PAREN+1];
1207   /* be paranoid... */
1208   if (preg == NULL || preg->program == NULL || string == NULL) return HSRX_ERR_NULL_ARGUMENT;
1209   /* check validity of program */
1210   if (*preg->program != HSRX_MAGIC) return HSRX_ERR_CORRUPTED;
1211 #ifdef HSRX_DEBUG
1212   fprintf(stderr, "hsrxExec: %s\n", string);
1213   hsrxDump(preg);
1214 #endif
1215   preg->eflags = eflags;
1216   preg->pmatch = pmatch;
1217   preg->nmatch = nmatch;
1218   preg->start = string; /* all offsets are computed from here */
1219   /* must clear out the embedded repeat counts */
1220   for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1221     switch (OP(preg, scan)) {
1222       case REP: case REPMIN: case REPX: case REPXMIN: preg->program[scan+4] = 0; break;
1223     }
1224   }
1225   /* clear matches for backrefing */
1226   for (scan = 0; scan <= preg->re_nsub; ++scan) preg->bmatch[scan].rm_so = preg->bmatch[scan].rm_eo = -1;
1227   /* if there is a "must appear" string, look for it */
1228   if (preg->regmust != 0) {
1229     s = string;
1230     while ((s = str_find(preg, s, preg->program[preg->regmust])) != NULL) {
1231       if (prefix_cmp(preg, preg->program+preg->regmust, preg->regmlen, s) >= 0) {
1232         break;
1233       }
1234       ++s;
1235     }
1236     if (s == NULL) return HSRX_NOMATCH; /* not present */
1237   }
1238   /* mark beginning of line for ^ */
1239   preg->regbol = string;
1240   /* simplest case: anchored match need be tried only once (maybe per line) */
1241   if (preg->reganch) {
1242     if (!(preg->cflags&HSRX_NEWLINE) && (eflags&HSRX_NOTBOL)) {
1243       /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1244       goto nextline;
1245     }
1246     for (;;) {
1247       int ret = regtry(preg, string);
1248       if (ret) return HSRX_NOERROR;
1249       if (*string) {
1250 nextline:
1251         if (preg->cflags&HSRX_NEWLINE) {
1252           /* try the next anchor? */
1253           string = strchr(string, '\n');
1254           if (string == NULL) { preg->regbol = ++string; continue; }
1255         }
1256       }
1257       return HSRX_NOMATCH;
1258     }
1259   }
1260   /* messy cases: unanchored match */
1261   s = string;
1262   if (preg->regstart != '\0') {
1263     /* we know what char it must start with */
1264     while ((s = str_find(preg, s, preg->regstart)) != NULL) {
1265       if (regtry(preg, s)) return HSRX_NOERROR;
1266       ++s;
1267     }
1268   } else {
1269     /* we don't -- general case */
1270     while (*string) {
1271       int l;
1272       if (regtry(preg, s)) return HSRX_NOERROR;
1273       if (*s == '\0') break;
1274       l = eflags&HSRX_UTF8?utf8_charlen(*s):1;
1275       s += l;
1276     }
1277   }
1278   /* failure */
1279   return HSRX_NOMATCH;
1280 }
1281
1282
1283 /*
1284  - regtry - try match at specific point
1285  */
1286 /* 0 failure, 1 success */
1287 static int regtry (HSRegExp *preg, const char *string) {
1288   int i;
1289   //
1290   preg->reginput = string;
1291   for (i = 0; i < preg->nmatch; ++i) preg->pmatch[i].rm_so = preg->pmatch[i].rm_eo = -1;
1292   if (regmatch(preg, 1)) {
1293     if (preg->nmatch > 0 && preg->pmatch != NULL) {
1294       preg->pmatch[0].rm_so = string-preg->start;
1295       preg->pmatch[0].rm_eo = preg->reginput-preg->start;
1296     }
1297     return 1;
1298   }
1299   return 0;
1300 }
1301
1302
1303 /**
1304  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1305  *
1306  * If 'nocase' is non-zero, does a case-insensitive match.
1307  *
1308  * Returns -1 on not found.
1309  */
1310 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string) {
1311   const char *s = string;
1312   int nocase = preg->cflags&HSRX_ICASE;
1313   //
1314   while (proglen && *s) {
1315     int ch, n = reg_utf8_tounicode_caseR(preg, s, &ch, nocase);
1316     //
1317     if (ch != *prog) return -1;
1318     ++prog;
1319     s += n;
1320     --proglen;
1321   }
1322   if (proglen == 0) return s-string;
1323   return -1;
1324 }
1325
1326
1327 /**
1328  * Search for the character 'c' in the utf-8 string 'string'.
1329  *
1330  * If 'nocase' is set, the 'string' is assumed to be uppercase
1331  * and 'c' is converted to uppercase before matching.
1332  *
1333  * Returns the byte position in the string where the 'c' was found, or
1334  * NULL if not found.
1335  */
1336 static const char *str_find (HSRegExp *preg, const char *string, int c) {
1337   int nocase = preg->cflags&HSRX_ICASE;
1338   //
1339   if (nocase) c = utf8_upper(c);
1340   while (*string) {
1341     int ch, n = reg_utf8_tounicode_caseR(preg, string, &ch, nocase);
1342     //
1343     if (c == ch) return string;
1344     string += n;
1345   }
1346   return NULL;
1347 }
1348
1349
1350 /**
1351  * Searchs for 'c' in the range 'range'.
1352  *
1353  * Returns 1 if found, or 0 if not.
1354  */
1355 static int reg_range_find (HSRegExp *preg, const int *range, int c) {
1356   if (preg->cflags&HSRX_ICASE) c = utf8_upper(c);
1357   while (*range) {
1358     /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1359     if (c >= range[1] && c <= range[0]+range[1]-1) return 1;
1360     range += 2;
1361   }
1362   return 0;
1363 }
1364
1365
1366 /**
1367  * Returns true if 'ch' is an end-of-line char.
1368  *
1369  * In HSRX_NEWLINE mode, \n is considered EOL in
1370  * addition to \0
1371  */
1372 static int reg_iseol (HSRegExp *preg, int ch) {
1373   if (preg->cflags&HSRX_NEWLINE) return ch=='\0'||ch=='\n';
1374   return ch=='\0';
1375 }
1376
1377
1378 static int regmatchsimplerepeat (HSRegExp *preg, int scan, int matchmin) {
1379   int no, c;
1380   const char *save;
1381   int nextch = '\0';
1382   int max = preg->program[scan+2];
1383   int min = preg->program[scan+3];
1384   int next = regnext(preg, scan);
1385   /* lookahead to avoid useless match attempts when we know what character comes next */
1386   if (OP(preg, next) == EXACTLY) nextch = preg->program[OPERAND(next)];
1387   save = preg->reginput;
1388   no = regrepeat(preg, scan+5, max);
1389   if (no < min) return 0;
1390   if (matchmin) {
1391     /* from min up to no */
1392     max = no;
1393     no = min;
1394   }
1395   /* else from no down to min */
1396   for (;;) {
1397     if (matchmin) {
1398       if (no > max) break;
1399     } else {
1400       if (no < min) break;
1401     }
1402     preg->reginput = save+utf8_index(save, no);
1403     reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1404     /* if it could work, try it */
1405     if (reg_iseol(preg, nextch) || c == nextch) {
1406       if (regmatch(preg, next)) return 1;
1407     }
1408     if (matchmin) {
1409       /* couldn't or didn't, add one more */
1410       ++no;
1411     } else {
1412       /* couldn't or didn't, back up */
1413       --no;
1414     }
1415   }
1416   return 0;
1417 }
1418
1419
1420 static int regmatchrepeat (HSRegExp *preg, int scan, int matchmin) {
1421   int *scanpt = preg->program+scan;
1422   int max = scanpt[2];
1423   int min = scanpt[3];
1424   //
1425   /* have we reached min? */
1426   if (scanpt[4] < min) {
1427     /* no, so get another one */
1428     ++(scanpt[4]);
1429     if (regmatch(preg, scan+5)) return 1;
1430     --(scanpt[4]);
1431     return 0;
1432   }
1433   if (scanpt[4] > max) return 0;
1434   if (matchmin) {
1435     /* minimal, so try other branch first */
1436     if (regmatch(preg, regnext(preg, scan))) return 1;
1437     /* no, so try one more */
1438     ++(scanpt[4]);
1439     if (regmatch(preg, scan+5)) return 1;
1440     --(scanpt[4]);
1441     return 0;
1442   }
1443   /* maximal, so try this branch again */
1444   if (scanpt[4] < max) {
1445     ++(scanpt[4]);
1446     if (regmatch(preg, scan+5)) return 1;
1447     --(scanpt[4]);
1448   }
1449   /* at this point we are at max with no match: try the other branch */
1450   return regmatch(preg, regnext(preg, scan));
1451 }
1452
1453
1454 static int prevChar (HSRegExp *preg) {
1455   if (preg->reginput == preg->start) return 0;
1456   if (preg->eflags&HSRX_UTF8) {
1457     int c;
1458     const char *pcp = preg->reginput-utf8_prev_len(preg->reginput, preg->reginput-preg->start);
1459     reg_utf8_tounicode_caseR(preg, pcp, &c, preg->cflags&HSRX_ICASE);
1460     return c;
1461   }
1462   return (unsigned char)(preg->reginput[-1]);
1463 }
1464
1465
1466 /*TODO: unicode word breaks?*/
1467 static inline int isWordChar (int ch) {
1468   return
1469     (ch >= 'a' && ch <= 'z') ||
1470     (ch >= 'A' && ch <= 'Z') ||
1471     (ch >= '0' && ch <= '9') ||
1472     (ch == '_');
1473 }
1474
1475
1476 /*
1477  - regmatch - main matching routine
1478  *
1479  * Conceptually the strategy is simple:  check to see whether the current
1480  * node matches, call self recursively to see whether the rest matches,
1481  * and then act accordingly.  In practice we make some effort to avoid
1482  * recursion, in particular by going through "ordinary" nodes (that don't
1483  * need to know whether the rest of the match failed) by a loop instead of
1484  * by recursion.
1485  */
1486 /* 0 failure, 1 success */
1487 static int regmatch (HSRegExp *preg, int prog) {
1488   int scan; /* current node */
1489   int next; /* next node */
1490   //
1491   scan = prog;
1492 #ifdef HSRX_DEBUG
1493   if (scan != 0 && hsrxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
1494 #endif
1495   while (scan != 0) {
1496     int n, c, s;
1497 #ifdef HSRX_DEBUG
1498     if (hsrxNarrate) {
1499       //fprintf(stderr, "%s...\n", regprop(scan));
1500       fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* where, what */
1501     }
1502 #endif
1503     next = regnext(preg, scan);
1504     n = reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1505     switch (OP(preg, scan)) {
1506       case BOL:
1507         if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTBOL)) {
1508           if (preg->reginput != preg->regbol) return 0;
1509         }
1510         if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTBOL)) return 0;
1511         break;
1512       case EOL:
1513         if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTEOL)) {
1514           if (!reg_iseol(preg, c)) return 0;
1515         }
1516         if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTEOL)) return 0;
1517         break;
1518       case WORDA:
1519         /* must be looking at a letter, digit, or _ */
1520         if (!isalnum(UCHAR(c)) && c != '_') return 0;
1521         /* prev must be BOL or nonword */
1522         if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return 0;
1523         break;
1524       case WORDZ:
1525         /* can't match at BOL */
1526         if (preg->reginput > preg->regbol) {
1527           /* current must be EOL or nonword */
1528           if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1529             c = preg->reginput[-1];
1530             /* previous must be word */
1531             if (isalnum(UCHAR(c)) || c == '_') break;
1532           }
1533         }
1534         /* no */
1535         return 0;
1536       case WBRK:
1537         if (isWordChar(prevChar(preg)) == isWordChar(c)) return 0; /* failed */
1538         break;
1539       case WNBRK:
1540         if (isWordChar(prevChar(preg)) != isWordChar(c)) return 0; /* failed */
1541         break;
1542       case ANY:
1543         if (reg_iseol(preg, c)) return 0;
1544         preg->reginput += n;
1545         break;
1546       case EXACTLY: {
1547         int opnd = OPERAND(scan);
1548         int len = str_int_len(preg->program+opnd);
1549         int slen = prefix_cmp(preg, preg->program+opnd, len, preg->reginput);
1550         //
1551         if (slen < 0) return 0;
1552         preg->reginput += slen;
1553         } break;
1554       case ANYOF:
1555         if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) == 0) return 0;
1556         preg->reginput += n;
1557         break;
1558       case ANYBUT:
1559         if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) != 0) return 0;
1560         preg->reginput += n;
1561         break;
1562       case NOTHING: break;
1563       case BACK: break;
1564       case BRANCH:
1565         if (OP(preg, next) != BRANCH) {
1566           /* no choice */
1567           next = OPERAND(scan); /* avoid recursion */
1568         } else {
1569           const char *save;
1570           //
1571           do {
1572             save = preg->reginput;
1573             if (regmatch(preg, OPERAND(scan))) return 1;
1574             preg->reginput = save;
1575             scan = regnext(preg, scan);
1576           } while (scan != 0 && OP(preg, scan) == BRANCH);
1577           return 0;
1578           /* NOTREACHED */
1579         }
1580         break;
1581       case REP:
1582       case REPMIN:
1583         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1584       case REPX:
1585       case REPXMIN:
1586         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1587       case END:
1588         return 1; /* success! */
1589       case BACKREF: /* backreference */
1590         c = preg->program[OPERAND(scan)];
1591 #ifdef HSRX_DEBUG
1592         printf("BACKREF: %d (%d)\n", c, preg->re_nsub);
1593 #endif
1594         if (c < 1 || c > preg->re_nsub) return 0; /* invalid backref can't be matched */
1595         s = preg->bmatch[c].rm_so;
1596 #ifdef HSRX_DEBUG
1597         printf("  %d;%d\n", s, preg->bmatch[c].rm_eo);
1598 #endif
1599         if (s < 0 || preg->bmatch[c].rm_eo < s) return 0; /* capture not found yet */
1600         /* check string */
1601         n = preg->bmatch[c].rm_eo-s; /* capture length */
1602         /* dumb compare */
1603         for (c = 0; c < n; ++c) {
1604 #ifdef HSRX_DEBUG
1605           printf(" [%c] [%c]\n", preg->reginput[c], preg->start[s+c]);
1606 #endif
1607           if (!preg->reginput[c] || preg->reginput[c] != preg->start[s+c]) {
1608 #ifdef HSRX_DEBUG
1609             printf("  FAIL!\n");
1610 #endif
1611             return 0; /* alas */
1612           }
1613         }
1614 #ifdef HSRX_DEBUG
1615         printf("  HIT!\n");
1616 #endif
1617         /* move position and return success */
1618         preg->reginput += n;
1619         break;
1620       default:
1621         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) <= /*CLOSEEND*/NCCLOSE) {
1622           const char *save = preg->reginput;
1623           //int saveso = 0, saveeo = 0;
1624           //
1625           if (OP(preg, scan) < CLOSEEND) {
1626             if (OP(preg, scan) < CLOSE) {
1627               c = OP(preg, scan)-OPEN;
1628               preg->bmatch[c].rm_so = save-preg->start;
1629             } else {
1630               c = OP(preg, scan)-CLOSE;
1631               preg->bmatch[c].rm_eo = save-preg->start;
1632             }
1633           }
1634           //
1635           if (regmatch(preg, next)) {
1636             /* don't set startp if some later invocation of the same parentheses already has */
1637             if (OP(preg, scan) < CLOSEEND) {
1638               /* capturing parens */
1639               if (OP(preg, scan) < CLOSE) {
1640                 /* open */
1641                 c = OP(preg, scan)-OPEN;
1642                 if (c < preg->nmatch && preg->pmatch[c].rm_so == -1) preg->pmatch[c].rm_so = save-preg->start;
1643               } else {
1644                 /* close */
1645                 c = OP(preg, scan)-CLOSE;
1646                 if (c < preg->nmatch && preg->pmatch[c].rm_eo == -1) preg->pmatch[c].rm_eo = save-preg->start;
1647               }
1648             }
1649             return 1;
1650           }
1651           //
1652           if (OP(preg, scan) < CLOSEEND) {
1653             if (OP(preg, scan) < CLOSE) {
1654               c = OP(preg, scan)-OPEN;
1655               preg->bmatch[c].rm_so = -1;
1656             } else {
1657               c = OP(preg, scan)-CLOSE;
1658               preg->bmatch[c].rm_eo = -1;
1659             }
1660           }
1661           //
1662           return 0;
1663         }
1664         return HSRX_ERR_INTERNAL;
1665     }
1666     scan = next;
1667   }
1668   /* we get here only if there's trouble -- normally "case END" is the terminating point */
1669   return HSRX_ERR_INTERNAL;
1670 }
1671
1672
1673 /*
1674  - regrepeat - repeatedly match something simple, report how many
1675  */
1676 static int regrepeat (HSRegExp *preg, int p, int max) {
1677   const char *scan;
1678   int opnd, ch, n;
1679   int count = 0;
1680   //
1681   scan = preg->reginput;
1682   opnd = OPERAND(p);
1683   switch (OP(preg, p)) {
1684     case ANY:
1685       /* no need to handle utf8 specially here */
1686       while (!reg_iseol(preg, *scan) && count < max) { ++count; ++scan; }
1687       break;
1688     case EXACTLY:
1689       while (count < max) {
1690         n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1691         if (preg->program[opnd] != ch) break;
1692         ++count;
1693         scan += n;
1694       }
1695       break;
1696     case ANYOF:
1697       while (count < max) {
1698         n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1699         if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) == 0) break;
1700         ++count;
1701         scan += n;
1702       }
1703       break;
1704     case ANYBUT:
1705       while (count < max) {
1706         n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1707         if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) != 0) break;
1708         ++count;
1709         scan += n;
1710       }
1711       break;
1712     default: /* ih dear! called inappropriately */
1713       preg->err = HSRX_ERR_INTERNAL;
1714       count = 0; /* best compromise */
1715       break;
1716   }
1717   preg->reginput = scan;
1718   return count;
1719 }
1720
1721
1722 /*
1723  - regnext - dig the "next" pointer out of a node
1724  */
1725 static int regnext (HSRegExp *preg, int p) {
1726   int offset = NEXT(preg, p);
1727   //
1728   if (offset == 0) return 0;
1729   if (OP(preg, p) == BACK) return p-offset;
1730   return p+offset;
1731 }
1732
1733
1734 #ifdef HSRX_DEBUG
1735 /*
1736  - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1737  */
1738 static void dumpChar (int ch) {
1739   if (ch > 255) {
1740     printf("{0x%x}", (unsigned int)ch);
1741   } else if (ch < 32 || ch >= 127) {
1742     printf("{%d}", ch);
1743   } else {
1744     putchar(ch);
1745   }
1746 }
1747
1748
1749 void hsrxDump (HSRegExp *preg) {
1750   int s, next, i;
1751   char buf[4];
1752   int op = EXACTLY; /* arbitrary non-END op */
1753   //
1754   for (i = 1; i < preg->p; ++i) {
1755     printf("%02x ", preg->program[i]);
1756     if (i%16 == 15) printf("\n");
1757   }
1758   printf("\n");
1759   //
1760   s = 1;
1761   /* while that wasn't END last time... */
1762   while (op != END && s < preg->p) {
1763     op = OP(preg, s);
1764     printf("%3d: %-12s", s, regprop(op)); /* where, what */
1765     next = regnext(preg, s);
1766     if (next == 0) {
1767       /* next ptr */
1768       printf("(0)");
1769     } else {
1770       printf("(%d)", next);
1771     }
1772     s += 2;
1773     if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1774       int max = preg->program[s];
1775       int min = preg->program[s+1];
1776       //
1777       if (max == 65535) printf(" {%d,*}", min); else printf(" {%d,%d}", min, max);
1778       printf(" %d", preg->program[s+2]);
1779       s += 3;
1780     } else if (op == ANYOF || op == ANYBUT) {
1781       /* set of ranges */
1782       printf(" r: [");
1783       while (preg->program[s]) {
1784         int len = preg->program[s++];
1785         int first = preg->program[s++];
1786         /*
1787         buf[utf8_fromunicode(buf, first)] = 0;
1788         printf("%s", buf);
1789         if (len > 1) {
1790           buf[utf8_fromunicode(buf, first+len-1)] = 0;
1791           printf("-%s", buf);
1792         }
1793         */
1794         dumpChar(first);
1795         printf("-");
1796         if (len > 1) {
1797           dumpChar(first+len-1);
1798         } else {
1799           dumpChar(first);
1800         }
1801       }
1802       printf("]");
1803       ++s;
1804     } else if (op == EXACTLY) {
1805       /* literal string, where present */
1806       printf(" l: '");
1807       while (preg->program[s]) {
1808         /*
1809         buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1810         printf("%s", buf);
1811         */
1812         dumpChar(preg->program[s]);
1813         ++s;
1814       }
1815       printf("'");
1816       ++s;
1817     } else if (op == BACKREF) {
1818       printf(" refno: %d", preg->program[s++]);
1819     }
1820     putchar('\n');
1821   }
1822   if (op == END) {
1823     /* header fields of interest */
1824     if (preg->regstart) {
1825       buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1826       printf("start '%s' ", buf);
1827     }
1828     if (preg->reganch) printf("anchored ");
1829     if (preg->regmust != 0) {
1830       int i;
1831       //
1832       printf("must have:");
1833       for (i = 0; i < preg->regmlen; ++i) putchar(preg->program[preg->regmust+i]);
1834       putchar('\n');
1835     }
1836   }
1837   printf("\n");
1838 }
1839
1840
1841 /*
1842  - regprop - printable representation of opcode
1843  */
1844 static const char *regprop (int op) {
1845   static char buf[50];
1846   //
1847   switch (op) {
1848     case BOL: return "BOL";
1849     case EOL: return "EOL";
1850     case ANY: return "ANY";
1851     case ANYOF: return "ANYOF";
1852     case ANYBUT: return "ANYBUT";
1853     case BRANCH: return "BRANCH";
1854     case EXACTLY: return "EXACTLY";
1855     case NOTHING: return "NOTHING";
1856     case BACK: return "BACK";
1857     case END: return "END";
1858     case REP: return "REP";
1859     case REPMIN: return "REPMIN";
1860     case REPX: return "REPX";
1861     case REPXMIN: return "REPXMIN";
1862     case WORDA: return "WORDA";
1863     case WORDZ: return "WORDZ";
1864     case BACKREF: return "BACKREF";
1865     case NCOPEN: return "NCOPEN";
1866     case NCCLOSE: return "NCCLOSE";
1867     default:
1868       if (op >= OPEN && op < CLOSE) snprintf(buf, sizeof(buf), "OPEN:%d", op-OPEN);
1869       else if (op >= CLOSE && op < CLOSEEND) snprintf(buf, sizeof(buf), "CLOSE:%d", op-CLOSE);
1870       else snprintf(buf, sizeof(buf), "?%d?\n", op);
1871       return buf;
1872   }
1873 }
1874 #endif
1875
1876
1877 size_t hsrxError (int errcode, const HSRegExp *preg, char *errbuf,  size_t errbuf_size) {
1878   static const char *error_strings[] = {
1879     "success",
1880     "no match",
1881     "bad pattern",
1882     "null argument",
1883     "unknown error",
1884     "too big",
1885     "out of memory",
1886     "too many ()",
1887     "parentheses () not balanced",
1888     "braces {} not balanced",
1889     "invalid repetition count(s)",
1890     "extra characters",
1891     "*+ of empty atom",
1892     "nested count",
1893     "internal error",
1894     "count follows nothing",
1895     "trailing backslash",
1896     "corrupted program",
1897     "contains null char",
1898     "invalid backreference number",
1899   };
1900   const char *err;
1901   //
1902   if (errcode < 0 || errcode >= HSRX_ERR_NUM) {
1903     err = "Bad error code";
1904   } else {
1905     err = error_strings[errcode];
1906   }
1907   return snprintf(errbuf, errbuf_size, "%s", err);
1908 }
1909
1910
1911 void hsrxFree (HSRegExp *preg) {
1912   if (preg != NULL) {
1913     if (preg->bmatch != NULL) free(preg->bmatch); preg->bmatch = NULL;
1914     if (preg->program != NULL) free(preg->program); preg->program = NULL;
1915   }
1916 }