jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101 #define BOLX    14      /* no   Match "" at beginning of input. */
 102 #define EOLX    15      /* no   Match "" at end of input. */
 103 #define WORDA   16      /* no   Match "" at wordchar, where prev is nonword */
 104 #define WORDZ   17      /* no   Match "" at nonwordchar, where prev is word */
 105
 106 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 107 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 108                         /*      OPEN+1 is number 1, etc. */
 109
 110 /* must not be any other opts between OPEN and CLOSE */
 111
 112 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 113 #define CLOSE   2001    /* no   Analogous to OPEN. */
 114 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 115
 116 /*
 117  * The first word of the regexp internal "program" is actually this magic
 118  * number; the start node begins in the second word.
 119  */
 120 #define REG_MAGIC       0xFADED00D
 121
 122 /*
 123  * Opcode notes:
 124  *
 125  * BRANCH       The set of branches constituting a single choice are hooked
 126  *              together with their "next" pointers, since precedence prevents
 127  *              anything being concatenated to any individual branch.  The
 128  *              "next" pointer of the last BRANCH in a choice points to the
 129  *              thing following the whole choice.  This is also where the
 130  *              final "next" pointer of each individual branch points; each
 131  *              branch starts with the operand node of a BRANCH node.
 132  *
 133  * BACK         Normal "next" pointers all implicitly point forward; BACK
 134  *              exists to make loop structures possible.
 135  *
 136  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 137  *              as either simple repeats (REP) or complex repeats (REPX).
 138  *              These opcodes include a "min" and "max" count after the opcode.
 139  *              This is followed by a fourth "current count" word that is
 140  *              only used by REPX, as it implements a recursive match.
 141  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 142  *
 143  * OPEN,CLOSE   ...are numbered at compile time.
 144  */
 145
 146 /*
 147  * A node is one word of opcode followed by one word of "next" pointer.
 148  * The "next" pointer value is a positive offset from the opcode of the node
 149  * containing it.
 150  * An operand, if any, simply follows the node.  (Note that much of the
 151  * code generation knows about this implicit relationship.)
 152  */
 153 #define OP(preg, p)     (preg->program[p])
 154 #define NEXT(preg, p)   (preg->program[p + 1])
 155 #define OPERAND(p)      ((p) + 2)
 156
 157 /*
 158  * See regmagic.h for one further detail of program structure.
 159  */
 160
 161
 162 /*
 163  * Utility definitions.
 164  */
 165
 166 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 167 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 168 #define META            "^$.[()|?{+*"
 169
 170 /*
 171  * Flags to be passed up and down.
 172  */
 173 #define HASWIDTH        1       /* Known never to match null string. */
 174 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 175 #define SPSTART         4       /* Starts with * or +. */
 176 #define WORST           0       /* Worst case. */
 177
 178 #define MAX_REP_COUNT 1000000
 179
 180 /*
 181  * Forward declarations for regcomp()'s friends.
 182  */
 183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 184 static int regpiece(regex_t *preg, int *flagp );
 185 static int regbranch(regex_t *preg, int *flagp );
 186 static int regatom(regex_t *preg, int *flagp );
 187 static int regnode(regex_t *preg, int op );
 188 static int regnext(regex_t *preg, int p );
 189 static void regc(regex_t *preg, int b );
 190 static int reginsert(regex_t *preg, int op, int size, int opnd );
 191 static void regtail(regex_t *preg, int p, int val);
 192 static void regoptail(regex_t *preg, int p, int val );
 193 static int regopsize(regex_t *preg, int p );
 194
 195 static int reg_range_find(const int *string, int c);
 196 static const char *str_find(const char *string, int c, int nocase);
 197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 198
 199 /*#define DEBUG*/
 200 #ifdef DEBUG
 201 static int regnarrate = 0;
 202 static void regdump(regex_t *preg);
 203 static const char *regprop( int op );
 204 #endif
 205
 206
 207 /**
 208  * Returns the length of the null-terminated integer sequence.
 209  */
 210 static int str_int_len(const int *seq)
 211 {
 212         int n = 0;
 213         while (*seq++) {
 214                 n++;
 215         }
 216         return n;
 217 }
 218
 219 /*
 220  - regcomp - compile a regular expression into internal code
 221  *
 222  * We can't allocate space until we know how big the compiled form will be,
 223  * but we can't compile it (and thus know how big it is) until we've got a
 224  * place to put the code.  So we cheat:  we compile it twice, once with code
 225  * generation turned off and size counting turned on, and once "for real".
 226  * This also means that we don't allocate space until we are sure that the
 227  * thing really will compile successfully, and we never have to move the
 228  * code and thus invalidate pointers into it.  (Note that it has to be in
 229  * one piece because free() must be able to free it all.)
 230  *
 231  * Beware that the optimization-preparation code in here knows about some
 232  * of the structure of the compiled regexp.
 233  */
 234 int regcomp(regex_t *preg, const char *exp, int cflags)
 235 {
 236         int scan;
 237         int longest;
 238         unsigned len;
 239         int flags;
 240
 241 #ifdef DEBUG
 242         fprintf(stderr, "Compiling: '%s'\n", exp);
 243 #endif
 244         memset(preg, 0, sizeof(*preg));
 245
 246         if (exp == NULL)
 247                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 248
 249         /* First pass: determine size, legality. */
 250         preg->cflags = cflags;
 251         preg->regparse = exp;
 252
 253         /* Allocate space. */
 254         preg->proglen = (strlen(exp) + 1) * 5;
 255         preg->program = malloc(preg->proglen * sizeof(int));
 256         if (preg->program == NULL)
 257                 FAIL(preg, REG_ERR_NOMEM);
 258
 259         /* Note that since we store a magic value as the first item in the program,
 260          * program offsets will never be 0
 261          */
 262         regc(preg, REG_MAGIC);
 263         if (reg(preg, 0, &flags) == 0) {
 264                 return preg->err;
 265         }
 266
 267         /* Small enough for pointer-storage convention? */
 268         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 269                 FAIL(preg,REG_ERR_TOO_BIG);
 270
 271         /* Dig out information for optimizations. */
 272         preg->regstart = 0;     /* Worst-case defaults. */
 273         preg->reganch = 0;
 274         preg->regmust = 0;
 275         preg->regmlen = 0;
 276         scan = 1;                       /* First BRANCH. */
 277         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 278                 scan = OPERAND(scan);
 279
 280                 /* Starting-point info. */
 281                 if (OP(preg, scan) == EXACTLY) {
 282                         preg->regstart = preg->program[OPERAND(scan)];
 283                 }
 284                 else if (OP(preg, scan) == BOL)
 285                         preg->reganch++;
 286
 287                 /*
 288                  * If there's something expensive in the r.e., find the
 289                  * longest literal string that must appear and make it the
 290                  * regmust.  Resolve ties in favor of later strings, since
 291                  * the regstart check works with the beginning of the r.e.
 292                  * and avoiding duplication strengthens checking.  Not a
 293                  * strong reason, but sufficient in the absence of others.
 294                  */
 295                 if (flags&SPSTART) {
 296                         longest = 0;
 297                         len = 0;
 298                         for (; scan != 0; scan = regnext(preg, scan)) {
 299                                 if (OP(preg, scan) == EXACTLY) {
 300                                         int plen = str_int_len(preg->program + OPERAND(scan));
 301                                         if (plen >= len) {
 302                                                 longest = OPERAND(scan);
 303                                                 len = plen;
 304                                         }
 305                                 }
 306                         }
 307                         preg->regmust = longest;
 308                         preg->regmlen = len;
 309                 }
 310         }
 311
 312 #ifdef DEBUG
 313         regdump(preg);
 314 #endif
 315
 316         return 0;
 317 }
 318
 319 /*
 320  - reg - regular expression, i.e. main body or parenthesized thing
 321  *
 322  * Caller must absorb opening parenthesis.
 323  *
 324  * Combining parenthesis handling with the base level of regular expression
 325  * is a trifle forced, but the need to tie the tails of the branches to what
 326  * follows makes it hard to avoid.
 327  */
 328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 329 {
 330         int ret;
 331         int br;
 332         int ender;
 333         int parno = 0;
 334         int flags;
 335
 336         *flagp = HASWIDTH;      /* Tentatively. */
 337
 338         /* Make an OPEN node, if parenthesized. */
 339         if (paren) {
 340                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 341                         /* non-capturing paren */
 342                         preg->regparse += 2;
 343                         parno = -1;
 344                 }
 345                 else {
 346                         parno = ++preg->re_nsub;
 347                 }
 348                 ret = regnode(preg, OPEN+parno);
 349         } else
 350                 ret = 0;
 351
 352         /* Pick up the branches, linking them together. */
 353         br = regbranch(preg, &flags);
 354         if (br == 0)
 355                 return 0;
 356         if (ret != 0)
 357                 regtail(preg, ret, br); /* OPEN -> first. */
 358         else
 359                 ret = br;
 360         if (!(flags&HASWIDTH))
 361                 *flagp &= ~HASWIDTH;
 362         *flagp |= flags&SPSTART;
 363         while (*preg->regparse == '|') {
 364                 preg->regparse++;
 365                 br = regbranch(preg, &flags);
 366                 if (br == 0)
 367                         return 0;
 368                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 369                 if (!(flags&HASWIDTH))
 370                         *flagp &= ~HASWIDTH;
 371                 *flagp |= flags&SPSTART;
 372         }
 373
 374         /* Make a closing node, and hook it on the end. */
 375         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 376         regtail(preg, ret, ender);
 377
 378         /* Hook the tails of the branches to the closing node. */
 379         for (br = ret; br != 0; br = regnext(preg, br))
 380                 regoptail(preg, br, ender);
 381
 382         /* Check for proper termination. */
 383         if (paren && *preg->regparse++ != ')') {
 384                 preg->err = REG_ERR_UNMATCHED_PAREN;
 385                 return 0;
 386         } else if (!paren && *preg->regparse != '\0') {
 387                 if (*preg->regparse == ')') {
 388                         preg->err = REG_ERR_UNMATCHED_PAREN;
 389                         return 0;
 390                 } else {
 391                         preg->err = REG_ERR_JUNK_ON_END;
 392                         return 0;
 393                 }
 394         }
 395
 396         return(ret);
 397 }
 398
 399 /*
 400  - regbranch - one alternative of an | operator
 401  *
 402  * Implements the concatenation operator.
 403  */
 404 static int regbranch(regex_t *preg, int *flagp )
 405 {
 406         int ret;
 407         int chain;
 408         int latest;
 409         int flags;
 410
 411         *flagp = WORST;         /* Tentatively. */
 412
 413         ret = regnode(preg, BRANCH);
 414         chain = 0;
 415         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 416                *preg->regparse != '|') {
 417                 latest = regpiece(preg, &flags);
 418                 if (latest == 0)
 419                         return 0;
 420                 *flagp |= flags&HASWIDTH;
 421                 if (chain == 0) {/* First piece. */
 422                         *flagp |= flags&SPSTART;
 423                 }
 424                 else {
 425                         regtail(preg, chain, latest);
 426                 }
 427                 chain = latest;
 428         }
 429         if (chain == 0) /* Loop ran zero times. */
 430                 (void) regnode(preg, NOTHING);
 431
 432         return(ret);
 433 }
 434
 435 /*
 436  - regpiece - something followed by possible [*+?]
 437  *
 438  * Note that the branching code sequences used for ? and the general cases
 439  * of * and + are somewhat optimized:  they use the same NOTHING node as
 440  * both the endmarker for their branch list and the body of the last branch.
 441  * It might seem that this node could be dispensed with entirely, but the
 442  * endmarker role is not redundant.
 443  */
 444 static int regpiece(regex_t *preg, int *flagp)
 445 {
 446         int ret;
 447         char op;
 448         int next;
 449         int flags;
 450         int min;
 451         int max;
 452
 453         ret = regatom(preg, &flags);
 454         if (ret == 0)
 455                 return 0;
 456
 457         op = *preg->regparse;
 458         if (!ISMULT(op)) {
 459                 *flagp = flags;
 460                 return(ret);
 461         }
 462
 463         if (!(flags&HASWIDTH) && op != '?') {
 464                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 465                 return 0;
 466         }
 467
 468         /* Handle braces (counted repetition) by expansion */
 469         if (op == '{') {
 470                 char *end;
 471
 472                 min = strtoul(preg->regparse + 1, &end, 10);
 473                 if (end == preg->regparse + 1) {
 474                         preg->err = REG_ERR_BAD_COUNT;
 475                         return 0;
 476                 }
 477                 if (*end == '}') {
 478                         max = min;
 479                 }
 480                 else if (*end == '\0') {
 481                         preg->err = REG_ERR_UNMATCHED_BRACES;
 482                         return 0;
 483                 }
 484                 else {
 485                         preg->regparse = end;
 486                         max = strtoul(preg->regparse + 1, &end, 10);
 487                         if (*end != '}') {
 488                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 489                                 return 0;
 490                         }
 491                 }
 492                 if (end == preg->regparse + 1) {
 493                         max = MAX_REP_COUNT;
 494                 }
 495                 else if (max < min || max >= 100) {
 496                         preg->err = REG_ERR_BAD_COUNT;
 497                         return 0;
 498                 }
 499                 if (min >= 100) {
 500                         preg->err = REG_ERR_BAD_COUNT;
 501                         return 0;
 502                 }
 503
 504                 preg->regparse = strchr(preg->regparse, '}');
 505         }
 506         else {
 507                 min = (op == '+');
 508                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 509         }
 510
 511         if (preg->regparse[1] == '?') {
 512                 preg->regparse++;
 513                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 514         }
 515         else {
 516                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 517         }
 518         preg->program[ret + 2] = max;
 519         preg->program[ret + 3] = min;
 520         preg->program[ret + 4] = 0;
 521
 522         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 523
 524         if (!(flags & SIMPLE)) {
 525                 int back = regnode(preg, BACK);
 526                 regtail(preg, back, ret);
 527                 regtail(preg, next, back);
 528         }
 529
 530         preg->regparse++;
 531         if (ISMULT(*preg->regparse)) {
 532                 preg->err = REG_ERR_NESTED_COUNT;
 533                 return 0;
 534         }
 535
 536         return ret;
 537 }
 538
 539 /**
 540  * Add all characters in the inclusive range between lower and upper.
 541  *
 542  * Handles a swapped range (upper < lower).
 543  */
 544 static void reg_addrange(regex_t *preg, int lower, int upper)
 545 {
 546         if (lower > upper) {
 547                 reg_addrange(preg, upper, lower);
 548         }
 549         /* Add a range as length, start */
 550         regc(preg, upper - lower + 1);
 551         regc(preg, lower);
 552 }
 553
 554 /**
 555  * Add a null-terminated literal string as a set of ranges.
 556  */
 557 static void reg_addrange_str(regex_t *preg, const char *str)
 558 {
 559         while (*str) {
 560                 reg_addrange(preg, *str, *str);
 561                 str++;
 562         }
 563 }
 564
 565 /**
 566  * Extracts the next unicode char from utf8.
 567  *
 568  * If 'upper' is set, converts the char to uppercase.
 569  */
 570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 571 {
 572         int l = utf8_tounicode(s, uc);
 573         if (upper) {
 574                 *uc = utf8_upper(*uc);
 575         }
 576         return l;
 577 }
 578
 579 /**
 580  * Converts a hex digit to decimal.
 581  *
 582  * Returns -1 for an invalid hex digit.
 583  */
 584 static int hexdigitval(int c)
 585 {
 586         if (c >= '0' && c <= '9')
 587                 return c - '0';
 588         if (c >= 'a' && c <= 'f')
 589                 return c - 'a' + 10;
 590         if (c >= 'A' && c <= 'F')
 591                 return c - 'A' + 10;
 592         return -1;
 593 }
 594
 595 /**
 596  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 597  *
 598  * Returns the number of hex digits parsed.
 599  * If there are no hex digits, returns 0 and stores nothing.
 600  */
 601 static int parse_hex(const char *s, int n, int *uc)
 602 {
 603         int val = 0;
 604         int k;
 605
 606         for (k = 0; k < n; k++) {
 607                 int c = hexdigitval(*s++);
 608                 if (c == -1) {
 609                         break;
 610                 }
 611                 val = (val << 4) | c;
 612         }
 613         if (k) {
 614                 *uc = val;
 615         }
 616         return k;
 617 }
 618
 619 /**
 620  * Call for chars after a backlash to decode the escape sequence.
 621  *
 622  * Stores the result in *ch.
 623  *
 624  * Returns the number of bytes consumed.
 625  */
 626 static int reg_decode_escape(const char *s, int *ch)
 627 {
 628         int n;
 629         const char *s0 = s;
 630
 631         *ch = *s++;
 632
 633         switch (*ch) {
 634                 case 'b': *ch = '\b'; break;
 635                 case 'e': *ch = 27; break;
 636                 case 'f': *ch = '\f'; break;
 637                 case 'n': *ch = '\n'; break;
 638                 case 'r': *ch = '\r'; break;
 639                 case 't': *ch = '\t'; break;
 640                 case 'v': *ch = '\v'; break;
 641                 case 'u':
 642                         if (*s == '{') {
 643                                 /* Expect \u{NNNN} */
 644                                 n = parse_hex(s + 1, 6, ch);
 645                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 646                                         s += n + 2;
 647                                 }
 648                                 else {
 649                                         /* Invalid, so just treat as an escaped 'u' */
 650                                         *ch = 'u';
 651                                 }
 652                         }
 653                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 654                                 s += n;
 655                         }
 656                         break;
 657                 case 'U':
 658                         if ((n = parse_hex(s, 8, ch)) > 0) {
 659                                 s += n;
 660                         }
 661                         break;
 662                 case 'x':
 663                         if ((n = parse_hex(s, 2, ch)) > 0) {
 664                                 s += n;
 665                         }
 666                         break;
 667                 case '\0':
 668                         s--;
 669                         *ch = '\\';
 670                         break;
 671         }
 672         return s - s0;
 673 }
 674
 675 /*
 676  - regatom - the lowest level
 677  *
 678  * Optimization:  gobbles an entire sequence of ordinary characters so that
 679  * it can turn them into a single node, which is smaller to store and
 680  * faster to run.  Backslashed characters are exceptions, each becoming a
 681  * separate node; the code is simpler that way and it's not worth fixing.
 682  */
 683 static int regatom(regex_t *preg, int *flagp)
 684 {
 685         int ret;
 686         int flags;
 687         int nocase = (preg->cflags & REG_ICASE);
 688
 689         int ch;
 690         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 691
 692         *flagp = WORST;         /* Tentatively. */
 693
 694         preg->regparse += n;
 695         switch (ch) {
 696         /* FIXME: these chars only have meaning at beg/end of pat? */
 697         case '^':
 698                 ret = regnode(preg, BOL);
 699                 break;
 700         case '$':
 701                 ret = regnode(preg, EOL);
 702                 break;
 703         case '.':
 704                 ret = regnode(preg, ANY);
 705                 *flagp |= HASWIDTH|SIMPLE;
 706                 break;
 707         case '[': {
 708                         const char *pattern = preg->regparse;
 709
 710                         if (*pattern == '^') {  /* Complement of range. */
 711                                 ret = regnode(preg, ANYBUT);
 712                                 pattern++;
 713                         } else
 714                                 ret = regnode(preg, ANYOF);
 715
 716                         /* Special case. If the first char is ']' or '-', it is part of the set */
 717                         if (*pattern == ']' || *pattern == '-') {
 718                                 reg_addrange(preg, *pattern, *pattern);
 719                                 pattern++;
 720                         }
 721
 722                         while (*pattern && *pattern != ']') {
 723                                 /* Is this a range? a-z */
 724                                 int start;
 725                                 int end;
 726
 727                                 enum {
 728                                         CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
 729                                         CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
 730                                         CC_NUM
 731                                 };
 732                                 int cc;
 733
 734                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 735                                 if (start == '\\') {
 736                                         /* First check for class shorthand escapes */
 737                                         switch (*pattern) {
 738                                                 case 's':
 739                                                         pattern++;
 740                                                         cc = CC_SPACE;
 741                                                         goto cc_switch;
 742                                                 case 'd':
 743                                                         pattern++;
 744                                                         cc = CC_DIGIT;
 745                                                         goto cc_switch;
 746                                                 case 'w':
 747                                                         pattern++;
 748                                                         reg_addrange(preg, '_', '_');
 749                                                         cc = CC_ALNUM;
 750                                                         goto cc_switch;
 751                                         }
 752                                         pattern += reg_decode_escape(pattern, &start);
 753                                         if (start == 0) {
 754                                                 preg->err = REG_ERR_NULL_CHAR;
 755                                                 return 0;
 756                                         }
 757                                 }
 758                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 759                                         /* skip '-' */
 760                                         pattern += utf8_tounicode(pattern, &end);
 761                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 762                                         if (end == '\\') {
 763                                                 pattern += reg_decode_escape(pattern, &end);
 764                                                 if (end == 0) {
 765                                                         preg->err = REG_ERR_NULL_CHAR;
 766                                                         return 0;
 767                                                 }
 768                                         }
 769
 770                                         reg_addrange(preg, start, end);
 771                                         continue;
 772                                 }
 773                                 if (start == '[' && pattern[0] == ':') {
 774                                         static const char *character_class[] = {
 775                                                 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
 776                                                 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
 777                                         };
 778
 779                                         for (cc = 0; cc < CC_NUM; cc++) {
 780                                                 n = strlen(character_class[cc]);
 781                                                 if (strncmp(pattern, character_class[cc], n) == 0) {
 782                                                         /* Found a character class */
 783                                                         pattern += n + 1;
 784                                                         break;
 785                                                 }
 786                                         }
 787                                         if (cc != CC_NUM) {
 788 cc_switch:
 789                                                 switch (cc) {
 790                                                         case CC_ALNUM:
 791                                                                 reg_addrange(preg, '0', '9');
 792                                                                 /* Fall through */
 793                                                         case CC_ALPHA:
 794                                                                 if ((preg->cflags & REG_ICASE) == 0) {
 795                                                                         reg_addrange(preg, 'a', 'z');
 796                                                                 }
 797                                                                 reg_addrange(preg, 'A', 'Z');
 798                                                                 break;
 799                                                         case CC_SPACE:
 800                                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 801                                                                 break;
 802                                                         case CC_BLANK:
 803                                                                 reg_addrange_str(preg, " \t");
 804                                                                 break;
 805                                                         case CC_UPPER:
 806                                                                 reg_addrange(preg, 'A', 'Z');
 807                                                                 break;
 808                                                         case CC_LOWER:
 809                                                                 reg_addrange(preg, 'a', 'z');
 810                                                                 break;
 811                                                         case CC_XDIGIT:
 812                                                                 reg_addrange(preg, 'a', 'f');
 813                                                                 reg_addrange(preg, 'A', 'F');
 814                                                                 /* Fall through */
 815                                                         case CC_DIGIT:
 816                                                                 reg_addrange(preg, '0', '9');
 817                                                                 break;
 818                                                         case CC_CNTRL:
 819                                                                 reg_addrange(preg, 0, 31);
 820                                                                 reg_addrange(preg, 127, 127);
 821                                                                 break;
 822                                                         case CC_PRINT:
 823                                                                 reg_addrange(preg, ' ', '~');
 824                                                                 break;
 825                                                         case CC_GRAPH:
 826                                                                 reg_addrange(preg, '!', '~');
 827                                                                 break;
 828                                                         case CC_PUNCT:
 829                                                                 reg_addrange(preg, '!', '/');
 830                                                                 reg_addrange(preg, ':', '@');
 831                                                                 reg_addrange(preg, '[', '`');
 832                                                                 reg_addrange(preg, '{', '~');
 833                                                                 break;
 834                                                 }
 835                                                 continue;
 836                                         }
 837                                 }
 838                                 /* Not a range, so just add the char */
 839                                 reg_addrange(preg, start, start);
 840                         }
 841                         regc(preg, '\0');
 842
 843                         if (*pattern) {
 844                                 pattern++;
 845                         }
 846                         preg->regparse = pattern;
 847
 848                         *flagp |= HASWIDTH|SIMPLE;
 849                 }
 850                 break;
 851         case '(':
 852                 ret = reg(preg, 1, &flags);
 853                 if (ret == 0)
 854                         return 0;
 855                 *flagp |= flags&(HASWIDTH|SPSTART);
 856                 break;
 857         case '\0':
 858         case '|':
 859         case ')':
 860                 preg->err = REG_ERR_INTERNAL;
 861                 return 0;       /* Supposed to be caught earlier. */
 862         case '?':
 863         case '+':
 864         case '*':
 865         case '{':
 866                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 867                 return 0;
 868         case '\\':
 869                 ch = *preg->regparse++;
 870                 switch (ch) {
 871                 case '\0':
 872                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 873                         return 0;
 874                 case 'A':
 875                         ret = regnode(preg, BOLX);
 876                         break;
 877                 case 'Z':
 878                         ret = regnode(preg, EOLX);
 879                         break;
 880                 case '<':
 881                 case 'm':
 882                         ret = regnode(preg, WORDA);
 883                         break;
 884                 case '>':
 885                 case 'M':
 886                         ret = regnode(preg, WORDZ);
 887                         break;
 888                 case 'd':
 889                 case 'D':
 890                         ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
 891                         reg_addrange(preg, '0', '9');
 892                         regc(preg, '\0');
 893                         *flagp |= HASWIDTH|SIMPLE;
 894                         break;
 895                 case 'w':
 896                 case 'W':
 897                         ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
 898                         if ((preg->cflags & REG_ICASE) == 0) {
 899                                 reg_addrange(preg, 'a', 'z');
 900                         }
 901                         reg_addrange(preg, 'A', 'Z');
 902                         reg_addrange(preg, '0', '9');
 903                         reg_addrange(preg, '_', '_');
 904                         regc(preg, '\0');
 905                         *flagp |= HASWIDTH|SIMPLE;
 906                         break;
 907                 case 's':
 908                 case 'S':
 909                         ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
 910                         reg_addrange_str(preg," \t\r\n\f\v");
 911                         regc(preg, '\0');
 912                         *flagp |= HASWIDTH|SIMPLE;
 913                         break;
 914                 /* FIXME: Someday handle \1, \2, ... */
 915                 default:
 916                         /* Handle general quoted chars in exact-match routine */
 917                         /* Back up to include the backslash */
 918                         preg->regparse--;
 919                         goto de_fault;
 920                 }
 921                 break;
 922         de_fault:
 923         default: {
 924                         /*
 925                          * Encode a string of characters to be matched exactly.
 926                          */
 927                         int added = 0;
 928
 929                         /* Back up to pick up the first char of interest */
 930                         preg->regparse -= n;
 931
 932                         ret = regnode(preg, EXACTLY);
 933
 934                         /* Note that a META operator such as ? or * consumes the
 935                          * preceding char.
 936                          * Thus we must be careful to look ahead by 2 and add the
 937                          * last char as it's own EXACTLY if necessary
 938                          */
 939
 940                         /* Until end of string or a META char is reached */
 941                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 942                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 943                                 if (ch == '\\' && preg->regparse[n]) {
 944                                         /* Non-trailing backslash.
 945                                          * Is this a special escape, or a regular escape?
 946                                          */
 947                                         if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
 948                                                 /* A special escape. All done with EXACTLY */
 949                                                 break;
 950                                         }
 951                                         /* Decode it. Note that we add the length for the escape
 952                                          * sequence to the length for the backlash so we can skip
 953                                          * the entire sequence, or not as required.
 954                                          */
 955                                         n += reg_decode_escape(preg->regparse + n, &ch);
 956                                         if (ch == 0) {
 957                                                 preg->err = REG_ERR_NULL_CHAR;
 958                                                 return 0;
 959                                         }
 960                                 }
 961
 962                                 /* Now we have one char 'ch' of length 'n'.
 963                                  * Check to see if the following char is a MULT
 964                                  */
 965
 966                                 if (ISMULT(preg->regparse[n])) {
 967                                         /* Yes. But do we already have some EXACTLY chars? */
 968                                         if (added) {
 969                                                 /* Yes, so return what we have and pick up the current char next time around */
 970                                                 break;
 971                                         }
 972                                         /* No, so add this single char and finish */
 973                                         regc(preg, ch);
 974                                         added++;
 975                                         preg->regparse += n;
 976                                         break;
 977                                 }
 978
 979                                 /* No, so just add this char normally */
 980                                 regc(preg, ch);
 981                                 added++;
 982                                 preg->regparse += n;
 983                         }
 984                         regc(preg, '\0');
 985
 986                         *flagp |= HASWIDTH;
 987                         if (added == 1)
 988                                 *flagp |= SIMPLE;
 989                         break;
 990                 }
 991                 break;
 992         }
 993
 994         return(ret);
 995 }
 996
 997 static void reg_grow(regex_t *preg, int n)
 998 {
 999         if (preg->p + n >= preg->proglen) {
1000                 preg->proglen = (preg->p + n) * 2;
1001                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1002         }
1003 }
1004
1005 /*
1006  - regnode - emit a node
1007  */
1008 /* Location. */
1009 static int regnode(regex_t *preg, int op)
1010 {
1011         reg_grow(preg, 2);
1012
1013         /* The OP followed by a next pointer */
1014         preg->program[preg->p++] = op;
1015         preg->program[preg->p++] = 0;
1016
1017         /* Return the start of the node */
1018         return preg->p - 2;
1019 }
1020
1021 /*
1022  - regc - emit (if appropriate) a byte of code
1023  */
1024 static void regc(regex_t *preg, int b )
1025 {
1026         reg_grow(preg, 1);
1027         preg->program[preg->p++] = b;
1028 }
1029
1030 /*
1031  - reginsert - insert an operator in front of already-emitted operand
1032  *
1033  * Means relocating the operand.
1034  * Returns the new location of the original operand.
1035  */
1036 static int reginsert(regex_t *preg, int op, int size, int opnd )
1037 {
1038         reg_grow(preg, size);
1039
1040         /* Move everything from opnd up */
1041         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1042         /* Zero out the new space */
1043         memset(preg->program + opnd, 0, sizeof(int) * size);
1044
1045         preg->program[opnd] = op;
1046
1047         preg->p += size;
1048
1049         return opnd + size;
1050 }
1051
1052 /*
1053  - regtail - set the next-pointer at the end of a node chain
1054  */
1055 static void regtail(regex_t *preg, int p, int val)
1056 {
1057         int scan;
1058         int temp;
1059         int offset;
1060
1061         /* Find last node. */
1062         scan = p;
1063         for (;;) {
1064                 temp = regnext(preg, scan);
1065                 if (temp == 0)
1066                         break;
1067                 scan = temp;
1068         }
1069
1070         if (OP(preg, scan) == BACK)
1071                 offset = scan - val;
1072         else
1073                 offset = val - scan;
1074
1075         preg->program[scan + 1] = offset;
1076 }
1077
1078 /*
1079  - regoptail - regtail on operand of first argument; nop if operandless
1080  */
1081
1082 static void regoptail(regex_t *preg, int p, int val )
1083 {
1084         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1085         if (p != 0 && OP(preg, p) == BRANCH) {
1086                 regtail(preg, OPERAND(p), val);
1087         }
1088 }
1089
1090 /*
1091  * regexec and friends
1092  */
1093
1094 /*
1095  * Forwards.
1096  */
1097 static int regtry(regex_t *preg, const char *string );
1098 static int regmatch(regex_t *preg, int prog);
1099 static int regrepeat(regex_t *preg, int p, int max);
1100
1101 /*
1102  - regexec - match a regexp against a string
1103  */
1104 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1105 {
1106         const char *s;
1107         int scan;
1108
1109         /* Be paranoid... */
1110         if (preg == NULL || preg->program == NULL || string == NULL) {
1111                 return REG_ERR_NULL_ARGUMENT;
1112         }
1113
1114         /* Check validity of program. */
1115         if (*preg->program != REG_MAGIC) {
1116                 return REG_ERR_CORRUPTED;
1117         }
1118
1119 #ifdef DEBUG
1120         fprintf(stderr, "regexec: %s\n", string);
1121         regdump(preg);
1122 #endif
1123
1124         preg->eflags = eflags;
1125         preg->pmatch = pmatch;
1126         preg->nmatch = nmatch;
1127         preg->start = string;   /* All offsets are computed from here */
1128
1129         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1130         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1131                 int op = OP(preg, scan);
1132                 if (op == END)
1133                         break;
1134                 if (op == REPX || op == REPXMIN)
1135                         preg->program[scan + 4] = 0;
1136         }
1137
1138         /* If there is a "must appear" string, look for it. */
1139         if (preg->regmust != 0) {
1140                 s = string;
1141                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1142                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1143                                 break;
1144                         }
1145                         s++;
1146                 }
1147                 if (s == NULL)  /* Not present. */
1148                         return REG_NOMATCH;
1149         }
1150
1151         /* Mark beginning of line for ^ . */
1152         preg->regbol = string;
1153
1154         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1155         if (preg->reganch) {
1156                 if (eflags & REG_NOTBOL) {
1157                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1158                         goto nextline;
1159                 }
1160                 while (1) {
1161                         if (regtry(preg, string)) {
1162                                 return REG_NOERROR;
1163                         }
1164                         if (*string) {
1165 nextline:
1166                                 if (preg->cflags & REG_NEWLINE) {
1167                                         /* Try the next anchor? */
1168                                         string = strchr(string, '\n');
1169                                         if (string) {
1170                                                 preg->regbol = ++string;
1171                                                 continue;
1172                                         }
1173                                 }
1174                         }
1175                         return REG_NOMATCH;
1176                 }
1177         }
1178
1179         /* Messy cases:  unanchored match. */
1180         s = string;
1181         if (preg->regstart != '\0') {
1182                 /* We know what char it must start with. */
1183                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1184                         if (regtry(preg, s))
1185                                 return REG_NOERROR;
1186                         s++;
1187                 }
1188         }
1189         else
1190                 /* We don't -- general case. */
1191                 while (1) {
1192                         if (regtry(preg, s))
1193                                 return REG_NOERROR;
1194                         if (*s == '\0') {
1195                                 break;
1196                         }
1197                         else {
1198                                 int c;
1199                                 s += utf8_tounicode(s, &c);
1200                         }
1201                 }
1202
1203         /* Failure. */
1204         return REG_NOMATCH;
1205 }
1206
1207 /*
1208  - regtry - try match at specific point
1209  */
1210                         /* 0 failure, 1 success */
1211 static int regtry( regex_t *preg, const char *string )
1212 {
1213         int i;
1214
1215         preg->reginput = string;
1216
1217         for (i = 0; i < preg->nmatch; i++) {
1218                 preg->pmatch[i].rm_so = -1;
1219                 preg->pmatch[i].rm_eo = -1;
1220         }
1221         if (regmatch(preg, 1)) {
1222                 preg->pmatch[0].rm_so = string - preg->start;
1223                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1224                 return(1);
1225         } else
1226                 return(0);
1227 }
1228
1229 /**
1230  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1231  *
1232  * If 'nocase' is non-zero, does a case-insensitive match.
1233  *
1234  * Returns -1 on not found.
1235  */
1236 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1237 {
1238         const char *s = string;
1239         while (proglen && *s) {
1240                 int ch;
1241                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1242                 if (ch != *prog) {
1243                         return -1;
1244                 }
1245                 prog++;
1246                 s += n;
1247                 proglen--;
1248         }
1249         if (proglen == 0) {
1250                 return s - string;
1251         }
1252         return -1;
1253 }
1254
1255 /**
1256  * Searchs for 'c' in the range 'range'.
1257  *
1258  * Returns 1 if found, or 0 if not.
1259  */
1260 static int reg_range_find(const int *range, int c)
1261 {
1262         while (*range) {
1263                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1264                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1265                         return 1;
1266                 }
1267                 range += 2;
1268         }
1269         return 0;
1270 }
1271
1272 /**
1273  * Search for the character 'c' in the utf-8 string 'string'.
1274  *
1275  * If 'nocase' is set, the 'string' is assumed to be uppercase
1276  * and 'c' is converted to uppercase before matching.
1277  *
1278  * Returns the byte position in the string where the 'c' was found, or
1279  * NULL if not found.
1280  */
1281 static const char *str_find(const char *string, int c, int nocase)
1282 {
1283         if (nocase) {
1284                 /* The "string" should already be converted to uppercase */
1285                 c = utf8_upper(c);
1286         }
1287         while (*string) {
1288                 int ch;
1289                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1290                 if (c == ch) {
1291                         return string;
1292                 }
1293                 string += n;
1294         }
1295         return NULL;
1296 }
1297
1298 /**
1299  * Returns true if 'ch' is an end-of-line char.
1300  *
1301  * In REG_NEWLINE mode, \n is considered EOL in
1302  * addition to \0
1303  */
1304 static int reg_iseol(regex_t *preg, int ch)
1305 {
1306         if (preg->cflags & REG_NEWLINE) {
1307                 return ch == '\0' || ch == '\n';
1308         }
1309         else {
1310                 return ch == '\0';
1311         }
1312 }
1313
1314 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1315 {
1316         int nextch = '\0';
1317         const char *save;
1318         int no;
1319         int c;
1320
1321         int max = preg->program[scan + 2];
1322         int min = preg->program[scan + 3];
1323         int next = regnext(preg, scan);
1324
1325         /*
1326          * Lookahead to avoid useless match attempts
1327          * when we know what character comes next.
1328          */
1329         if (OP(preg, next) == EXACTLY) {
1330                 nextch = preg->program[OPERAND(next)];
1331         }
1332         save = preg->reginput;
1333         no = regrepeat(preg, scan + 5, max);
1334         if (no < min) {
1335                 return 0;
1336         }
1337         if (matchmin) {
1338                 /* from min up to no */
1339                 max = no;
1340                 no = min;
1341         }
1342         /* else from no down to min */
1343         while (1) {
1344                 if (matchmin) {
1345                         if (no > max) {
1346                                 break;
1347                         }
1348                 }
1349                 else {
1350                         if (no < min) {
1351                                 break;
1352                         }
1353                 }
1354                 preg->reginput = save + utf8_index(save, no);
1355                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1356                 /* If it could work, try it. */
1357                 if (reg_iseol(preg, nextch) || c == nextch) {
1358                         if (regmatch(preg, next)) {
1359                                 return(1);
1360                         }
1361                 }
1362                 if (matchmin) {
1363                         /* Couldn't or didn't, add one more */
1364                         no++;
1365                 }
1366                 else {
1367                         /* Couldn't or didn't -- back up. */
1368                         no--;
1369                 }
1370         }
1371         return(0);
1372 }
1373
1374 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1375 {
1376         int *scanpt = preg->program + scan;
1377
1378         int max = scanpt[2];
1379         int min = scanpt[3];
1380
1381         /* Have we reached min? */
1382         if (scanpt[4] < min) {
1383                 /* No, so get another one */
1384                 scanpt[4]++;
1385                 if (regmatch(preg, scan + 5)) {
1386                         return 1;
1387                 }
1388                 scanpt[4]--;
1389                 return 0;
1390         }
1391         if (scanpt[4] > max) {
1392                 return 0;
1393         }
1394
1395         if (matchmin) {
1396                 /* minimal, so try other branch first */
1397                 if (regmatch(preg, regnext(preg, scan))) {
1398                         return 1;
1399                 }
1400                 /* No, so try one more */
1401                 scanpt[4]++;
1402                 if (regmatch(preg, scan + 5)) {
1403                         return 1;
1404                 }
1405                 scanpt[4]--;
1406                 return 0;
1407         }
1408         /* maximal, so try this branch again */
1409         if (scanpt[4] < max) {
1410                 scanpt[4]++;
1411                 if (regmatch(preg, scan + 5)) {
1412                         return 1;
1413                 }
1414                 scanpt[4]--;
1415         }
1416         /* At this point we are at max with no match. Try the other branch */
1417         return regmatch(preg, regnext(preg, scan));
1418 }
1419
1420 /*
1421  - regmatch - main matching routine
1422  *
1423  * Conceptually the strategy is simple:  check to see whether the current
1424  * node matches, call self recursively to see whether the rest matches,
1425  * and then act accordingly.  In practice we make some effort to avoid
1426  * recursion, in particular by going through "ordinary" nodes (that don't
1427  * need to know whether the rest of the match failed) by a loop instead of
1428  * by recursion.
1429  */
1430 /* 0 failure, 1 success */
1431 static int regmatch(regex_t *preg, int prog)
1432 {
1433         int scan;       /* Current node. */
1434         int next;               /* Next node. */
1435         const char *save;
1436
1437         scan = prog;
1438
1439 #ifdef DEBUG
1440         if (scan != 0 && regnarrate)
1441                 fprintf(stderr, "%s(\n", regprop(scan));
1442 #endif
1443         while (scan != 0) {
1444                 int n;
1445                 int c;
1446 #ifdef DEBUG
1447                 if (regnarrate) {
1448                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1449                 }
1450 #endif
1451                 next = regnext(preg, scan);
1452                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1453
1454                 switch (OP(preg, scan)) {
1455                 case BOLX:
1456                         if ((preg->eflags & REG_NOTBOL)) {
1457                                 return(0);
1458                         }
1459                         /* Fall through */
1460                 case BOL:
1461                         if (preg->reginput != preg->regbol) {
1462                                 return(0);
1463                         }
1464                         break;
1465                 case EOLX:
1466                         if (c != 0) {
1467                                 /* For EOLX, only match real end of line, not newline */
1468                                 return 0;
1469                         }
1470                         break;
1471                 case EOL:
1472                         if (!reg_iseol(preg, c)) {
1473                                 return(0);
1474                         }
1475                         break;
1476                 case WORDA:
1477                         /* Must be looking at a letter, digit, or _ */
1478                         if ((!isalnum(UCHAR(c))) && c != '_')
1479                                 return(0);
1480                         /* Prev must be BOL or nonword */
1481                         if (preg->reginput > preg->regbol &&
1482                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1483                                 return(0);
1484                         break;
1485                 case WORDZ:
1486                         /* Can't match at BOL */
1487                         if (preg->reginput > preg->regbol) {
1488                                 /* Current must be EOL or nonword */
1489                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1490                                         c = preg->reginput[-1];
1491                                         /* Previous must be word */
1492                                         if (isalnum(UCHAR(c)) || c == '_') {
1493                                                 break;
1494                                         }
1495                                 }
1496                         }
1497                         /* No */
1498                         return(0);
1499
1500                 case ANY:
1501                         if (reg_iseol(preg, c))
1502                                 return 0;
1503                         preg->reginput += n;
1504                         break;
1505                 case EXACTLY: {
1506                                 int opnd;
1507                                 int len;
1508                                 int slen;
1509
1510                                 opnd = OPERAND(scan);
1511                                 len = str_int_len(preg->program + opnd);
1512
1513                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1514                                 if (slen < 0) {
1515                                         return(0);
1516                                 }
1517                                 preg->reginput += slen;
1518                         }
1519                         break;
1520                 case ANYOF:
1521                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1522                                 return(0);
1523                         }
1524                         preg->reginput += n;
1525                         break;
1526                 case ANYBUT:
1527                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1528                                 return(0);
1529                         }
1530                         preg->reginput += n;
1531                         break;
1532                 case NOTHING:
1533                         break;
1534                 case BACK:
1535                         break;
1536                 case BRANCH:
1537                         if (OP(preg, next) != BRANCH)           /* No choice. */
1538                                 next = OPERAND(scan);   /* Avoid recursion. */
1539                         else {
1540                                 do {
1541                                         save = preg->reginput;
1542                                         if (regmatch(preg, OPERAND(scan))) {
1543                                                 return(1);
1544                                         }
1545                                         preg->reginput = save;
1546                                         scan = regnext(preg, scan);
1547                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1548                                 return(0);
1549                                 /* NOTREACHED */
1550                         }
1551                         break;
1552                 case REP:
1553                 case REPMIN:
1554                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1555
1556                 case REPX:
1557                 case REPXMIN:
1558                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1559
1560                 case END:
1561                         return 1;       /* Success! */
1562
1563                 case OPENNC:
1564                 case CLOSENC:
1565                         return regmatch(preg, next);
1566
1567                 default:
1568                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1569                                 save = preg->reginput;
1570                                 if (regmatch(preg, next)) {
1571                                         if (OP(preg, scan) < CLOSE) {
1572                                                 int no = OP(preg, scan) - OPEN;
1573                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1574                                                         preg->pmatch[no].rm_so = save - preg->start;
1575                                                 }
1576                                         }
1577                                         else {
1578                                                 int no = OP(preg, scan) - CLOSE;
1579                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1580                                                         preg->pmatch[no].rm_eo = save - preg->start;
1581                                                 }
1582                                         }
1583                                         return(1);
1584                                 }
1585                                 return(0);
1586                         }
1587                         return REG_ERR_INTERNAL;
1588                 }
1589
1590                 scan = next;
1591         }
1592
1593         /*
1594          * We get here only if there's trouble -- normally "case END" is
1595          * the terminating point.
1596          */
1597         return REG_ERR_INTERNAL;
1598 }
1599
1600 /*
1601  - regrepeat - repeatedly match something simple, report how many
1602  */
1603 static int regrepeat(regex_t *preg, int p, int max)
1604 {
1605         int count = 0;
1606         const char *scan;
1607         int opnd;
1608         int ch;
1609         int n;
1610
1611         scan = preg->reginput;
1612         opnd = OPERAND(p);
1613         switch (OP(preg, p)) {
1614         case ANY:
1615                 /* No need to handle utf8 specially here */
1616                 while (!reg_iseol(preg, *scan) && count < max) {
1617                         count++;
1618                         scan++;
1619                 }
1620                 break;
1621         case EXACTLY:
1622                 while (count < max) {
1623                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1624                         if (preg->program[opnd] != ch) {
1625                                 break;
1626                         }
1627                         count++;
1628                         scan += n;
1629                 }
1630                 break;
1631         case ANYOF:
1632                 while (count < max) {
1633                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1634                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1635                                 break;
1636                         }
1637                         count++;
1638                         scan += n;
1639                 }
1640                 break;
1641         case ANYBUT:
1642                 while (count < max) {
1643                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1644                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1645                                 break;
1646                         }
1647                         count++;
1648                         scan += n;
1649                 }
1650                 break;
1651         default:                /* Oh dear.  Called inappropriately. */
1652                 preg->err = REG_ERR_INTERNAL;
1653                 count = 0;      /* Best compromise. */
1654                 break;
1655         }
1656         preg->reginput = scan;
1657
1658         return(count);
1659 }
1660
1661 /*
1662  - regnext - dig the "next" pointer out of a node
1663  */
1664 static int regnext(regex_t *preg, int p )
1665 {
1666         int offset;
1667
1668         offset = NEXT(preg, p);
1669
1670         if (offset == 0)
1671                 return 0;
1672
1673         if (OP(preg, p) == BACK)
1674                 return(p-offset);
1675         else
1676                 return(p+offset);
1677 }
1678
1679 /*
1680  - regopsize - returns the size of opcode + operands at 'p' in words
1681  */
1682 static int regopsize(regex_t *preg, int p )
1683 {
1684         /* Almost all opcodes are 2 words, but some are more */
1685         switch (OP(preg, p)) {
1686                 case REP:
1687                 case REPMIN:
1688                 case REPX:
1689                 case REPXMIN:
1690                         return 5;
1691
1692                 case ANYOF:
1693                 case ANYBUT:
1694                 case EXACTLY: {
1695                         int s = p + 2;
1696                         while (preg->program[s++]) {
1697                         }
1698                         return s - p;
1699                 }
1700         }
1701         return 2;
1702 }
1703
1704 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1705
1706 /*
1707  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1708  */
1709 static void regdump(regex_t *preg)
1710 {
1711         int s;
1712         int op = EXACTLY;       /* Arbitrary non-END op. */
1713         int next;
1714         char buf[MAX_UTF8_LEN + 1];
1715
1716         int i;
1717         for (i = 1; i < preg->p; i++) {
1718                 printf("%02x ", (unsigned char)preg->program[i]);
1719                 if (i % 16 == 0) {
1720                         printf("\n");
1721                 }
1722         }
1723         printf("\n");
1724
1725         s = 1;
1726         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1727                 op = OP(preg, s);
1728                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1729                 next = regnext(preg, s);
1730                 if (next == 0)          /* Next ptr. */
1731                         printf("(0)");
1732                 else
1733                         printf("(%d)", next);
1734                 s += 2;
1735                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1736                         int max = preg->program[s];
1737                         int min = preg->program[s + 1];
1738                         if (max == 65535) {
1739                                 printf("{%d,*}", min);
1740                         }
1741                         else {
1742                                 printf("{%d,%d}", min, max);
1743                         }
1744                         printf(" %d", preg->program[s + 2]);
1745                         s += 3;
1746                 }
1747                 else if (op == ANYOF || op == ANYBUT) {
1748                         /* set of ranges */
1749
1750                         while (preg->program[s]) {
1751                                 int len = preg->program[s++];
1752                                 int first = preg->program[s++];
1753                                 buf[utf8_getchars(buf, first)] = 0;
1754                                 printf("%s", buf);
1755                                 if (len > 1) {
1756                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1757                                         printf("-%s", buf);
1758                                 }
1759                         }
1760                         s++;
1761                 }
1762                 else if (op == EXACTLY) {
1763                         /* Literal string, where present. */
1764
1765                         while (preg->program[s]) {
1766                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1767                                 printf("%s", buf);
1768                                 s++;
1769                         }
1770                         s++;
1771                 }
1772                 putchar('\n');
1773         }
1774
1775         if (op == END) {
1776                 /* Header fields of interest. */
1777                 if (preg->regstart) {
1778                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1779                         printf("start '%s' ", buf);
1780                 }
1781                 if (preg->reganch)
1782                         printf("anchored ");
1783                 if (preg->regmust != 0) {
1784                         int i;
1785                         printf("must have:");
1786                         for (i = 0; i < preg->regmlen; i++) {
1787                                 putchar(preg->program[preg->regmust + i]);
1788                         }
1789                         putchar('\n');
1790                 }
1791         }
1792         printf("\n");
1793 }
1794
1795 /*
1796  - regprop - printable representation of opcode
1797  */
1798 static const char *regprop( int op )
1799 {
1800         static char buf[50];
1801
1802         switch (op) {
1803         case BOL:
1804                 return "BOL";
1805         case EOL:
1806                 return "EOL";
1807         case BOLX:
1808                 return "BOLX";
1809         case EOLX:
1810                 return "EOLX";
1811         case ANY:
1812                 return "ANY";
1813         case ANYOF:
1814                 return "ANYOF";
1815         case ANYBUT:
1816                 return "ANYBUT";
1817         case BRANCH:
1818                 return "BRANCH";
1819         case EXACTLY:
1820                 return "EXACTLY";
1821         case NOTHING:
1822                 return "NOTHING";
1823         case BACK:
1824                 return "BACK";
1825         case END:
1826                 return "END";
1827         case REP:
1828                 return "REP";
1829         case REPMIN:
1830                 return "REPMIN";
1831         case REPX:
1832                 return "REPX";
1833         case REPXMIN:
1834                 return "REPXMIN";
1835         case WORDA:
1836                 return "WORDA";
1837         case WORDZ:
1838                 return "WORDZ";
1839         case OPENNC:
1840                 return "OPEN";
1841         case CLOSENC:
1842                 return "CLOSE";
1843         default:
1844                 if (op >= OPEN && op < CLOSE) {
1845                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1846                 }
1847                 else if (op >= CLOSE && op < CLOSE_END) {
1848                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1849                 }
1850                 else {
1851                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1852                 }
1853                 return(buf);
1854         }
1855 }
1856 #endif /* JIM_BOOTSTRAP */
1857
1858 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1859 {
1860         static const char *error_strings[] = {
1861                 "success",
1862                 "no match",
1863                 "bad pattern",
1864                 "null argument",
1865                 "unknown error",
1866                 "too big",
1867                 "out of memory",
1868                 "too many ()",
1869                 "parentheses () not balanced",
1870                 "braces {} not balanced",
1871                 "invalid repetition count(s)",
1872                 "extra characters",
1873                 "*+ of empty atom",
1874                 "nested count",
1875                 "internal error",
1876                 "count follows nothing",
1877                 "trailing backslash",
1878                 "corrupted program",
1879                 "contains null char",
1880         };
1881         const char *err;
1882
1883         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1884                 err = "Bad error code";
1885         }
1886         else {
1887                 err = error_strings[errcode];
1888         }
1889
1890         return snprintf(errbuf, errbuf_size, "%s", err);
1891 }
1892
1893 void regfree(regex_t *preg)
1894 {
1895         free(preg->program);
1896 }
1897
1898 #endif