jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101 #define BOLX    14      /* no   Match "" at beginning of input. */
 102 #define EOLX    15      /* no   Match "" at end of input. */
 103 #define WORDA   16      /* no   Match "" at wordchar, where prev is nonword */
 104 #define WORDZ   17      /* no   Match "" at nonwordchar, where prev is word */
 105
 106 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 107 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 108                         /*      OPEN+1 is number 1, etc. */
 109
 110 /* must not be any other opts between OPEN and CLOSE */
 111
 112 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 113 #define CLOSE   2001    /* no   Analogous to OPEN. */
 114 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 115
 116 /*
 117  * The first word of the regexp internal "program" is actually this magic
 118  * number; the start node begins in the second word.
 119  */
 120 #define REG_MAGIC       0xFADED00D
 121
 122 /*
 123  * Opcode notes:
 124  *
 125  * BRANCH       The set of branches constituting a single choice are hooked
 126  *              together with their "next" pointers, since precedence prevents
 127  *              anything being concatenated to any individual branch.  The
 128  *              "next" pointer of the last BRANCH in a choice points to the
 129  *              thing following the whole choice.  This is also where the
 130  *              final "next" pointer of each individual branch points; each
 131  *              branch starts with the operand node of a BRANCH node.
 132  *
 133  * BACK         Normal "next" pointers all implicitly point forward; BACK
 134  *              exists to make loop structures possible.
 135  *
 136  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 137  *              as either simple repeats (REP) or complex repeats (REPX).
 138  *              These opcodes include a "min" and "max" count after the opcode.
 139  *              This is followed by a fourth "current count" word that is
 140  *              only used by REPX, as it implements a recursive match.
 141  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 142  *
 143  * OPEN,CLOSE   ...are numbered at compile time.
 144  */
 145
 146 /*
 147  * A node is one word of opcode followed by one word of "next" pointer.
 148  * The "next" pointer value is a positive offset from the opcode of the node
 149  * containing it.
 150  * An operand, if any, simply follows the node.  (Note that much of the
 151  * code generation knows about this implicit relationship.)
 152  */
 153 #define OP(preg, p)     (preg->program[p])
 154 #define NEXT(preg, p)   (preg->program[p + 1])
 155 #define OPERAND(p)      ((p) + 2)
 156
 157 /*
 158  * See regmagic.h for one further detail of program structure.
 159  */
 160
 161
 162 /*
 163  * Utility definitions.
 164  */
 165
 166 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 167 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 168 #define META            "^$.[()|?{+*"
 169
 170 /*
 171  * Flags to be passed up and down.
 172  */
 173 #define HASWIDTH        1       /* Known never to match null string. */
 174 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 175 #define SPSTART         4       /* Starts with * or +. */
 176 #define WORST           0       /* Worst case. */
 177
 178 #define MAX_REP_COUNT 1000000
 179
 180 /*
 181  * Forward declarations for regcomp()'s friends.
 182  */
 183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 184 static int regpiece(regex_t *preg, int *flagp );
 185 static int regbranch(regex_t *preg, int *flagp );
 186 static int regatom(regex_t *preg, int *flagp );
 187 static int regnode(regex_t *preg, int op );
 188 static int regnext(regex_t *preg, int p );
 189 static void regc(regex_t *preg, int b );
 190 static int reginsert(regex_t *preg, int op, int size, int opnd );
 191 static void regtail(regex_t *preg, int p, int val);
 192 static void regoptail(regex_t *preg, int p, int val );
 193 static int regopsize(regex_t *preg, int p );
 194
 195 static int reg_range_find(const int *string, int c);
 196 static const char *str_find(const char *string, int c, int nocase);
 197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 198
 199 /*#define DEBUG*/
 200 #ifdef DEBUG
 201 static int regnarrate = 0;
 202 static void regdump(regex_t *preg);
 203 static const char *regprop( int op );
 204 #endif
 205
 206
 207 /**
 208  * Returns the length of the null-terminated integer sequence.
 209  */
 210 static int str_int_len(const int *seq)
 211 {
 212         int n = 0;
 213         while (*seq++) {
 214                 n++;
 215         }
 216         return n;
 217 }
 218
 219 /*
 220  - regcomp - compile a regular expression into internal code
 221  *
 222  * We can't allocate space until we know how big the compiled form will be,
 223  * but we can't compile it (and thus know how big it is) until we've got a
 224  * place to put the code.  So we cheat:  we compile it twice, once with code
 225  * generation turned off and size counting turned on, and once "for real".
 226  * This also means that we don't allocate space until we are sure that the
 227  * thing really will compile successfully, and we never have to move the
 228  * code and thus invalidate pointers into it.  (Note that it has to be in
 229  * one piece because free() must be able to free it all.)
 230  *
 231  * Beware that the optimization-preparation code in here knows about some
 232  * of the structure of the compiled regexp.
 233  */
 234 int regcomp(regex_t *preg, const char *exp, int cflags)
 235 {
 236         int scan;
 237         int longest;
 238         unsigned len;
 239         int flags;
 240
 241 #ifdef DEBUG
 242         fprintf(stderr, "Compiling: '%s'\n", exp);
 243 #endif
 244         memset(preg, 0, sizeof(*preg));
 245
 246         if (exp == NULL)
 247                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 248
 249         /* First pass: determine size, legality. */
 250         preg->cflags = cflags;
 251         preg->regparse = exp;
 252
 253         /* Allocate space. */
 254         preg->proglen = (strlen(exp) + 1) * 5;
 255         preg->program = malloc(preg->proglen * sizeof(int));
 256         if (preg->program == NULL)
 257                 FAIL(preg, REG_ERR_NOMEM);
 258
 259         /* Note that since we store a magic value as the first item in the program,
 260          * program offsets will never be 0
 261          */
 262         regc(preg, REG_MAGIC);
 263         if (reg(preg, 0, &flags) == 0) {
 264                 return preg->err;
 265         }
 266
 267         /* Small enough for pointer-storage convention? */
 268         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 269                 FAIL(preg,REG_ERR_TOO_BIG);
 270
 271         /* Dig out information for optimizations. */
 272         preg->regstart = 0;     /* Worst-case defaults. */
 273         preg->reganch = 0;
 274         preg->regmust = 0;
 275         preg->regmlen = 0;
 276         scan = 1;                       /* First BRANCH. */
 277         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 278                 scan = OPERAND(scan);
 279
 280                 /* Starting-point info. */
 281                 if (OP(preg, scan) == EXACTLY) {
 282                         preg->regstart = preg->program[OPERAND(scan)];
 283                 }
 284                 else if (OP(preg, scan) == BOL)
 285                         preg->reganch++;
 286
 287                 /*
 288                  * If there's something expensive in the r.e., find the
 289                  * longest literal string that must appear and make it the
 290                  * regmust.  Resolve ties in favor of later strings, since
 291                  * the regstart check works with the beginning of the r.e.
 292                  * and avoiding duplication strengthens checking.  Not a
 293                  * strong reason, but sufficient in the absence of others.
 294                  */
 295                 if (flags&SPSTART) {
 296                         longest = 0;
 297                         len = 0;
 298                         for (; scan != 0; scan = regnext(preg, scan)) {
 299                                 if (OP(preg, scan) == EXACTLY) {
 300                                         int plen = str_int_len(preg->program + OPERAND(scan));
 301                                         if (plen >= len) {
 302                                                 longest = OPERAND(scan);
 303                                                 len = plen;
 304                                         }
 305                                 }
 306                         }
 307                         preg->regmust = longest;
 308                         preg->regmlen = len;
 309                 }
 310         }
 311
 312 #ifdef DEBUG
 313         regdump(preg);
 314 #endif
 315
 316         return 0;
 317 }
 318
 319 /*
 320  - reg - regular expression, i.e. main body or parenthesized thing
 321  *
 322  * Caller must absorb opening parenthesis.
 323  *
 324  * Combining parenthesis handling with the base level of regular expression
 325  * is a trifle forced, but the need to tie the tails of the branches to what
 326  * follows makes it hard to avoid.
 327  */
 328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 329 {
 330         int ret;
 331         int br;
 332         int ender;
 333         int parno = 0;
 334         int flags;
 335
 336         *flagp = HASWIDTH;      /* Tentatively. */
 337
 338         /* Make an OPEN node, if parenthesized. */
 339         if (paren) {
 340                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 341                         /* non-capturing paren */
 342                         preg->regparse += 2;
 343                         parno = -1;
 344                 }
 345                 else {
 346                         parno = ++preg->re_nsub;
 347                 }
 348                 ret = regnode(preg, OPEN+parno);
 349         } else
 350                 ret = 0;
 351
 352         /* Pick up the branches, linking them together. */
 353         br = regbranch(preg, &flags);
 354         if (br == 0)
 355                 return 0;
 356         if (ret != 0)
 357                 regtail(preg, ret, br); /* OPEN -> first. */
 358         else
 359                 ret = br;
 360         if (!(flags&HASWIDTH))
 361                 *flagp &= ~HASWIDTH;
 362         *flagp |= flags&SPSTART;
 363         while (*preg->regparse == '|') {
 364                 preg->regparse++;
 365                 br = regbranch(preg, &flags);
 366                 if (br == 0)
 367                         return 0;
 368                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 369                 if (!(flags&HASWIDTH))
 370                         *flagp &= ~HASWIDTH;
 371                 *flagp |= flags&SPSTART;
 372         }
 373
 374         /* Make a closing node, and hook it on the end. */
 375         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 376         regtail(preg, ret, ender);
 377
 378         /* Hook the tails of the branches to the closing node. */
 379         for (br = ret; br != 0; br = regnext(preg, br))
 380                 regoptail(preg, br, ender);
 381
 382         /* Check for proper termination. */
 383         if (paren && *preg->regparse++ != ')') {
 384                 preg->err = REG_ERR_UNMATCHED_PAREN;
 385                 return 0;
 386         } else if (!paren && *preg->regparse != '\0') {
 387                 if (*preg->regparse == ')') {
 388                         preg->err = REG_ERR_UNMATCHED_PAREN;
 389                         return 0;
 390                 } else {
 391                         preg->err = REG_ERR_JUNK_ON_END;
 392                         return 0;
 393                 }
 394         }
 395
 396         return(ret);
 397 }
 398
 399 /*
 400  - regbranch - one alternative of an | operator
 401  *
 402  * Implements the concatenation operator.
 403  */
 404 static int regbranch(regex_t *preg, int *flagp )
 405 {
 406         int ret;
 407         int chain;
 408         int latest;
 409         int flags;
 410
 411         *flagp = WORST;         /* Tentatively. */
 412
 413         ret = regnode(preg, BRANCH);
 414         chain = 0;
 415         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 416                *preg->regparse != '|') {
 417                 latest = regpiece(preg, &flags);
 418                 if (latest == 0)
 419                         return 0;
 420                 *flagp |= flags&HASWIDTH;
 421                 if (chain == 0) {/* First piece. */
 422                         *flagp |= flags&SPSTART;
 423                 }
 424                 else {
 425                         regtail(preg, chain, latest);
 426                 }
 427                 chain = latest;
 428         }
 429         if (chain == 0) /* Loop ran zero times. */
 430                 (void) regnode(preg, NOTHING);
 431
 432         return(ret);
 433 }
 434
 435 /*
 436  - regpiece - something followed by possible [*+?]
 437  *
 438  * Note that the branching code sequences used for ? and the general cases
 439  * of * and + are somewhat optimized:  they use the same NOTHING node as
 440  * both the endmarker for their branch list and the body of the last branch.
 441  * It might seem that this node could be dispensed with entirely, but the
 442  * endmarker role is not redundant.
 443  */
 444 static int regpiece(regex_t *preg, int *flagp)
 445 {
 446         int ret;
 447         char op;
 448         int next;
 449         int flags;
 450         int min;
 451         int max;
 452
 453         ret = regatom(preg, &flags);
 454         if (ret == 0)
 455                 return 0;
 456
 457         op = *preg->regparse;
 458         if (!ISMULT(op)) {
 459                 *flagp = flags;
 460                 return(ret);
 461         }
 462
 463         if (!(flags&HASWIDTH) && op != '?') {
 464                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 465                 return 0;
 466         }
 467
 468         /* Handle braces (counted repetition) by expansion */
 469         if (op == '{') {
 470                 char *end;
 471
 472                 min = strtoul(preg->regparse + 1, &end, 10);
 473                 if (end == preg->regparse + 1) {
 474                         preg->err = REG_ERR_BAD_COUNT;
 475                         return 0;
 476                 }
 477                 if (*end == '}') {
 478                         max = min;
 479                 }
 480                 else if (*end == '\0') {
 481                         preg->err = REG_ERR_UNMATCHED_BRACES;
 482                         return 0;
 483                 }
 484                 else {
 485                         preg->regparse = end;
 486                         max = strtoul(preg->regparse + 1, &end, 10);
 487                         if (*end != '}') {
 488                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 489                                 return 0;
 490                         }
 491                 }
 492                 if (end == preg->regparse + 1) {
 493                         max = MAX_REP_COUNT;
 494                 }
 495                 else if (max < min || max >= 100) {
 496                         preg->err = REG_ERR_BAD_COUNT;
 497                         return 0;
 498                 }
 499                 if (min >= 100) {
 500                         preg->err = REG_ERR_BAD_COUNT;
 501                         return 0;
 502                 }
 503
 504                 preg->regparse = strchr(preg->regparse, '}');
 505         }
 506         else {
 507                 min = (op == '+');
 508                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 509         }
 510
 511         if (preg->regparse[1] == '?') {
 512                 preg->regparse++;
 513                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 514         }
 515         else {
 516                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 517         }
 518         preg->program[ret + 2] = max;
 519         preg->program[ret + 3] = min;
 520         preg->program[ret + 4] = 0;
 521
 522         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 523
 524         if (!(flags & SIMPLE)) {
 525                 int back = regnode(preg, BACK);
 526                 regtail(preg, back, ret);
 527                 regtail(preg, next, back);
 528         }
 529
 530         preg->regparse++;
 531         if (ISMULT(*preg->regparse)) {
 532                 preg->err = REG_ERR_NESTED_COUNT;
 533                 return 0;
 534         }
 535
 536         return ret;
 537 }
 538
 539 /**
 540  * Add all characters in the inclusive range between lower and upper.
 541  *
 542  * Handles a swapped range (upper < lower).
 543  */
 544 static void reg_addrange(regex_t *preg, int lower, int upper)
 545 {
 546         if (lower > upper) {
 547                 reg_addrange(preg, upper, lower);
 548         }
 549         /* Add a range as length, start */
 550         regc(preg, upper - lower + 1);
 551         regc(preg, lower);
 552 }
 553
 554 /**
 555  * Add a null-terminated literal string as a set of ranges.
 556  */
 557 static void reg_addrange_str(regex_t *preg, const char *str)
 558 {
 559         while (*str) {
 560                 reg_addrange(preg, *str, *str);
 561                 str++;
 562         }
 563 }
 564
 565 /**
 566  * Extracts the next unicode char from utf8.
 567  *
 568  * If 'upper' is set, converts the char to uppercase.
 569  */
 570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 571 {
 572         int l = utf8_tounicode(s, uc);
 573         if (upper) {
 574                 *uc = utf8_upper(*uc);
 575         }
 576         return l;
 577 }
 578
 579 /**
 580  * Converts a hex digit to decimal.
 581  *
 582  * Returns -1 for an invalid hex digit.
 583  */
 584 static int hexdigitval(int c)
 585 {
 586         if (c >= '0' && c <= '9')
 587                 return c - '0';
 588         if (c >= 'a' && c <= 'f')
 589                 return c - 'a' + 10;
 590         if (c >= 'A' && c <= 'F')
 591                 return c - 'A' + 10;
 592         return -1;
 593 }
 594
 595 /**
 596  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 597  *
 598  * Returns the number of hex digits parsed.
 599  * If there are no hex digits, returns 0 and stores nothing.
 600  */
 601 static int parse_hex(const char *s, int n, int *uc)
 602 {
 603         int val = 0;
 604         int k;
 605
 606         for (k = 0; k < n; k++) {
 607                 int c = hexdigitval(*s++);
 608                 if (c == -1) {
 609                         break;
 610                 }
 611                 val = (val << 4) | c;
 612         }
 613         if (k) {
 614                 *uc = val;
 615         }
 616         return k;
 617 }
 618
 619 /**
 620  * Call for chars after a backlash to decode the escape sequence.
 621  *
 622  * Stores the result in *ch.
 623  *
 624  * Returns the number of bytes consumed.
 625  */
 626 static int reg_decode_escape(const char *s, int *ch)
 627 {
 628         int n;
 629         const char *s0 = s;
 630
 631         *ch = *s++;
 632
 633         switch (*ch) {
 634                 case 'b': *ch = '\b'; break;
 635                 case 'e': *ch = 27; break;
 636                 case 'f': *ch = '\f'; break;
 637                 case 'n': *ch = '\n'; break;
 638                 case 'r': *ch = '\r'; break;
 639                 case 't': *ch = '\t'; break;
 640                 case 'v': *ch = '\v'; break;
 641                 case 'u':
 642                         if (*s == '{') {
 643                                 /* Expect \u{NNNN} */
 644                                 n = parse_hex(s + 1, 6, ch);
 645                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 646                                         s += n + 2;
 647                                 }
 648                                 else {
 649                                         /* Invalid, so just treat as an escaped 'u' */
 650                                         *ch = 'u';
 651                                 }
 652                         }
 653                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 654                                 s += n;
 655                         }
 656                         break;
 657                 case 'U':
 658                         if ((n = parse_hex(s, 8, ch)) > 0) {
 659                                 s += n;
 660                         }
 661                         break;
 662                 case 'x':
 663                         if ((n = parse_hex(s, 2, ch)) > 0) {
 664                                 s += n;
 665                         }
 666                         break;
 667                 case '\0':
 668                         s--;
 669                         *ch = '\\';
 670                         break;
 671         }
 672         return s - s0;
 673 }
 674
 675 /*
 676  - regatom - the lowest level
 677  *
 678  * Optimization:  gobbles an entire sequence of ordinary characters so that
 679  * it can turn them into a single node, which is smaller to store and
 680  * faster to run.  Backslashed characters are exceptions, each becoming a
 681  * separate node; the code is simpler that way and it's not worth fixing.
 682  */
 683 static int regatom(regex_t *preg, int *flagp)
 684 {
 685         int ret;
 686         int flags;
 687         int nocase = (preg->cflags & REG_ICASE);
 688
 689         int ch;
 690         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 691
 692         *flagp = WORST;         /* Tentatively. */
 693
 694         preg->regparse += n;
 695         switch (ch) {
 696         /* FIXME: these chars only have meaning at beg/end of pat? */
 697         case '^':
 698                 ret = regnode(preg, BOL);
 699                 break;
 700         case '$':
 701                 ret = regnode(preg, EOL);
 702                 break;
 703         case '.':
 704                 ret = regnode(preg, ANY);
 705                 *flagp |= HASWIDTH|SIMPLE;
 706                 break;
 707         case '[': {
 708                         const char *pattern = preg->regparse;
 709
 710                         if (*pattern == '^') {  /* Complement of range. */
 711                                 ret = regnode(preg, ANYBUT);
 712                                 pattern++;
 713                         } else
 714                                 ret = regnode(preg, ANYOF);
 715
 716                         /* Special case. If the first char is ']' or '-', it is part of the set */
 717                         if (*pattern == ']' || *pattern == '-') {
 718                                 reg_addrange(preg, *pattern, *pattern);
 719                                 pattern++;
 720                         }
 721
 722                         while (*pattern != ']') {
 723                                 /* Is this a range? a-z */
 724                                 int start;
 725                                 int end;
 726
 727                                 enum {
 728                                         CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
 729                                         CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
 730                                         CC_NUM
 731                                 };
 732                                 int cc;
 733
 734                                 if (!*pattern) {
 735                                         preg->err = REG_ERR_UNMATCHED_BRACKET;
 736                                         return 0;
 737                                 }
 738
 739                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 740                                 if (start == '\\') {
 741                                         /* First check for class shorthand escapes */
 742                                         switch (*pattern) {
 743                                                 case 's':
 744                                                         pattern++;
 745                                                         cc = CC_SPACE;
 746                                                         goto cc_switch;
 747                                                 case 'd':
 748                                                         pattern++;
 749                                                         cc = CC_DIGIT;
 750                                                         goto cc_switch;
 751                                                 case 'w':
 752                                                         pattern++;
 753                                                         reg_addrange(preg, '_', '_');
 754                                                         cc = CC_ALNUM;
 755                                                         goto cc_switch;
 756                                         }
 757                                         pattern += reg_decode_escape(pattern, &start);
 758                                         if (start == 0) {
 759                                                 preg->err = REG_ERR_NULL_CHAR;
 760                                                 return 0;
 761                                         }
 762                                         if (start == '\\' && *pattern == 0) {
 763                                                 preg->err = REG_ERR_INVALID_ESCAPE;
 764                                                 return 0;
 765                                         }
 766                                 }
 767                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 768                                         /* skip '-' */
 769                                         pattern += utf8_tounicode(pattern, &end);
 770                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 771                                         if (end == '\\') {
 772                                                 pattern += reg_decode_escape(pattern, &end);
 773                                                 if (end == 0) {
 774                                                         preg->err = REG_ERR_NULL_CHAR;
 775                                                         return 0;
 776                                                 }
 777                                                 if (start == '\\' && *pattern == 0) {
 778                                                         preg->err = REG_ERR_INVALID_ESCAPE;
 779                                                         return 0;
 780                                                 }
 781                                         }
 782
 783                                         reg_addrange(preg, start, end);
 784                                         continue;
 785                                 }
 786                                 if (start == '[' && pattern[0] == ':') {
 787                                         static const char *character_class[] = {
 788                                                 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
 789                                                 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
 790                                         };
 791
 792                                         for (cc = 0; cc < CC_NUM; cc++) {
 793                                                 n = strlen(character_class[cc]);
 794                                                 if (strncmp(pattern, character_class[cc], n) == 0) {
 795                                                         /* Found a character class */
 796                                                         pattern += n + 1;
 797                                                         break;
 798                                                 }
 799                                         }
 800                                         if (cc != CC_NUM) {
 801 cc_switch:
 802                                                 switch (cc) {
 803                                                         case CC_ALNUM:
 804                                                                 reg_addrange(preg, '0', '9');
 805                                                                 /* Fall through */
 806                                                         case CC_ALPHA:
 807                                                                 if ((preg->cflags & REG_ICASE) == 0) {
 808                                                                         reg_addrange(preg, 'a', 'z');
 809                                                                 }
 810                                                                 reg_addrange(preg, 'A', 'Z');
 811                                                                 break;
 812                                                         case CC_SPACE:
 813                                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 814                                                                 break;
 815                                                         case CC_BLANK:
 816                                                                 reg_addrange_str(preg, " \t");
 817                                                                 break;
 818                                                         case CC_UPPER:
 819                                                                 reg_addrange(preg, 'A', 'Z');
 820                                                                 break;
 821                                                         case CC_LOWER:
 822                                                                 reg_addrange(preg, 'a', 'z');
 823                                                                 break;
 824                                                         case CC_XDIGIT:
 825                                                                 reg_addrange(preg, 'a', 'f');
 826                                                                 reg_addrange(preg, 'A', 'F');
 827                                                                 /* Fall through */
 828                                                         case CC_DIGIT:
 829                                                                 reg_addrange(preg, '0', '9');
 830                                                                 break;
 831                                                         case CC_CNTRL:
 832                                                                 reg_addrange(preg, 0, 31);
 833                                                                 reg_addrange(preg, 127, 127);
 834                                                                 break;
 835                                                         case CC_PRINT:
 836                                                                 reg_addrange(preg, ' ', '~');
 837                                                                 break;
 838                                                         case CC_GRAPH:
 839                                                                 reg_addrange(preg, '!', '~');
 840                                                                 break;
 841                                                         case CC_PUNCT:
 842                                                                 reg_addrange(preg, '!', '/');
 843                                                                 reg_addrange(preg, ':', '@');
 844                                                                 reg_addrange(preg, '[', '`');
 845                                                                 reg_addrange(preg, '{', '~');
 846                                                                 break;
 847                                                 }
 848                                                 continue;
 849                                         }
 850                                 }
 851                                 /* Not a range, so just add the char */
 852                                 reg_addrange(preg, start, start);
 853                         }
 854                         regc(preg, '\0');
 855
 856                         if (*pattern) {
 857                                 pattern++;
 858                         }
 859                         preg->regparse = pattern;
 860
 861                         *flagp |= HASWIDTH|SIMPLE;
 862                 }
 863                 break;
 864         case '(':
 865                 ret = reg(preg, 1, &flags);
 866                 if (ret == 0)
 867                         return 0;
 868                 *flagp |= flags&(HASWIDTH|SPSTART);
 869                 break;
 870         case '\0':
 871         case '|':
 872         case ')':
 873                 preg->err = REG_ERR_INTERNAL;
 874                 return 0;       /* Supposed to be caught earlier. */
 875         case '?':
 876         case '+':
 877         case '*':
 878         case '{':
 879                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 880                 return 0;
 881         case '\\':
 882                 ch = *preg->regparse++;
 883                 switch (ch) {
 884                 case '\0':
 885                         preg->err = REG_ERR_INVALID_ESCAPE;
 886                         return 0;
 887                 case 'A':
 888                         ret = regnode(preg, BOLX);
 889                         break;
 890                 case 'Z':
 891                         ret = regnode(preg, EOLX);
 892                         break;
 893                 case '<':
 894                 case 'm':
 895                         ret = regnode(preg, WORDA);
 896                         break;
 897                 case '>':
 898                 case 'M':
 899                         ret = regnode(preg, WORDZ);
 900                         break;
 901                 case 'd':
 902                 case 'D':
 903                         ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
 904                         reg_addrange(preg, '0', '9');
 905                         regc(preg, '\0');
 906                         *flagp |= HASWIDTH|SIMPLE;
 907                         break;
 908                 case 'w':
 909                 case 'W':
 910                         ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
 911                         if ((preg->cflags & REG_ICASE) == 0) {
 912                                 reg_addrange(preg, 'a', 'z');
 913                         }
 914                         reg_addrange(preg, 'A', 'Z');
 915                         reg_addrange(preg, '0', '9');
 916                         reg_addrange(preg, '_', '_');
 917                         regc(preg, '\0');
 918                         *flagp |= HASWIDTH|SIMPLE;
 919                         break;
 920                 case 's':
 921                 case 'S':
 922                         ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
 923                         reg_addrange_str(preg," \t\r\n\f\v");
 924                         regc(preg, '\0');
 925                         *flagp |= HASWIDTH|SIMPLE;
 926                         break;
 927                 /* FIXME: Someday handle \1, \2, ... */
 928                 default:
 929                         /* Handle general quoted chars in exact-match routine */
 930                         /* Back up to include the backslash */
 931                         preg->regparse--;
 932                         goto de_fault;
 933                 }
 934                 break;
 935         de_fault:
 936         default: {
 937                         /*
 938                          * Encode a string of characters to be matched exactly.
 939                          */
 940                         int added = 0;
 941
 942                         /* Back up to pick up the first char of interest */
 943                         preg->regparse -= n;
 944
 945                         ret = regnode(preg, EXACTLY);
 946
 947                         /* Note that a META operator such as ? or * consumes the
 948                          * preceding char.
 949                          * Thus we must be careful to look ahead by 2 and add the
 950                          * last char as it's own EXACTLY if necessary
 951                          */
 952
 953                         /* Until end of string or a META char is reached */
 954                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 955                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 956                                 if (ch == '\\' && preg->regparse[n]) {
 957                                         /* Non-trailing backslash.
 958                                          * Is this a special escape, or a regular escape?
 959                                          */
 960                                         if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
 961                                                 /* A special escape. All done with EXACTLY */
 962                                                 break;
 963                                         }
 964                                         /* Decode it. Note that we add the length for the escape
 965                                          * sequence to the length for the backlash so we can skip
 966                                          * the entire sequence, or not as required.
 967                                          */
 968                                         n += reg_decode_escape(preg->regparse + n, &ch);
 969                                         if (ch == 0) {
 970                                                 preg->err = REG_ERR_NULL_CHAR;
 971                                                 return 0;
 972                                         }
 973                                 }
 974
 975                                 /* Now we have one char 'ch' of length 'n'.
 976                                  * Check to see if the following char is a MULT
 977                                  */
 978
 979                                 if (ISMULT(preg->regparse[n])) {
 980                                         /* Yes. But do we already have some EXACTLY chars? */
 981                                         if (added) {
 982                                                 /* Yes, so return what we have and pick up the current char next time around */
 983                                                 break;
 984                                         }
 985                                         /* No, so add this single char and finish */
 986                                         regc(preg, ch);
 987                                         added++;
 988                                         preg->regparse += n;
 989                                         break;
 990                                 }
 991
 992                                 /* No, so just add this char normally */
 993                                 regc(preg, ch);
 994                                 added++;
 995                                 preg->regparse += n;
 996                         }
 997                         regc(preg, '\0');
 998
 999                         *flagp |= HASWIDTH;
1000                         if (added == 1)
1001                                 *flagp |= SIMPLE;
1002                         break;
1003                 }
1004                 break;
1005         }
1006
1007         return(ret);
1008 }
1009
1010 static void reg_grow(regex_t *preg, int n)
1011 {
1012         if (preg->p + n >= preg->proglen) {
1013                 preg->proglen = (preg->p + n) * 2;
1014                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1015         }
1016 }
1017
1018 /*
1019  - regnode - emit a node
1020  */
1021 /* Location. */
1022 static int regnode(regex_t *preg, int op)
1023 {
1024         reg_grow(preg, 2);
1025
1026         /* The OP followed by a next pointer */
1027         preg->program[preg->p++] = op;
1028         preg->program[preg->p++] = 0;
1029
1030         /* Return the start of the node */
1031         return preg->p - 2;
1032 }
1033
1034 /*
1035  - regc - emit (if appropriate) a byte of code
1036  */
1037 static void regc(regex_t *preg, int b )
1038 {
1039         reg_grow(preg, 1);
1040         preg->program[preg->p++] = b;
1041 }
1042
1043 /*
1044  - reginsert - insert an operator in front of already-emitted operand
1045  *
1046  * Means relocating the operand.
1047  * Returns the new location of the original operand.
1048  */
1049 static int reginsert(regex_t *preg, int op, int size, int opnd )
1050 {
1051         reg_grow(preg, size);
1052
1053         /* Move everything from opnd up */
1054         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1055         /* Zero out the new space */
1056         memset(preg->program + opnd, 0, sizeof(int) * size);
1057
1058         preg->program[opnd] = op;
1059
1060         preg->p += size;
1061
1062         return opnd + size;
1063 }
1064
1065 /*
1066  - regtail - set the next-pointer at the end of a node chain
1067  */
1068 static void regtail(regex_t *preg, int p, int val)
1069 {
1070         int scan;
1071         int temp;
1072         int offset;
1073
1074         /* Find last node. */
1075         scan = p;
1076         for (;;) {
1077                 temp = regnext(preg, scan);
1078                 if (temp == 0)
1079                         break;
1080                 scan = temp;
1081         }
1082
1083         if (OP(preg, scan) == BACK)
1084                 offset = scan - val;
1085         else
1086                 offset = val - scan;
1087
1088         preg->program[scan + 1] = offset;
1089 }
1090
1091 /*
1092  - regoptail - regtail on operand of first argument; nop if operandless
1093  */
1094
1095 static void regoptail(regex_t *preg, int p, int val )
1096 {
1097         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1098         if (p != 0 && OP(preg, p) == BRANCH) {
1099                 regtail(preg, OPERAND(p), val);
1100         }
1101 }
1102
1103 /*
1104  * regexec and friends
1105  */
1106
1107 /*
1108  * Forwards.
1109  */
1110 static int regtry(regex_t *preg, const char *string );
1111 static int regmatch(regex_t *preg, int prog);
1112 static int regrepeat(regex_t *preg, int p, int max);
1113
1114 /*
1115  - regexec - match a regexp against a string
1116  */
1117 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1118 {
1119         const char *s;
1120         int scan;
1121
1122         /* Be paranoid... */
1123         if (preg == NULL || preg->program == NULL || string == NULL) {
1124                 return REG_ERR_NULL_ARGUMENT;
1125         }
1126
1127         /* Check validity of program. */
1128         if (*preg->program != REG_MAGIC) {
1129                 return REG_ERR_CORRUPTED;
1130         }
1131
1132 #ifdef DEBUG
1133         fprintf(stderr, "regexec: %s\n", string);
1134         regdump(preg);
1135 #endif
1136
1137         preg->eflags = eflags;
1138         preg->pmatch = pmatch;
1139         preg->nmatch = nmatch;
1140         preg->start = string;   /* All offsets are computed from here */
1141
1142         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1143         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1144                 int op = OP(preg, scan);
1145                 if (op == END)
1146                         break;
1147                 if (op == REPX || op == REPXMIN)
1148                         preg->program[scan + 4] = 0;
1149         }
1150
1151         /* If there is a "must appear" string, look for it. */
1152         if (preg->regmust != 0) {
1153                 s = string;
1154                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1155                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1156                                 break;
1157                         }
1158                         s++;
1159                 }
1160                 if (s == NULL)  /* Not present. */
1161                         return REG_NOMATCH;
1162         }
1163
1164         /* Mark beginning of line for ^ . */
1165         preg->regbol = string;
1166
1167         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1168         if (preg->reganch) {
1169                 if (eflags & REG_NOTBOL) {
1170                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1171                         goto nextline;
1172                 }
1173                 while (1) {
1174                         if (regtry(preg, string)) {
1175                                 return REG_NOERROR;
1176                         }
1177                         if (*string) {
1178 nextline:
1179                                 if (preg->cflags & REG_NEWLINE) {
1180                                         /* Try the next anchor? */
1181                                         string = strchr(string, '\n');
1182                                         if (string) {
1183                                                 preg->regbol = ++string;
1184                                                 continue;
1185                                         }
1186                                 }
1187                         }
1188                         return REG_NOMATCH;
1189                 }
1190         }
1191
1192         /* Messy cases:  unanchored match. */
1193         s = string;
1194         if (preg->regstart != '\0') {
1195                 /* We know what char it must start with. */
1196                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1197                         if (regtry(preg, s))
1198                                 return REG_NOERROR;
1199                         s++;
1200                 }
1201         }
1202         else
1203                 /* We don't -- general case. */
1204                 while (1) {
1205                         if (regtry(preg, s))
1206                                 return REG_NOERROR;
1207                         if (*s == '\0') {
1208                                 break;
1209                         }
1210                         else {
1211                                 int c;
1212                                 s += utf8_tounicode(s, &c);
1213                         }
1214                 }
1215
1216         /* Failure. */
1217         return REG_NOMATCH;
1218 }
1219
1220 /*
1221  - regtry - try match at specific point
1222  */
1223                         /* 0 failure, 1 success */
1224 static int regtry( regex_t *preg, const char *string )
1225 {
1226         int i;
1227
1228         preg->reginput = string;
1229
1230         for (i = 0; i < preg->nmatch; i++) {
1231                 preg->pmatch[i].rm_so = -1;
1232                 preg->pmatch[i].rm_eo = -1;
1233         }
1234         if (regmatch(preg, 1)) {
1235                 preg->pmatch[0].rm_so = string - preg->start;
1236                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1237                 return(1);
1238         } else
1239                 return(0);
1240 }
1241
1242 /**
1243  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1244  *
1245  * If 'nocase' is non-zero, does a case-insensitive match.
1246  *
1247  * Returns -1 on not found.
1248  */
1249 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1250 {
1251         const char *s = string;
1252         while (proglen && *s) {
1253                 int ch;
1254                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1255                 if (ch != *prog) {
1256                         return -1;
1257                 }
1258                 prog++;
1259                 s += n;
1260                 proglen--;
1261         }
1262         if (proglen == 0) {
1263                 return s - string;
1264         }
1265         return -1;
1266 }
1267
1268 /**
1269  * Searchs for 'c' in the range 'range'.
1270  *
1271  * Returns 1 if found, or 0 if not.
1272  */
1273 static int reg_range_find(const int *range, int c)
1274 {
1275         while (*range) {
1276                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1277                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1278                         return 1;
1279                 }
1280                 range += 2;
1281         }
1282         return 0;
1283 }
1284
1285 /**
1286  * Search for the character 'c' in the utf-8 string 'string'.
1287  *
1288  * If 'nocase' is set, the 'string' is assumed to be uppercase
1289  * and 'c' is converted to uppercase before matching.
1290  *
1291  * Returns the byte position in the string where the 'c' was found, or
1292  * NULL if not found.
1293  */
1294 static const char *str_find(const char *string, int c, int nocase)
1295 {
1296         if (nocase) {
1297                 /* The "string" should already be converted to uppercase */
1298                 c = utf8_upper(c);
1299         }
1300         while (*string) {
1301                 int ch;
1302                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1303                 if (c == ch) {
1304                         return string;
1305                 }
1306                 string += n;
1307         }
1308         return NULL;
1309 }
1310
1311 /**
1312  * Returns true if 'ch' is an end-of-line char.
1313  *
1314  * In REG_NEWLINE mode, \n is considered EOL in
1315  * addition to \0
1316  */
1317 static int reg_iseol(regex_t *preg, int ch)
1318 {
1319         if (preg->cflags & REG_NEWLINE) {
1320                 return ch == '\0' || ch == '\n';
1321         }
1322         else {
1323                 return ch == '\0';
1324         }
1325 }
1326
1327 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1328 {
1329         int nextch = '\0';
1330         const char *save;
1331         int no;
1332         int c;
1333
1334         int max = preg->program[scan + 2];
1335         int min = preg->program[scan + 3];
1336         int next = regnext(preg, scan);
1337
1338         /*
1339          * Lookahead to avoid useless match attempts
1340          * when we know what character comes next.
1341          */
1342         if (OP(preg, next) == EXACTLY) {
1343                 nextch = preg->program[OPERAND(next)];
1344         }
1345         save = preg->reginput;
1346         no = regrepeat(preg, scan + 5, max);
1347         if (no < min) {
1348                 return 0;
1349         }
1350         if (matchmin) {
1351                 /* from min up to no */
1352                 max = no;
1353                 no = min;
1354         }
1355         /* else from no down to min */
1356         while (1) {
1357                 if (matchmin) {
1358                         if (no > max) {
1359                                 break;
1360                         }
1361                 }
1362                 else {
1363                         if (no < min) {
1364                                 break;
1365                         }
1366                 }
1367                 preg->reginput = save + utf8_index(save, no);
1368                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1369                 /* If it could work, try it. */
1370                 if (reg_iseol(preg, nextch) || c == nextch) {
1371                         if (regmatch(preg, next)) {
1372                                 return(1);
1373                         }
1374                 }
1375                 if (matchmin) {
1376                         /* Couldn't or didn't, add one more */
1377                         no++;
1378                 }
1379                 else {
1380                         /* Couldn't or didn't -- back up. */
1381                         no--;
1382                 }
1383         }
1384         return(0);
1385 }
1386
1387 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1388 {
1389         int *scanpt = preg->program + scan;
1390
1391         int max = scanpt[2];
1392         int min = scanpt[3];
1393
1394         /* Have we reached min? */
1395         if (scanpt[4] < min) {
1396                 /* No, so get another one */
1397                 scanpt[4]++;
1398                 if (regmatch(preg, scan + 5)) {
1399                         return 1;
1400                 }
1401                 scanpt[4]--;
1402                 return 0;
1403         }
1404         if (scanpt[4] > max) {
1405                 return 0;
1406         }
1407
1408         if (matchmin) {
1409                 /* minimal, so try other branch first */
1410                 if (regmatch(preg, regnext(preg, scan))) {
1411                         return 1;
1412                 }
1413                 /* No, so try one more */
1414                 scanpt[4]++;
1415                 if (regmatch(preg, scan + 5)) {
1416                         return 1;
1417                 }
1418                 scanpt[4]--;
1419                 return 0;
1420         }
1421         /* maximal, so try this branch again */
1422         if (scanpt[4] < max) {
1423                 scanpt[4]++;
1424                 if (regmatch(preg, scan + 5)) {
1425                         return 1;
1426                 }
1427                 scanpt[4]--;
1428         }
1429         /* At this point we are at max with no match. Try the other branch */
1430         return regmatch(preg, regnext(preg, scan));
1431 }
1432
1433 /*
1434  - regmatch - main matching routine
1435  *
1436  * Conceptually the strategy is simple:  check to see whether the current
1437  * node matches, call self recursively to see whether the rest matches,
1438  * and then act accordingly.  In practice we make some effort to avoid
1439  * recursion, in particular by going through "ordinary" nodes (that don't
1440  * need to know whether the rest of the match failed) by a loop instead of
1441  * by recursion.
1442  */
1443 /* 0 failure, 1 success */
1444 static int regmatch(regex_t *preg, int prog)
1445 {
1446         int scan;       /* Current node. */
1447         int next;               /* Next node. */
1448         const char *save;
1449
1450         scan = prog;
1451
1452 #ifdef DEBUG
1453         if (scan != 0 && regnarrate)
1454                 fprintf(stderr, "%s(\n", regprop(scan));
1455 #endif
1456         while (scan != 0) {
1457                 int n;
1458                 int c;
1459 #ifdef DEBUG
1460                 if (regnarrate) {
1461                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1462                 }
1463 #endif
1464                 next = regnext(preg, scan);
1465                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1466
1467                 switch (OP(preg, scan)) {
1468                 case BOLX:
1469                         if ((preg->eflags & REG_NOTBOL)) {
1470                                 return(0);
1471                         }
1472                         /* Fall through */
1473                 case BOL:
1474                         if (preg->reginput != preg->regbol) {
1475                                 return(0);
1476                         }
1477                         break;
1478                 case EOLX:
1479                         if (c != 0) {
1480                                 /* For EOLX, only match real end of line, not newline */
1481                                 return 0;
1482                         }
1483                         break;
1484                 case EOL:
1485                         if (!reg_iseol(preg, c)) {
1486                                 return(0);
1487                         }
1488                         break;
1489                 case WORDA:
1490                         /* Must be looking at a letter, digit, or _ */
1491                         if ((!isalnum(UCHAR(c))) && c != '_')
1492                                 return(0);
1493                         /* Prev must be BOL or nonword */
1494                         if (preg->reginput > preg->regbol &&
1495                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1496                                 return(0);
1497                         break;
1498                 case WORDZ:
1499                         /* Can't match at BOL */
1500                         if (preg->reginput > preg->regbol) {
1501                                 /* Current must be EOL or nonword */
1502                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1503                                         c = preg->reginput[-1];
1504                                         /* Previous must be word */
1505                                         if (isalnum(UCHAR(c)) || c == '_') {
1506                                                 break;
1507                                         }
1508                                 }
1509                         }
1510                         /* No */
1511                         return(0);
1512
1513                 case ANY:
1514                         if (reg_iseol(preg, c))
1515                                 return 0;
1516                         preg->reginput += n;
1517                         break;
1518                 case EXACTLY: {
1519                                 int opnd;
1520                                 int len;
1521                                 int slen;
1522
1523                                 opnd = OPERAND(scan);
1524                                 len = str_int_len(preg->program + opnd);
1525
1526                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1527                                 if (slen < 0) {
1528                                         return(0);
1529                                 }
1530                                 preg->reginput += slen;
1531                         }
1532                         break;
1533                 case ANYOF:
1534                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1535                                 return(0);
1536                         }
1537                         preg->reginput += n;
1538                         break;
1539                 case ANYBUT:
1540                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1541                                 return(0);
1542                         }
1543                         preg->reginput += n;
1544                         break;
1545                 case NOTHING:
1546                         break;
1547                 case BACK:
1548                         break;
1549                 case BRANCH:
1550                         if (OP(preg, next) != BRANCH)           /* No choice. */
1551                                 next = OPERAND(scan);   /* Avoid recursion. */
1552                         else {
1553                                 do {
1554                                         save = preg->reginput;
1555                                         if (regmatch(preg, OPERAND(scan))) {
1556                                                 return(1);
1557                                         }
1558                                         preg->reginput = save;
1559                                         scan = regnext(preg, scan);
1560                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1561                                 return(0);
1562                                 /* NOTREACHED */
1563                         }
1564                         break;
1565                 case REP:
1566                 case REPMIN:
1567                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1568
1569                 case REPX:
1570                 case REPXMIN:
1571                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1572
1573                 case END:
1574                         return 1;       /* Success! */
1575
1576                 case OPENNC:
1577                 case CLOSENC:
1578                         return regmatch(preg, next);
1579
1580                 default:
1581                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1582                                 save = preg->reginput;
1583                                 if (regmatch(preg, next)) {
1584                                         if (OP(preg, scan) < CLOSE) {
1585                                                 int no = OP(preg, scan) - OPEN;
1586                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1587                                                         preg->pmatch[no].rm_so = save - preg->start;
1588                                                 }
1589                                         }
1590                                         else {
1591                                                 int no = OP(preg, scan) - CLOSE;
1592                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1593                                                         preg->pmatch[no].rm_eo = save - preg->start;
1594                                                 }
1595                                         }
1596                                         return(1);
1597                                 }
1598                                 /* Restore input position after failure */
1599                                 preg->reginput = save;
1600                                 return(0);
1601                         }
1602                         return REG_ERR_INTERNAL;
1603                 }
1604
1605                 scan = next;
1606         }
1607
1608         /*
1609          * We get here only if there's trouble -- normally "case END" is
1610          * the terminating point.
1611          */
1612         return REG_ERR_INTERNAL;
1613 }
1614
1615 /*
1616  - regrepeat - repeatedly match something simple, report how many
1617  */
1618 static int regrepeat(regex_t *preg, int p, int max)
1619 {
1620         int count = 0;
1621         const char *scan;
1622         int opnd;
1623         int ch;
1624         int n;
1625
1626         scan = preg->reginput;
1627         opnd = OPERAND(p);
1628         switch (OP(preg, p)) {
1629         case ANY:
1630                 while (!reg_iseol(preg, *scan) && count < max) {
1631                         count++;
1632                         scan += utf8_charlen(*scan);
1633                 }
1634                 break;
1635         case EXACTLY:
1636                 while (count < max) {
1637                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1638                         if (preg->program[opnd] != ch) {
1639                                 break;
1640                         }
1641                         count++;
1642                         scan += n;
1643                 }
1644                 break;
1645         case ANYOF:
1646                 while (count < max) {
1647                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1648                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1649                                 break;
1650                         }
1651                         count++;
1652                         scan += n;
1653                 }
1654                 break;
1655         case ANYBUT:
1656                 while (count < max) {
1657                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1658                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1659                                 break;
1660                         }
1661                         count++;
1662                         scan += n;
1663                 }
1664                 break;
1665         default:                /* Oh dear.  Called inappropriately. */
1666                 preg->err = REG_ERR_INTERNAL;
1667                 count = 0;      /* Best compromise. */
1668                 break;
1669         }
1670         preg->reginput = scan;
1671
1672         return(count);
1673 }
1674
1675 /*
1676  - regnext - dig the "next" pointer out of a node
1677  */
1678 static int regnext(regex_t *preg, int p )
1679 {
1680         int offset;
1681
1682         offset = NEXT(preg, p);
1683
1684         if (offset == 0)
1685                 return 0;
1686
1687         if (OP(preg, p) == BACK)
1688                 return(p-offset);
1689         else
1690                 return(p+offset);
1691 }
1692
1693 /*
1694  - regopsize - returns the size of opcode + operands at 'p' in words
1695  */
1696 static int regopsize(regex_t *preg, int p )
1697 {
1698         /* Almost all opcodes are 2 words, but some are more */
1699         switch (OP(preg, p)) {
1700                 case REP:
1701                 case REPMIN:
1702                 case REPX:
1703                 case REPXMIN:
1704                         return 5;
1705
1706                 case ANYOF:
1707                 case ANYBUT:
1708                 case EXACTLY: {
1709                         int s = p + 2;
1710                         while (preg->program[s++]) {
1711                         }
1712                         return s - p;
1713                 }
1714         }
1715         return 2;
1716 }
1717
1718 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1719
1720 /*
1721  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1722  */
1723 static void regdump(regex_t *preg)
1724 {
1725         int s;
1726         int op = EXACTLY;       /* Arbitrary non-END op. */
1727         int next;
1728         char buf[MAX_UTF8_LEN + 1];
1729
1730         int i;
1731         for (i = 1; i < preg->p; i++) {
1732                 printf("%02x ", (unsigned char)preg->program[i]);
1733                 if (i % 16 == 0) {
1734                         printf("\n");
1735                 }
1736         }
1737         printf("\n");
1738
1739         s = 1;
1740         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1741                 op = OP(preg, s);
1742                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1743                 next = regnext(preg, s);
1744                 if (next == 0)          /* Next ptr. */
1745                         printf("(0)");
1746                 else
1747                         printf("(%d)", next);
1748                 s += 2;
1749                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1750                         int max = preg->program[s];
1751                         int min = preg->program[s + 1];
1752                         if (max == 65535) {
1753                                 printf("{%d,*}", min);
1754                         }
1755                         else {
1756                                 printf("{%d,%d}", min, max);
1757                         }
1758                         printf(" %d", preg->program[s + 2]);
1759                         s += 3;
1760                 }
1761                 else if (op == ANYOF || op == ANYBUT) {
1762                         /* set of ranges */
1763
1764                         while (preg->program[s]) {
1765                                 int len = preg->program[s++];
1766                                 int first = preg->program[s++];
1767                                 buf[utf8_getchars(buf, first)] = 0;
1768                                 printf("%s", buf);
1769                                 if (len > 1) {
1770                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1771                                         printf("-%s", buf);
1772                                 }
1773                         }
1774                         s++;
1775                 }
1776                 else if (op == EXACTLY) {
1777                         /* Literal string, where present. */
1778
1779                         while (preg->program[s]) {
1780                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1781                                 printf("%s", buf);
1782                                 s++;
1783                         }
1784                         s++;
1785                 }
1786                 putchar('\n');
1787         }
1788
1789         if (op == END) {
1790                 /* Header fields of interest. */
1791                 if (preg->regstart) {
1792                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1793                         printf("start '%s' ", buf);
1794                 }
1795                 if (preg->reganch)
1796                         printf("anchored ");
1797                 if (preg->regmust != 0) {
1798                         int i;
1799                         printf("must have:");
1800                         for (i = 0; i < preg->regmlen; i++) {
1801                                 putchar(preg->program[preg->regmust + i]);
1802                         }
1803                         putchar('\n');
1804                 }
1805         }
1806         printf("\n");
1807 }
1808
1809 /*
1810  - regprop - printable representation of opcode
1811  */
1812 static const char *regprop( int op )
1813 {
1814         static char buf[50];
1815
1816         switch (op) {
1817         case BOL:
1818                 return "BOL";
1819         case EOL:
1820                 return "EOL";
1821         case BOLX:
1822                 return "BOLX";
1823         case EOLX:
1824                 return "EOLX";
1825         case ANY:
1826                 return "ANY";
1827         case ANYOF:
1828                 return "ANYOF";
1829         case ANYBUT:
1830                 return "ANYBUT";
1831         case BRANCH:
1832                 return "BRANCH";
1833         case EXACTLY:
1834                 return "EXACTLY";
1835         case NOTHING:
1836                 return "NOTHING";
1837         case BACK:
1838                 return "BACK";
1839         case END:
1840                 return "END";
1841         case REP:
1842                 return "REP";
1843         case REPMIN:
1844                 return "REPMIN";
1845         case REPX:
1846                 return "REPX";
1847         case REPXMIN:
1848                 return "REPXMIN";
1849         case WORDA:
1850                 return "WORDA";
1851         case WORDZ:
1852                 return "WORDZ";
1853         case OPENNC:
1854                 return "OPEN";
1855         case CLOSENC:
1856                 return "CLOSE";
1857         default:
1858                 if (op >= OPEN && op < CLOSE) {
1859                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1860                 }
1861                 else if (op >= CLOSE && op < CLOSE_END) {
1862                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1863                 }
1864                 else {
1865                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1866                 }
1867                 return(buf);
1868         }
1869 }
1870 #endif /* JIM_BOOTSTRAP */
1871
1872 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1873 {
1874         static const char *error_strings[] = {
1875                 "success",
1876                 "no match",
1877                 "bad pattern",
1878                 "null argument",
1879                 "unknown error",
1880                 "too big",
1881                 "out of memory",
1882                 "too many ()",
1883                 "parentheses () not balanced",
1884                 "braces {} not balanced",
1885                 "invalid repetition count(s)",
1886                 "extra characters",
1887                 "*+ of empty atom",
1888                 "nested count",
1889                 "internal error",
1890                 "count follows nothing",
1891                 "invalid escape \\ sequence",
1892                 "corrupted program",
1893                 "contains null char",
1894                 "brackets [] not balanced",
1895         };
1896         const char *err;
1897
1898         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1899                 err = "Bad error code";
1900         }
1901         else {
1902                 err = error_strings[errcode];
1903         }
1904
1905         return snprintf(errbuf, errbuf_size, "%s", err);
1906 }
1907
1908 void regfree(regex_t *preg)
1909 {
1910         free(preg->program);
1911 }
1912
1913 #endif