jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101 #define BOLX    14      /* no   Match "" at beginning of input. */
 102 #define EOLX    15      /* no   Match "" at end of input. */
 103 #define WORDA   16      /* no   Match "" at wordchar, where prev is nonword */
 104 #define WORDZ   17      /* no   Match "" at nonwordchar, where prev is word */
 105
 106 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 107 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 108                         /*      OPEN+1 is number 1, etc. */
 109
 110 /* must not be any other opts between OPEN and CLOSE */
 111
 112 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 113 #define CLOSE   2001    /* no   Analogous to OPEN. */
 114 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 115
 116 /*
 117  * The first word of the regexp internal "program" is actually this magic
 118  * number; the start node begins in the second word.
 119  */
 120 #define REG_MAGIC       0xFADED00D
 121
 122 /*
 123  * Opcode notes:
 124  *
 125  * BRANCH       The set of branches constituting a single choice are hooked
 126  *              together with their "next" pointers, since precedence prevents
 127  *              anything being concatenated to any individual branch.  The
 128  *              "next" pointer of the last BRANCH in a choice points to the
 129  *              thing following the whole choice.  This is also where the
 130  *              final "next" pointer of each individual branch points; each
 131  *              branch starts with the operand node of a BRANCH node.
 132  *
 133  * BACK         Normal "next" pointers all implicitly point forward; BACK
 134  *              exists to make loop structures possible.
 135  *
  136  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
  137  *              as either simple repeats (REP) or complex repeats (REPX).
  138  *              The opcode and "next" words are followed by a "max" word, a "min"
  139  *              word and a fifth "current count" word that is only used by REPX,
  140  *              as it implements a recursive match.
  141  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 142  *
 143  * OPEN,CLOSE   ...are numbered at compile time.
 144  */
 145
 146 /*
 147  * A node is one word of opcode followed by one word of "next" pointer.
 148  * The "next" pointer value is a positive offset from the opcode of the node
 149  * containing it.
 150  * An operand, if any, simply follows the node.  (Note that much of the
 151  * code generation knows about this implicit relationship.)
 152  */
 153 #define OP(preg, p)     (preg->program[p])
 154 #define NEXT(preg, p)   (preg->program[p + 1])
 155 #define OPERAND(p)      ((p) + 2)
 156
 157 /*
  158  * See the REG_MAGIC definition above for one further detail of program structure.
 159  */
 160
 161
 162 /*
 163  * Utility definitions.
 164  */
 165
 166 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 167 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 168 #define META            "^$.[()|?{+*"
 169
 170 /*
 171  * Flags to be passed up and down.
 172  */
 173 #define HASWIDTH        1       /* Known never to match null string. */
 174 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 175 #define SPSTART         4       /* Starts with * or +. */
 176 #define WORST           0       /* Worst case. */
 177
 178 #define MAX_REP_COUNT 1000000
 179
 180 /*
 181  * Forward declarations for regcomp()'s friends.
 182  */
 183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 184 static int regpiece(regex_t *preg, int *flagp );
 185 static int regbranch(regex_t *preg, int *flagp );
 186 static int regatom(regex_t *preg, int *flagp );
 187 static int regnode(regex_t *preg, int op );
 188 static int regnext(regex_t *preg, int p );
 189 static void regc(regex_t *preg, int b );
 190 static int reginsert(regex_t *preg, int op, int size, int opnd );
 191 static void regtail(regex_t *preg, int p, int val);
 192 static void regoptail(regex_t *preg, int p, int val );
 193 static int regopsize(regex_t *preg, int p );
 194
 195 static int reg_range_find(const int *string, int c);
 196 static const char *str_find(const char *string, int c, int nocase);
 197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 198
 199 /*#define DEBUG*/
 200 #ifdef DEBUG
 201 static int regnarrate = 0;
 202 static void regdump(regex_t *preg);
 203 static const char *regprop( int op );
 204 #endif
 205
 206
 207 /**
 208  * Returns the length of the null-terminated integer sequence.
 209  */
 210 static int str_int_len(const int *seq)
 211 {
 212         int n = 0;
 213         while (*seq++) {
 214                 n++;
 215         }
 216         return n;
 217 }
 218
 219 /*
 220  - regcomp - compile a regular expression into internal code
 221  *
 222  * We can't allocate space until we know how big the compiled form will be,
 223  * but we can't compile it (and thus know how big it is) until we've got a
 224  * place to put the code.  So we cheat:  we compile it twice, once with code
 225  * generation turned off and size counting turned on, and once "for real".
 226  * This also means that we don't allocate space until we are sure that the
 227  * thing really will compile successfully, and we never have to move the
 228  * code and thus invalidate pointers into it.  (Note that it has to be in
 229  * one piece because free() must be able to free it all.)
 230  *
 231  * Beware that the optimization-preparation code in here knows about some
 232  * of the structure of the compiled regexp.
 233  */
 234 int regcomp(regex_t *preg, const char *exp, int cflags)
 235 {
 236         int scan;
 237         int longest;
 238         unsigned len;
 239         int flags;
 240
 241 #ifdef DEBUG
 242         fprintf(stderr, "Compiling: '%s'\n", exp);
 243 #endif
 244         memset(preg, 0, sizeof(*preg));
 245
 246         if (exp == NULL)
 247                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 248
 249         /* First pass: determine size, legality. */
 250         preg->cflags = cflags;
 251         preg->regparse = exp;
 252
 253         /* Allocate space. */
 254         preg->proglen = (strlen(exp) + 1) * 5;
 255         preg->program = malloc(preg->proglen * sizeof(int));
 256         if (preg->program == NULL)
 257                 FAIL(preg, REG_ERR_NOMEM);
 258
 259         /* Note that since we store a magic value as the first item in the program,
 260          * program offsets will never be 0
 261          */
 262         regc(preg, REG_MAGIC);
 263         if (reg(preg, 0, &flags) == 0) {
 264                 return preg->err;
 265         }
 266
 267         /* Small enough for pointer-storage convention? */
 268         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 269                 FAIL(preg,REG_ERR_TOO_BIG);
 270
 271         /* Dig out information for optimizations. */
 272         preg->regstart = 0;     /* Worst-case defaults. */
 273         preg->reganch = 0;
 274         preg->regmust = 0;
 275         preg->regmlen = 0;
 276         scan = 1;                       /* First BRANCH. */
 277         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 278                 scan = OPERAND(scan);
 279
 280                 /* Starting-point info. */
 281                 if (OP(preg, scan) == EXACTLY) {
 282                         preg->regstart = preg->program[OPERAND(scan)];
 283                 }
 284                 else if (OP(preg, scan) == BOL)
 285                         preg->reganch++;
 286
 287                 /*
 288                  * If there's something expensive in the r.e., find the
 289                  * longest literal string that must appear and make it the
 290                  * regmust.  Resolve ties in favor of later strings, since
 291                  * the regstart check works with the beginning of the r.e.
 292                  * and avoiding duplication strengthens checking.  Not a
 293                  * strong reason, but sufficient in the absence of others.
 294                  */
 295                 if (flags&SPSTART) {
 296                         longest = 0;
 297                         len = 0;
 298                         for (; scan != 0; scan = regnext(preg, scan)) {
 299                                 if (OP(preg, scan) == EXACTLY) {
 300                                         int plen = str_int_len(preg->program + OPERAND(scan));
 301                                         if (plen >= len) {
 302                                                 longest = OPERAND(scan);
 303                                                 len = plen;
 304                                         }
 305                                 }
 306                         }
 307                         preg->regmust = longest;
 308                         preg->regmlen = len;
 309                 }
 310         }
 311
 312 #ifdef DEBUG
 313         regdump(preg);
 314 #endif
 315
 316         return 0;
 317 }
 318
 319 /*
 320  - reg - regular expression, i.e. main body or parenthesized thing
 321  *
 322  * Caller must absorb opening parenthesis.
 323  *
 324  * Combining parenthesis handling with the base level of regular expression
 325  * is a trifle forced, but the need to tie the tails of the branches to what
 326  * follows makes it hard to avoid.
 327  */
 328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 329 {
 330         int ret;
 331         int br;
 332         int ender;
 333         int parno = 0;
 334         int flags;
 335
 336         *flagp = HASWIDTH;      /* Tentatively. */
 337
 338         /* Make an OPEN node, if parenthesized. */
 339         if (paren) {
 340                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 341                         /* non-capturing paren */
 342                         preg->regparse += 2;
 343                         parno = -1;
 344                 }
 345                 else {
 346                         parno = ++preg->re_nsub;
 347                 }
 348                 ret = regnode(preg, OPEN+parno);
 349         } else
 350                 ret = 0;
 351
 352         /* Pick up the branches, linking them together. */
 353         br = regbranch(preg, &flags);
 354         if (br == 0)
 355                 return 0;
 356         if (ret != 0)
 357                 regtail(preg, ret, br); /* OPEN -> first. */
 358         else
 359                 ret = br;
 360         if (!(flags&HASWIDTH))
 361                 *flagp &= ~HASWIDTH;
 362         *flagp |= flags&SPSTART;
 363         while (*preg->regparse == '|') {
 364                 preg->regparse++;
 365                 br = regbranch(preg, &flags);
 366                 if (br == 0)
 367                         return 0;
 368                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 369                 if (!(flags&HASWIDTH))
 370                         *flagp &= ~HASWIDTH;
 371                 *flagp |= flags&SPSTART;
 372         }
 373
 374         /* Make a closing node, and hook it on the end. */
 375         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 376         regtail(preg, ret, ender);
 377
 378         /* Hook the tails of the branches to the closing node. */
 379         for (br = ret; br != 0; br = regnext(preg, br))
 380                 regoptail(preg, br, ender);
 381
 382         /* Check for proper termination. */
 383         if (paren && *preg->regparse++ != ')') {
 384                 preg->err = REG_ERR_UNMATCHED_PAREN;
 385                 return 0;
 386         } else if (!paren && *preg->regparse != '\0') {
 387                 if (*preg->regparse == ')') {
 388                         preg->err = REG_ERR_UNMATCHED_PAREN;
 389                         return 0;
 390                 } else {
 391                         preg->err = REG_ERR_JUNK_ON_END;
 392                         return 0;
 393                 }
 394         }
 395
 396         return(ret);
 397 }
 398
 399 /*
 400  - regbranch - one alternative of an | operator
 401  *
 402  * Implements the concatenation operator.
 403  */
 404 static int regbranch(regex_t *preg, int *flagp )
 405 {
 406         int ret;
 407         int chain;
 408         int latest;
 409         int flags;
 410
 411         *flagp = WORST;         /* Tentatively. */
 412
 413         ret = regnode(preg, BRANCH);
 414         chain = 0;
 415         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 416                *preg->regparse != '|') {
 417                 latest = regpiece(preg, &flags);
 418                 if (latest == 0)
 419                         return 0;
 420                 *flagp |= flags&HASWIDTH;
 421                 if (chain == 0) {/* First piece. */
 422                         *flagp |= flags&SPSTART;
 423                 }
 424                 else {
 425                         regtail(preg, chain, latest);
 426                 }
 427                 chain = latest;
 428         }
 429         if (chain == 0) /* Loop ran zero times. */
 430                 (void) regnode(preg, NOTHING);
 431
 432         return(ret);
 433 }
 434
 435 /*
 436  - regpiece - something followed by possible [*+?]
 437  *
 438  * Note that the branching code sequences used for ? and the general cases
 439  * of * and + are somewhat optimized:  they use the same NOTHING node as
 440  * both the endmarker for their branch list and the body of the last branch.
 441  * It might seem that this node could be dispensed with entirely, but the
 442  * endmarker role is not redundant.
 443  */
 444 static int regpiece(regex_t *preg, int *flagp)
 445 {
 446         int ret;
 447         char op;
 448         int next;
 449         int flags;
 450         int min;
 451         int max;
 452
 453         ret = regatom(preg, &flags);
 454         if (ret == 0)
 455                 return 0;
 456
 457         op = *preg->regparse;
 458         if (!ISMULT(op)) {
 459                 *flagp = flags;
 460                 return(ret);
 461         }
 462
 463         if (!(flags&HASWIDTH) && op != '?') {
 464                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 465                 return 0;
 466         }
 467
 468         /* Handle braces (counted repetition) by expansion */
 469         if (op == '{') {
 470                 char *end;
 471
 472                 min = strtoul(preg->regparse + 1, &end, 10);
 473                 if (end == preg->regparse + 1) {
 474                         preg->err = REG_ERR_BAD_COUNT;
 475                         return 0;
 476                 }
 477                 if (*end == '}') {
 478                         max = min;
 479                 }
 480                 else if (*end == '\0') {
 481                         preg->err = REG_ERR_UNMATCHED_BRACES;
 482                         return 0;
 483                 }
 484                 else {
 485                         preg->regparse = end;
 486                         max = strtoul(preg->regparse + 1, &end, 10);
 487                         if (*end != '}') {
 488                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 489                                 return 0;
 490                         }
 491                 }
 492                 if (end == preg->regparse + 1) {
 493                         max = MAX_REP_COUNT;
 494                 }
 495                 else if (max < min || max >= 100) {
 496                         preg->err = REG_ERR_BAD_COUNT;
 497                         return 0;
 498                 }
 499                 if (min >= 100) {
 500                         preg->err = REG_ERR_BAD_COUNT;
 501                         return 0;
 502                 }
 503
 504                 preg->regparse = strchr(preg->regparse, '}');
 505         }
 506         else {
 507                 min = (op == '+');
 508                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 509         }
 510
 511         if (preg->regparse[1] == '?') {
 512                 preg->regparse++;
 513                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 514         }
 515         else {
 516                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 517         }
 518         preg->program[ret + 2] = max;
 519         preg->program[ret + 3] = min;
 520         preg->program[ret + 4] = 0;
 521
 522         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 523
 524         if (!(flags & SIMPLE)) {
 525                 int back = regnode(preg, BACK);
 526                 regtail(preg, back, ret);
 527                 regtail(preg, next, back);
 528         }
 529
 530         preg->regparse++;
 531         if (ISMULT(*preg->regparse)) {
 532                 preg->err = REG_ERR_NESTED_COUNT;
 533                 return 0;
 534         }
 535
 536         return ret;
 537 }
 538
 539 /**
 540  * Add all characters in the inclusive range between lower and upper.
 541  *
 542  * Handles a swapped range (upper < lower).
 543  */
 544 static void reg_addrange(regex_t *preg, int lower, int upper)
 545 {
 546         if (lower > upper) {
 547                 reg_addrange(preg, upper, lower);
 548         }
 549         /* Add a range as length, start */
 550         regc(preg, upper - lower + 1);
 551         regc(preg, lower);
 552 }
 553
 554 /**
 555  * Add a null-terminated literal string as a set of ranges.
 556  */
 557 static void reg_addrange_str(regex_t *preg, const char *str)
 558 {
 559         while (*str) {
 560                 reg_addrange(preg, *str, *str);
 561                 str++;
 562         }
 563 }
 564
 565 /**
 566  * Extracts the next unicode char from utf8.
 567  *
 568  * If 'upper' is set, converts the char to uppercase.
 569  */
 570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 571 {
 572         int l = utf8_tounicode(s, uc);
 573         if (upper) {
 574                 *uc = utf8_upper(*uc);
 575         }
 576         return l;
 577 }
 578
 579 /**
 580  * Converts a hex digit to decimal.
 581  *
 582  * Returns -1 for an invalid hex digit.
 583  */
 584 static int hexdigitval(int c)
 585 {
 586         if (c >= '0' && c <= '9')
 587                 return c - '0';
 588         if (c >= 'a' && c <= 'f')
 589                 return c - 'a' + 10;
 590         if (c >= 'A' && c <= 'F')
 591                 return c - 'A' + 10;
 592         return -1;
 593 }
 594
 595 /**
 596  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 597  *
 598  * Returns the number of hex digits parsed.
 599  * If there are no hex digits, returns 0 and stores nothing.
 600  */
 601 static int parse_hex(const char *s, int n, int *uc)
 602 {
 603         int val = 0;
 604         int k;
 605
 606         for (k = 0; k < n; k++) {
 607                 int c = hexdigitval(*s++);
 608                 if (c == -1) {
 609                         break;
 610                 }
 611                 val = (val << 4) | c;
 612         }
 613         if (k) {
 614                 *uc = val;
 615         }
 616         return k;
 617 }
 618
 619 /**
 620  * Call for chars after a backlash to decode the escape sequence.
 621  *
 622  * Stores the result in *ch.
 623  *
 624  * Returns the number of bytes consumed.
 625  */
 626 static int reg_decode_escape(const char *s, int *ch)
 627 {
 628         int n;
 629         const char *s0 = s;
 630
 631         *ch = *s++;
 632
 633         switch (*ch) {
 634                 case 'b': *ch = '\b'; break;
 635                 case 'e': *ch = 27; break;
 636                 case 'f': *ch = '\f'; break;
 637                 case 'n': *ch = '\n'; break;
 638                 case 'r': *ch = '\r'; break;
 639                 case 't': *ch = '\t'; break;
 640                 case 'v': *ch = '\v'; break;
 641                 case 'u':
 642                         if (*s == '{') {
 643                                 /* Expect \u{NNNN} */
 644                                 n = parse_hex(s + 1, 6, ch);
 645                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 646                                         s += n + 2;
 647                                 }
 648                                 else {
 649                                         /* Invalid, so just treat as an escaped 'u' */
 650                                         *ch = 'u';
 651                                 }
 652                         }
 653                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 654                                 s += n;
 655                         }
 656                         break;
 657                 case 'U':
 658                         if ((n = parse_hex(s, 8, ch)) > 0) {
 659                                 s += n;
 660                         }
 661                         break;
 662                 case 'x':
 663                         if ((n = parse_hex(s, 2, ch)) > 0) {
 664                                 s += n;
 665                         }
 666                         break;
 667                 case '\0':
 668                         s--;
 669                         *ch = '\\';
 670                         break;
 671         }
 672         return s - s0;
 673 }
 674
 675 /*
 676  - regatom - the lowest level
 677  *
 678  * Optimization:  gobbles an entire sequence of ordinary characters so that
 679  * it can turn them into a single node, which is smaller to store and
 680  * faster to run.  Backslashed characters are exceptions, each becoming a
 681  * separate node; the code is simpler that way and it's not worth fixing.
 682  */
 683 static int regatom(regex_t *preg, int *flagp)
 684 {
 685         int ret;
 686         int flags;
 687         int nocase = (preg->cflags & REG_ICASE);
 688
 689         int ch;
 690         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 691
 692         *flagp = WORST;         /* Tentatively. */
 693
 694         preg->regparse += n;
 695         switch (ch) {
 696         /* FIXME: these chars only have meaning at beg/end of pat? */
 697         case '^':
 698                 ret = regnode(preg, BOL);
 699                 break;
 700         case '$':
 701                 ret = regnode(preg, EOL);
 702                 break;
 703         case '.':
 704                 ret = regnode(preg, ANY);
 705                 *flagp |= HASWIDTH|SIMPLE;
 706                 break;
 707         case '[': {
 708                         const char *pattern = preg->regparse;
 709
 710                         if (*pattern == '^') {  /* Complement of range. */
 711                                 ret = regnode(preg, ANYBUT);
 712                                 pattern++;
 713                         } else
 714                                 ret = regnode(preg, ANYOF);
 715
 716                         /* Special case. If the first char is ']' or '-', it is part of the set */
 717                         if (*pattern == ']' || *pattern == '-') {
 718                                 reg_addrange(preg, *pattern, *pattern);
 719                                 pattern++;
 720                         }
 721
 722                         while (*pattern && *pattern != ']') {
 723                                 /* Is this a range? a-z */
 724                                 int start;
 725                                 int end;
 726
 727                                 enum {
 728                                         CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
 729                                         CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
 730                                         CC_NUM
 731                                 };
 732                                 int cc;
 733
 734                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 735                                 if (start == '\\') {
 736                                         /* First check for class shorthand escapes */
 737                                         switch (*pattern) {
 738                                                 case 's':
 739                                                         pattern++;
 740                                                         cc = CC_SPACE;
 741                                                         goto cc_switch;
 742                                                 case 'd':
 743                                                         pattern++;
 744                                                         cc = CC_DIGIT;
 745                                                         goto cc_switch;
 746                                                 case 'w':
 747                                                         pattern++;
 748                                                         reg_addrange(preg, '_', '_');
 749                                                         cc = CC_ALNUM;
 750                                                         goto cc_switch;
 751                                         }
 752                                         pattern += reg_decode_escape(pattern, &start);
 753                                         if (start == 0) {
 754                                                 preg->err = REG_ERR_NULL_CHAR;
 755                                                 return 0;
 756                                         }
 757                                 }
 758                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 759                                         /* skip '-' */
 760                                         pattern += utf8_tounicode(pattern, &end);
 761                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 762                                         if (end == '\\') {
 763                                                 pattern += reg_decode_escape(pattern, &end);
 764                                                 if (end == 0) {
 765                                                         preg->err = REG_ERR_NULL_CHAR;
 766                                                         return 0;
 767                                                 }
 768                                         }
 769
 770                                         reg_addrange(preg, start, end);
 771                                         continue;
 772                                 }
 773                                 if (start == '[' && pattern[0] == ':') {
 774                                         static const char *character_class[] = {
 775                                                 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
 776                                                 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
 777                                         };
 778
 779                                         for (cc = 0; cc < CC_NUM; cc++) {
 780                                                 n = strlen(character_class[cc]);
 781                                                 if (strncmp(pattern, character_class[cc], n) == 0) {
 782                                                         /* Found a character class */
 783                                                         pattern += n + 1;
 784                                                         break;
 785                                                 }
 786                                         }
 787                                         if (cc != CC_NUM) {
 788 cc_switch:
 789                                                 switch (cc) {
 790                                                         case CC_ALNUM:
 791                                                                 reg_addrange(preg, '0', '9');
 792                                                                 /* Fall through */
 793                                                         case CC_ALPHA:
 794                                                                 if ((preg->cflags & REG_ICASE) == 0) {
 795                                                                         reg_addrange(preg, 'a', 'z');
 796                                                                 }
 797                                                                 reg_addrange(preg, 'A', 'Z');
 798                                                                 break;
 799                                                         case CC_SPACE:
 800                                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 801                                                                 break;
 802                                                         case CC_BLANK:
 803                                                                 reg_addrange_str(preg, " \t");
 804                                                                 break;
 805                                                         case CC_UPPER:
 806                                                                 reg_addrange(preg, 'A', 'Z');
 807                                                                 break;
 808                                                         case CC_LOWER:
 809                                                                 reg_addrange(preg, 'a', 'z');
 810                                                                 break;
 811                                                         case CC_XDIGIT:
 812                                                                 reg_addrange(preg, 'a', 'f');
 813                                                                 reg_addrange(preg, 'A', 'F');
 814                                                                 /* Fall through */
 815                                                         case CC_DIGIT:
 816                                                                 reg_addrange(preg, '0', '9');
 817                                                                 break;
 818                                                         case CC_CNTRL:
 819                                                                 reg_addrange(preg, 0, 31);
 820                                                                 reg_addrange(preg, 127, 127);
 821                                                                 break;
 822                                                         case CC_PRINT:
 823                                                                 reg_addrange(preg, ' ', '~');
 824                                                                 break;
 825                                                         case CC_GRAPH:
 826                                                                 reg_addrange(preg, '!', '~');
 827                                                                 break;
 828                                                         case CC_PUNCT:
 829                                                                 reg_addrange(preg, '!', '/');
 830                                                                 reg_addrange(preg, ':', '@');
 831                                                                 reg_addrange(preg, '[', '`');
 832                                                                 reg_addrange(preg, '{', '~');
 833                                                                 break;
 834                                                 }
 835                                                 continue;
 836                                         }
 837                                 }
 838                                 /* Not a range, so just add the char */
 839                                 reg_addrange(preg, start, start);
 840                         }
 841                         regc(preg, '\0');
 842
 843                         if (*pattern) {
 844                                 pattern++;
 845                         }
 846                         preg->regparse = pattern;
 847
 848                         *flagp |= HASWIDTH|SIMPLE;
 849                 }
 850                 break;
 851         case '(':
 852                 ret = reg(preg, 1, &flags);
 853                 if (ret == 0)
 854                         return 0;
 855                 *flagp |= flags&(HASWIDTH|SPSTART);
 856                 break;
 857         case '\0':
 858         case '|':
 859         case ')':
 860                 preg->err = REG_ERR_INTERNAL;
 861                 return 0;       /* Supposed to be caught earlier. */
 862         case '?':
 863         case '+':
 864         case '*':
 865         case '{':
 866                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 867                 return 0;
 868         case '\\':
 869                 ch = *preg->regparse++;
 870                 switch (ch) {
 871                 case '\0':
 872                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 873                         return 0;
 874                 case 'A':
 875                         ret = regnode(preg, BOLX);
 876                         break;
 877                 case 'Z':
 878                         ret = regnode(preg, EOLX);
 879                         break;
 880                 case '<':
 881                 case 'm':
 882                         ret = regnode(preg, WORDA);
 883                         break;
 884                 case '>':
 885                 case 'M':
 886                         ret = regnode(preg, WORDZ);
 887                         break;
 888                 case 'd':
 889                 case 'D':
 890                         ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
 891                         reg_addrange(preg, '0', '9');
 892                         regc(preg, '\0');
 893                         *flagp |= HASWIDTH|SIMPLE;
 894                         break;
 895                 case 'w':
 896                 case 'W':
 897                         ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
 898                         if ((preg->cflags & REG_ICASE) == 0) {
 899                                 reg_addrange(preg, 'a', 'z');
 900                         }
 901                         reg_addrange(preg, 'A', 'Z');
 902                         reg_addrange(preg, '0', '9');
 903                         reg_addrange(preg, '_', '_');
 904                         regc(preg, '\0');
 905                         *flagp |= HASWIDTH|SIMPLE;
 906                         break;
 907                 case 's':
 908                 case 'S':
 909                         ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
 910                         reg_addrange_str(preg," \t\r\n\f\v");
 911                         regc(preg, '\0');
 912                         *flagp |= HASWIDTH|SIMPLE;
 913                         break;
 914                 /* FIXME: Someday handle \1, \2, ... */
 915                 default:
 916                         /* Handle general quoted chars in exact-match routine */
 917                         /* Back up to include the backslash */
 918                         preg->regparse--;
 919                         goto de_fault;
 920                 }
 921                 break;
 922         de_fault:
 923         default: {
 924                         /*
 925                          * Encode a string of characters to be matched exactly.
 926                          */
 927                         int added = 0;
 928
 929                         /* Back up to pick up the first char of interest */
 930                         preg->regparse -= n;
 931
 932                         ret = regnode(preg, EXACTLY);
 933
 934                         /* Note that a META operator such as ? or * consumes the
 935                          * preceding char.
 936                          * Thus we must be careful to look ahead by 2 and add the
 937                          * last char as it's own EXACTLY if necessary
 938                          */
 939
 940                         /* Until end of string or a META char is reached */
 941                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 942                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 943                                 if (ch == '\\' && preg->regparse[n]) {
 944                                         /* Non-trailing backslash.
 945                                          * Is this a special escape, or a regular escape?
 946                                          */
 947                                         if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
 948                                                 /* A special escape. All done with EXACTLY */
 949                                                 break;
 950                                         }
 951                                         /* Decode it. Note that we add the length for the escape
 952                                          * sequence to the length for the backlash so we can skip
 953                                          * the entire sequence, or not as required.
 954                                          */
 955                                         n += reg_decode_escape(preg->regparse + n, &ch);
 956                                         if (ch == 0) {
 957                                                 preg->err = REG_ERR_NULL_CHAR;
 958                                                 return 0;
 959                                         }
 960                                 }
 961
 962                                 /* Now we have one char 'ch' of length 'n'.
 963                                  * Check to see if the following char is a MULT
 964                                  */
 965
 966                                 if (ISMULT(preg->regparse[n])) {
 967                                         /* Yes. But do we already have some EXACTLY chars? */
 968                                         if (added) {
 969                                                 /* Yes, so return what we have and pick up the current char next time around */
 970                                                 break;
 971                                         }
 972                                         /* No, so add this single char and finish */
 973                                         regc(preg, ch);
 974                                         added++;
 975                                         preg->regparse += n;
 976                                         break;
 977                                 }
 978
 979                                 /* No, so just add this char normally */
 980                                 regc(preg, ch);
 981                                 added++;
 982                                 preg->regparse += n;
 983                         }
 984                         regc(preg, '\0');
 985
 986                         *flagp |= HASWIDTH;
 987                         if (added == 1)
 988                                 *flagp |= SIMPLE;
 989                         break;
 990                 }
 991                 break;
 992         }
 993
 994         return(ret);
 995 }
 996
 997 static void reg_grow(regex_t *preg, int n)
 998 {
 999         if (preg->p + n >= preg->proglen) {
1000                 preg->proglen = (preg->p + n) * 2;
1001                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1002         }
1003 }
1004
1005 /*
1006  - regnode - emit a node
1007  */
1008 /* Location. */
1009 static int regnode(regex_t *preg, int op)
1010 {
1011         reg_grow(preg, 2);
1012
1013         /* The OP followed by a next pointer */
1014         preg->program[preg->p++] = op;
1015         preg->program[preg->p++] = 0;
1016
1017         /* Return the start of the node */
1018         return preg->p - 2;
1019 }
1020
1021 /*
1022  - regc - emit (if appropriate) a byte of code
1023  */
1024 static void regc(regex_t *preg, int b )
1025 {
1026         reg_grow(preg, 1);
1027         preg->program[preg->p++] = b;
1028 }
1029
1030 /*
1031  - reginsert - insert an operator in front of already-emitted operand
1032  *
1033  * Means relocating the operand.
1034  * Returns the new location of the original operand.
1035  */
1036 static int reginsert(regex_t *preg, int op, int size, int opnd )
1037 {
1038         reg_grow(preg, size);
1039
1040         /* Move everything from opnd up */
1041         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1042         /* Zero out the new space */
1043         memset(preg->program + opnd, 0, sizeof(int) * size);
1044
1045         preg->program[opnd] = op;
1046
1047         preg->p += size;
1048
1049         return opnd + size;
1050 }
1051
1052 /*
1053  - regtail - set the next-pointer at the end of a node chain
1054  */
1055 static void regtail(regex_t *preg, int p, int val)
1056 {
1057         int scan;
1058         int temp;
1059         int offset;
1060
1061         /* Find last node. */
1062         scan = p;
1063         for (;;) {
1064                 temp = regnext(preg, scan);
1065                 if (temp == 0)
1066                         break;
1067                 scan = temp;
1068         }
1069
1070         if (OP(preg, scan) == BACK)
1071                 offset = scan - val;
1072         else
1073                 offset = val - scan;
1074
1075         preg->program[scan + 1] = offset;
1076 }
1077
1078 /*
1079  - regoptail - regtail on operand of first argument; nop if operandless
1080  */
1081
1082 static void regoptail(regex_t *preg, int p, int val )
1083 {
1084         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1085         if (p != 0 && OP(preg, p) == BRANCH) {
1086                 regtail(preg, OPERAND(p), val);
1087         }
1088 }
1089
1090 /*
1091  * regexec and friends
1092  */
1093
/*
 * Forwards.
 *
 * Prototypes for matcher helpers that are called further down in this
 * file ahead of (or away from) their definitions.
 */
/* Attempt a match starting exactly at 'string'; 1 on success, 0 on failure */
static int regtry(regex_t *preg, const char *string );
/* Main matching routine, started at program node 'prog'; 1 on success, 0 on failure */
static int regmatch(regex_t *preg, int prog);
/* Used by the simple-repeat matcher to count repetitions of node 'p', bounded by 'max' */
static int regrepeat(regex_t *preg, int p, int max);
1100
1101 /*
1102  - regexec - match a regexp against a string
1103  */
1104 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1105 {
1106         const char *s;
1107         int scan;
1108
1109         /* Be paranoid... */
1110         if (preg == NULL || preg->program == NULL || string == NULL) {
1111                 return REG_ERR_NULL_ARGUMENT;
1112         }
1113
1114         /* Check validity of program. */
1115         if (*preg->program != REG_MAGIC) {
1116                 return REG_ERR_CORRUPTED;
1117         }
1118
1119 #ifdef DEBUG
1120         fprintf(stderr, "regexec: %s\n", string);
1121         regdump(preg);
1122 #endif
1123
1124         preg->eflags = eflags;
1125         preg->pmatch = pmatch;
1126         preg->nmatch = nmatch;
1127         preg->start = string;   /* All offsets are computed from here */
1128
1129         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1130         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1131                 int op = OP(preg, scan);
1132                 if (op == END)
1133                         break;
1134                 if (op == REPX || op == REPXMIN)
1135                         preg->program[scan + 4] = 0;
1136         }
1137
1138         /* If there is a "must appear" string, look for it. */
1139         if (preg->regmust != 0) {
1140                 s = string;
1141                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1142                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1143                                 break;
1144                         }
1145                         s++;
1146                 }
1147                 if (s == NULL)  /* Not present. */
1148                         return REG_NOMATCH;
1149         }
1150
1151         /* Mark beginning of line for ^ . */
1152         preg->regbol = string;
1153
1154         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1155         if (preg->reganch) {
1156                 if (eflags & REG_NOTBOL) {
1157                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1158                         goto nextline;
1159                 }
1160                 while (1) {
1161                         if (regtry(preg, string)) {
1162                                 return REG_NOERROR;
1163                         }
1164                         if (*string) {
1165 nextline:
1166                                 if (preg->cflags & REG_NEWLINE) {
1167                                         /* Try the next anchor? */
1168                                         string = strchr(string, '\n');
1169                                         if (string) {
1170                                                 preg->regbol = ++string;
1171                                                 continue;
1172                                         }
1173                                 }
1174                         }
1175                         return REG_NOMATCH;
1176                 }
1177         }
1178
1179         /* Messy cases:  unanchored match. */
1180         s = string;
1181         if (preg->regstart != '\0') {
1182                 /* We know what char it must start with. */
1183                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1184                         if (regtry(preg, s))
1185                                 return REG_NOERROR;
1186                         s++;
1187                 }
1188         }
1189         else
1190                 /* We don't -- general case. */
1191                 while (1) {
1192                         if (regtry(preg, s))
1193                                 return REG_NOERROR;
1194                         if (*s == '\0') {
1195                                 break;
1196                         }
1197                         else {
1198                                 int c;
1199                                 s += utf8_tounicode(s, &c);
1200                         }
1201                 }
1202
1203         /* Failure. */
1204         return REG_NOMATCH;
1205 }
1206
1207 /*
1208  - regtry - try match at specific point
1209  */
1210                         /* 0 failure, 1 success */
1211 static int regtry( regex_t *preg, const char *string )
1212 {
1213         int i;
1214
1215         preg->reginput = string;
1216
1217         for (i = 0; i < preg->nmatch; i++) {
1218                 preg->pmatch[i].rm_so = -1;
1219                 preg->pmatch[i].rm_eo = -1;
1220         }
1221         if (regmatch(preg, 1)) {
1222                 preg->pmatch[0].rm_so = string - preg->start;
1223                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1224                 return(1);
1225         } else
1226                 return(0);
1227 }
1228
/**
 * Checks whether the 'proglen' code points at 'prog' form a prefix
 * of the utf-8 string 'string'.
 *
 * Each character of 'string' is decoded with reg_utf8_tounicode_case(),
 * passing 'nocase' through, and compared with the next cell of 'prog'.
 *
 * Returns the number of bytes of 'string' that were matched, or -1 if
 * a character differs or 'string' ends before 'prog' is exhausted.
 */
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
{
        const char *cursor = string;
        int remaining;

        for (remaining = proglen; remaining != 0 && *cursor != '\0'; remaining--) {
                int decoded;
                int width = reg_utf8_tounicode_case(cursor, &decoded, nocase);

                if (decoded != prog[proglen - remaining]) {
                        return -1;
                }
                cursor += width;
        }

        if (remaining != 0) {
                /* Ran out of string first */
                return -1;
        }
        return cursor - string;
}
1254
/**
 * Searches for 'c' in the range list 'range'.
 *
 * The list is a sequence of (count, first) pairs and ends at a pair
 * whose count is 0. A pair covers first .. first + count - 1 inclusive.
 *
 * Returns 1 if found, or 0 if not.
 */
static int reg_range_find(const int *range, int c)
{
        const int *pair;

        for (pair = range; pair[0] != 0; pair += 2) {
                if (c < pair[1]) {
                        /* Below the start of this range */
                        continue;
                }
                if (c <= pair[0] + pair[1] - 1) {
                        return 1;
                }
        }
        return 0;
}
1271
/**
 * Search for the character 'c' in the utf-8 string 'string'.
 *
 * If 'nocase' is set, 'c' is converted with utf8_upper() before the
 * comparison. Every character of 'string' is decoded with
 * reg_utf8_tounicode_case() using the same 'nocase' flag (presumably
 * folding it the same way -- confirm against that helper).
 *
 * Returns a pointer to the position in the string where 'c' was found,
 * or NULL if not found.
 */
static const char *str_find(const char *string, int c, int nocase)
{
        const char *pos;
        int target = c;

        if (nocase) {
                target = utf8_upper(c);
        }

        for (pos = string; *pos != '\0'; ) {
                int decoded;
                int width = reg_utf8_tounicode_case(pos, &decoded, nocase);

                if (decoded == target) {
                        return pos;
                }
                pos += width;
        }
        return NULL;
}
1297
1298 /**
1299  * Returns true if 'ch' is an end-of-line char.
1300  *
1301  * In REG_NEWLINE mode, \n is considered EOL in
1302  * addition to \0
1303  */
1304 static int reg_iseol(regex_t *preg, int ch)
1305 {
1306         if (preg->cflags & REG_NEWLINE) {
1307                 return ch == '\0' || ch == '\n';
1308         }
1309         else {
1310                 return ch == '\0';
1311         }
1312 }
1313
1314 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1315 {
1316         int nextch = '\0';
1317         const char *save;
1318         int no;
1319         int c;
1320
1321         int max = preg->program[scan + 2];
1322         int min = preg->program[scan + 3];
1323         int next = regnext(preg, scan);
1324
1325         /*
1326          * Lookahead to avoid useless match attempts
1327          * when we know what character comes next.
1328          */
1329         if (OP(preg, next) == EXACTLY) {
1330                 nextch = preg->program[OPERAND(next)];
1331         }
1332         save = preg->reginput;
1333         no = regrepeat(preg, scan + 5, max);
1334         if (no < min) {
1335                 return 0;
1336         }
1337         if (matchmin) {
1338                 /* from min up to no */
1339                 max = no;
1340                 no = min;
1341         }
1342         /* else from no down to min */
1343         while (1) {
1344                 if (matchmin) {
1345                         if (no > max) {
1346                                 break;
1347                         }
1348                 }
1349                 else {
1350                         if (no < min) {
1351                                 break;
1352                         }
1353                 }
1354                 preg->reginput = save + utf8_index(save, no);
1355                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1356                 /* If it could work, try it. */
1357                 if (reg_iseol(preg, nextch) || c == nextch) {
1358                         if (regmatch(preg, next)) {
1359                                 return(1);
1360                         }
1361                 }
1362                 if (matchmin) {
1363                         /* Couldn't or didn't, add one more */
1364                         no++;
1365                 }
1366                 else {
1367                         /* Couldn't or didn't -- back up. */
1368                         no--;
1369                 }
1370         }
1371         return(0);
1372 }
1373
1374 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1375 {
1376         int *scanpt = preg->program + scan;
1377
1378         int max = scanpt[2];
1379         int min = scanpt[3];
1380
1381         /* Have we reached min? */
1382         if (scanpt[4] < min) {
1383                 /* No, so get another one */
1384                 scanpt[4]++;
1385                 if (regmatch(preg, scan + 5)) {
1386                         return 1;
1387                 }
1388                 scanpt[4]--;
1389                 return 0;
1390         }
1391         if (scanpt[4] > max) {
1392                 return 0;
1393         }
1394
1395         if (matchmin) {
1396                 /* minimal, so try other branch first */
1397                 if (regmatch(preg, regnext(preg, scan))) {
1398                         return 1;
1399                 }
1400                 /* No, so try one more */
1401                 scanpt[4]++;
1402                 if (regmatch(preg, scan + 5)) {
1403                         return 1;
1404                 }
1405                 scanpt[4]--;
1406                 return 0;
1407         }
1408         /* maximal, so try this branch again */
1409         if (scanpt[4] < max) {
1410                 scanpt[4]++;
1411                 if (regmatch(preg, scan + 5)) {
1412                         return 1;
1413                 }
1414                 scanpt[4]--;
1415         }
1416         /* At this point we are at max with no match. Try the other branch */
1417         return regmatch(preg, regnext(preg, scan));
1418 }
1419
1420 /*
1421  - regmatch - main matching routine
1422  *
1423  * Conceptually the strategy is simple:  check to see whether the current
1424  * node matches, call self recursively to see whether the rest matches,
1425  * and then act accordingly.  In practice we make some effort to avoid
1426  * recursion, in particular by going through "ordinary" nodes (that don't
1427  * need to know whether the rest of the match failed) by a loop instead of
1428  * by recursion.
1429  */
1430 /* 0 failure, 1 success */
1431 static int regmatch(regex_t *preg, int prog)
1432 {
1433         int scan;       /* Current node. */
1434         int next;               /* Next node. */
1435         const char *save;
1436
1437         scan = prog;
1438
1439 #ifdef DEBUG
1440         if (scan != 0 && regnarrate)
1441                 fprintf(stderr, "%s(\n", regprop(scan));
1442 #endif
1443         while (scan != 0) {
1444                 int n;
1445                 int c;
1446 #ifdef DEBUG
1447                 if (regnarrate) {
1448                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1449                 }
1450 #endif
1451                 next = regnext(preg, scan);
1452                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1453
1454                 switch (OP(preg, scan)) {
1455                 case BOLX:
1456                         if ((preg->eflags & REG_NOTBOL)) {
1457                                 return(0);
1458                         }
1459                         /* Fall through */
1460                 case BOL:
1461                         if (preg->reginput != preg->regbol) {
1462                                 return(0);
1463                         }
1464                         break;
1465                 case EOLX:
1466                         if (c != 0) {
1467                                 /* For EOLX, only match real end of line, not newline */
1468                                 return 0;
1469                         }
1470                         break;
1471                 case EOL:
1472                         if (!reg_iseol(preg, c)) {
1473                                 return(0);
1474                         }
1475                         break;
1476                 case WORDA:
1477                         /* Must be looking at a letter, digit, or _ */
1478                         if ((!isalnum(UCHAR(c))) && c != '_')
1479                                 return(0);
1480                         /* Prev must be BOL or nonword */
1481                         if (preg->reginput > preg->regbol &&
1482                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1483                                 return(0);
1484                         break;
1485                 case WORDZ:
1486                         /* Can't match at BOL */
1487                         if (preg->reginput > preg->regbol) {
1488                                 /* Current must be EOL or nonword */
1489                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1490                                         c = preg->reginput[-1];
1491                                         /* Previous must be word */
1492                                         if (isalnum(UCHAR(c)) || c == '_') {
1493                                                 break;
1494                                         }
1495                                 }
1496                         }
1497                         /* No */
1498                         return(0);
1499
1500                 case ANY:
1501                         if (reg_iseol(preg, c))
1502                                 return 0;
1503                         preg->reginput += n;
1504                         break;
1505                 case EXACTLY: {
1506                                 int opnd;
1507                                 int len;
1508                                 int slen;
1509
1510                                 opnd = OPERAND(scan);
1511                                 len = str_int_len(preg->program + opnd);
1512
1513                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1514                                 if (slen < 0) {
1515                                         return(0);
1516                                 }
1517                                 preg->reginput += slen;
1518                         }
1519                         break;
1520                 case ANYOF:
1521                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1522                                 return(0);
1523                         }
1524                         preg->reginput += n;
1525                         break;
1526                 case ANYBUT:
1527                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1528                                 return(0);
1529                         }
1530                         preg->reginput += n;
1531                         break;
1532                 case NOTHING:
1533                         break;
1534                 case BACK:
1535                         break;
1536                 case BRANCH:
1537                         if (OP(preg, next) != BRANCH)           /* No choice. */
1538                                 next = OPERAND(scan);   /* Avoid recursion. */
1539                         else {
1540                                 do {
1541                                         save = preg->reginput;
1542                                         if (regmatch(preg, OPERAND(scan))) {
1543                                                 return(1);
1544                                         }
1545                                         preg->reginput = save;
1546                                         scan = regnext(preg, scan);
1547                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1548                                 return(0);
1549                                 /* NOTREACHED */
1550                         }
1551                         break;
1552                 case REP:
1553                 case REPMIN:
1554                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1555
1556                 case REPX:
1557                 case REPXMIN:
1558                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1559
1560                 case END:
1561                         return 1;       /* Success! */
1562
1563                 case OPENNC:
1564                 case CLOSENC:
1565                         return regmatch(preg, next);
1566
1567                 default:
1568                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1569                                 save = preg->reginput;
1570                                 if (regmatch(preg, next)) {
1571                                         if (OP(preg, scan) < CLOSE) {
1572                                                 int no = OP(preg, scan) - OPEN;
1573                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1574                                                         preg->pmatch[no].rm_so = save - preg->start;
1575                                                 }
1576                                         }
1577                                         else {
1578                                                 int no = OP(preg, scan) - CLOSE;
1579                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1580                                                         preg->pmatch[no].rm_eo = save - preg->start;
1581                                                 }
1582                                         }
1583                                         return(1);
1584                                 }
1585                                 /* Restore input position after failure */
1586                                 preg->reginput = save;
1587                                 return(0);
1588                         }
1589                         return REG_ERR_INTERNAL;
1590                 }
1591
1592                 scan = next;
1593         }
1594
1595         /*
1596          * We get here only if there's trouble -- normally "case END" is
1597          * the terminating point.
1598          */
1599         return REG_ERR_INTERNAL;
1600 }
1601
1602 /*
1603  - regrepeat - repeatedly match something simple, report how many
1604  */
1605 static int regrepeat(regex_t *preg, int p, int max)
1606 {
1607         int count = 0;
1608         const char *scan;
1609         int opnd;
1610         int ch;
1611         int n;
1612
1613         scan = preg->reginput;
1614         opnd = OPERAND(p);
1615         switch (OP(preg, p)) {
1616         case ANY:
1617                 while (!reg_iseol(preg, *scan) && count < max) {
1618                         count++;
1619                         scan += utf8_charlen(*scan);
1620                 }
1621                 break;
1622         case EXACTLY:
1623                 while (count < max) {
1624                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1625                         if (preg->program[opnd] != ch) {
1626                                 break;
1627                         }
1628                         count++;
1629                         scan += n;
1630                 }
1631                 break;
1632         case ANYOF:
1633                 while (count < max) {
1634                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1635                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1636                                 break;
1637                         }
1638                         count++;
1639                         scan += n;
1640                 }
1641                 break;
1642         case ANYBUT:
1643                 while (count < max) {
1644                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1645                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1646                                 break;
1647                         }
1648                         count++;
1649                         scan += n;
1650                 }
1651                 break;
1652         default:                /* Oh dear.  Called inappropriately. */
1653                 preg->err = REG_ERR_INTERNAL;
1654                 count = 0;      /* Best compromise. */
1655                 break;
1656         }
1657         preg->reginput = scan;
1658
1659         return(count);
1660 }
1661
1662 /*
1663  - regnext - dig the "next" pointer out of a node
1664  */
1665 static int regnext(regex_t *preg, int p )
1666 {
1667         int offset;
1668
1669         offset = NEXT(preg, p);
1670
1671         if (offset == 0)
1672                 return 0;
1673
1674         if (OP(preg, p) == BACK)
1675                 return(p-offset);
1676         else
1677                 return(p+offset);
1678 }
1679
1680 /*
1681  - regopsize - returns the size of opcode + operands at 'p' in words
1682  */
1683 static int regopsize(regex_t *preg, int p )
1684 {
1685         /* Almost all opcodes are 2 words, but some are more */
1686         switch (OP(preg, p)) {
1687                 case REP:
1688                 case REPMIN:
1689                 case REPX:
1690                 case REPXMIN:
1691                         return 5;
1692
1693                 case ANYOF:
1694                 case ANYBUT:
1695                 case EXACTLY: {
1696                         int s = p + 2;
1697                         while (preg->program[s++]) {
1698                         }
1699                         return s - p;
1700                 }
1701         }
1702         return 2;
1703 }
1704
1705 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1706
1707 /*
1708  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1709  */
1710 static void regdump(regex_t *preg)
1711 {
1712         int s;
1713         int op = EXACTLY;       /* Arbitrary non-END op. */
1714         int next;
1715         char buf[MAX_UTF8_LEN + 1];
1716
1717         int i;
1718         for (i = 1; i < preg->p; i++) {
1719                 printf("%02x ", (unsigned char)preg->program[i]);
1720                 if (i % 16 == 0) {
1721                         printf("\n");
1722                 }
1723         }
1724         printf("\n");
1725
1726         s = 1;
1727         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1728                 op = OP(preg, s);
1729                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1730                 next = regnext(preg, s);
1731                 if (next == 0)          /* Next ptr. */
1732                         printf("(0)");
1733                 else
1734                         printf("(%d)", next);
1735                 s += 2;
1736                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1737                         int max = preg->program[s];
1738                         int min = preg->program[s + 1];
1739                         if (max == 65535) {
1740                                 printf("{%d,*}", min);
1741                         }
1742                         else {
1743                                 printf("{%d,%d}", min, max);
1744                         }
1745                         printf(" %d", preg->program[s + 2]);
1746                         s += 3;
1747                 }
1748                 else if (op == ANYOF || op == ANYBUT) {
1749                         /* set of ranges */
1750
1751                         while (preg->program[s]) {
1752                                 int len = preg->program[s++];
1753                                 int first = preg->program[s++];
1754                                 buf[utf8_getchars(buf, first)] = 0;
1755                                 printf("%s", buf);
1756                                 if (len > 1) {
1757                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1758                                         printf("-%s", buf);
1759                                 }
1760                         }
1761                         s++;
1762                 }
1763                 else if (op == EXACTLY) {
1764                         /* Literal string, where present. */
1765
1766                         while (preg->program[s]) {
1767                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1768                                 printf("%s", buf);
1769                                 s++;
1770                         }
1771                         s++;
1772                 }
1773                 putchar('\n');
1774         }
1775
1776         if (op == END) {
1777                 /* Header fields of interest. */
1778                 if (preg->regstart) {
1779                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1780                         printf("start '%s' ", buf);
1781                 }
1782                 if (preg->reganch)
1783                         printf("anchored ");
1784                 if (preg->regmust != 0) {
1785                         int i;
1786                         printf("must have:");
1787                         for (i = 0; i < preg->regmlen; i++) {
1788                                 putchar(preg->program[preg->regmust + i]);
1789                         }
1790                         putchar('\n');
1791                 }
1792         }
1793         printf("\n");
1794 }
1795
1796 /*
1797  - regprop - printable representation of opcode
1798  */
1799 static const char *regprop( int op )
1800 {
1801         static char buf[50];
1802
1803         switch (op) {
1804         case BOL:
1805                 return "BOL";
1806         case EOL:
1807                 return "EOL";
1808         case BOLX:
1809                 return "BOLX";
1810         case EOLX:
1811                 return "EOLX";
1812         case ANY:
1813                 return "ANY";
1814         case ANYOF:
1815                 return "ANYOF";
1816         case ANYBUT:
1817                 return "ANYBUT";
1818         case BRANCH:
1819                 return "BRANCH";
1820         case EXACTLY:
1821                 return "EXACTLY";
1822         case NOTHING:
1823                 return "NOTHING";
1824         case BACK:
1825                 return "BACK";
1826         case END:
1827                 return "END";
1828         case REP:
1829                 return "REP";
1830         case REPMIN:
1831                 return "REPMIN";
1832         case REPX:
1833                 return "REPX";
1834         case REPXMIN:
1835                 return "REPXMIN";
1836         case WORDA:
1837                 return "WORDA";
1838         case WORDZ:
1839                 return "WORDZ";
1840         case OPENNC:
1841                 return "OPEN";
1842         case CLOSENC:
1843                 return "CLOSE";
1844         default:
1845                 if (op >= OPEN && op < CLOSE) {
1846                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1847                 }
1848                 else if (op >= CLOSE && op < CLOSE_END) {
1849                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1850                 }
1851                 else {
1852                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1853                 }
1854                 return(buf);
1855         }
1856 }
1857 #endif /* JIM_BOOTSTRAP */
1858
1859 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1860 {
1861         static const char *error_strings[] = {
1862                 "success",
1863                 "no match",
1864                 "bad pattern",
1865                 "null argument",
1866                 "unknown error",
1867                 "too big",
1868                 "out of memory",
1869                 "too many ()",
1870                 "parentheses () not balanced",
1871                 "braces {} not balanced",
1872                 "invalid repetition count(s)",
1873                 "extra characters",
1874                 "*+ of empty atom",
1875                 "nested count",
1876                 "internal error",
1877                 "count follows nothing",
1878                 "trailing backslash",
1879                 "corrupted program",
1880                 "contains null char",
1881         };
1882         const char *err;
1883
1884         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1885                 err = "Bad error code";
1886         }
1887         else {
1888                 err = error_strings[errcode];
1889         }
1890
1891         return snprintf(errbuf, errbuf_size, "%s", err);
1892 }
1893
1894 void regfree(regex_t *preg)
1895 {
1896         free(preg->program);
1897 }
1898
1899 #endif