jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101 #define BOLX    14      /* no   Match "" at beginning of input. */
 102 #define EOLX    15      /* no   Match "" at end of input. */
 103 #define WORDA   16      /* no   Match "" at wordchar, where prev is nonword */
 104 #define WORDZ   17      /* no   Match "" at nonwordchar, where prev is word */
 105
 106 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 107 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 108                         /*      OPEN+1 is number 1, etc. */
 109
 110 /* must not be any other opts between OPEN and CLOSE */
 111
 112 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 113 #define CLOSE   2001    /* no   Analogous to OPEN. */
 114 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 115
 116 /*
 117  * The first word of the regexp internal "program" is actually this magic
 118  * number; the start node begins in the second word.
 119  */
 120 #define REG_MAGIC       0xFADED00D
 121
 122 /*
 123  * Opcode notes:
 124  *
 125  * BRANCH       The set of branches constituting a single choice are hooked
 126  *              together with their "next" pointers, since precedence prevents
 127  *              anything being concatenated to any individual branch.  The
 128  *              "next" pointer of the last BRANCH in a choice points to the
 129  *              thing following the whole choice.  This is also where the
 130  *              final "next" pointer of each individual branch points; each
 131  *              branch starts with the operand node of a BRANCH node.
 132  *
 133  * BACK         Normal "next" pointers all implicitly point forward; BACK
 134  *              exists to make loop structures possible.
 135  *
 136  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 137  *              as either simple repeats (REP) or complex repeats (REPX).
 138  *              These opcodes include a "min" and "max" count after the opcode.
 139  *              This is followed by a fourth "current count" word that is
 140  *              only used by REPX, as it implements a recursive match.
 141  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 142  *
 143  * OPEN,CLOSE   ...are numbered at compile time.
 144  */
 145
 146 /*
 147  * A node is one word of opcode followed by one word of "next" pointer.
 148  * The "next" pointer value is a positive offset from the opcode of the node
 149  * containing it.
 150  * An operand, if any, simply follows the node.  (Note that much of the
 151  * code generation knows about this implicit relationship.)
 152  */
 153 #define OP(preg, p)     (preg->program[p])
 154 #define NEXT(preg, p)   (preg->program[p + 1])
 155 #define OPERAND(p)      ((p) + 2)
 156
 157 /*
 158  * See regmagic.h for one further detail of program structure.
 159  */
 160
 161
 162 /*
 163  * Utility definitions.
 164  */
 165
 166 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 167 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 168 #define META            "^$.[()|?{+*"
 169
 170 /*
 171  * Flags to be passed up and down.
 172  */
 173 #define HASWIDTH        1       /* Known never to match null string. */
 174 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 175 #define SPSTART         4       /* Starts with * or +. */
 176 #define WORST           0       /* Worst case. */
 177
 178 #define MAX_REP_COUNT 1000000
 179
 180 /*
 181  * Forward declarations for regcomp()'s friends.
 182  */
 183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 184 static int regpiece(regex_t *preg, int *flagp );
 185 static int regbranch(regex_t *preg, int *flagp );
 186 static int regatom(regex_t *preg, int *flagp );
 187 static int regnode(regex_t *preg, int op );
 188 static int regnext(regex_t *preg, int p );
 189 static void regc(regex_t *preg, int b );
 190 static int reginsert(regex_t *preg, int op, int size, int opnd );
 191 static void regtail(regex_t *preg, int p, int val);
 192 static void regoptail(regex_t *preg, int p, int val );
 193 static int regopsize(regex_t *preg, int p );
 194
 195 static int reg_range_find(const int *string, int c);
 196 static const char *str_find(const char *string, int c, int nocase);
 197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 198
 199 /*#define DEBUG*/
 200 #ifdef DEBUG
 201 static int regnarrate = 0;
 202 static void regdump(regex_t *preg);
 203 static const char *regprop( int op );
 204 #endif
 205
 206
 207 /**
 208  * Returns the length of the null-terminated integer sequence.
 209  */
 210 static int str_int_len(const int *seq)
 211 {
 212         int n = 0;
 213         while (*seq++) {
 214                 n++;
 215         }
 216         return n;
 217 }
 218
 219 /*
 220  - regcomp - compile a regular expression into internal code
 221  *
 222  * We can't allocate space until we know how big the compiled form will be,
 223  * but we can't compile it (and thus know how big it is) until we've got a
 224  * place to put the code.  So we cheat:  we compile it twice, once with code
 225  * generation turned off and size counting turned on, and once "for real".
 226  * This also means that we don't allocate space until we are sure that the
 227  * thing really will compile successfully, and we never have to move the
 228  * code and thus invalidate pointers into it.  (Note that it has to be in
 229  * one piece because free() must be able to free it all.)
 230  *
 231  * Beware that the optimization-preparation code in here knows about some
 232  * of the structure of the compiled regexp.
 233  */
 234 int regcomp(regex_t *preg, const char *exp, int cflags)
 235 {
 236         int scan;
 237         int longest;
 238         unsigned len;
 239         int flags;
 240
 241 #ifdef DEBUG
 242         fprintf(stderr, "Compiling: '%s'\n", exp);
 243 #endif
 244         memset(preg, 0, sizeof(*preg));
 245
 246         if (exp == NULL)
 247                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 248
 249         /* First pass: determine size, legality. */
 250         preg->cflags = cflags;
 251         preg->regparse = exp;
 252
 253         /* Allocate space. */
 254         preg->proglen = (strlen(exp) + 1) * 5;
 255         preg->program = malloc(preg->proglen * sizeof(int));
 256         if (preg->program == NULL)
 257                 FAIL(preg, REG_ERR_NOMEM);
 258
 259         /* Note that since we store a magic value as the first item in the program,
 260          * program offsets will never be 0
 261          */
 262         regc(preg, REG_MAGIC);
 263         if (reg(preg, 0, &flags) == 0) {
 264                 return preg->err;
 265         }
 266
 267         /* Small enough for pointer-storage convention? */
 268         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 269                 FAIL(preg,REG_ERR_TOO_BIG);
 270
 271         /* Dig out information for optimizations. */
 272         preg->regstart = 0;     /* Worst-case defaults. */
 273         preg->reganch = 0;
 274         preg->regmust = 0;
 275         preg->regmlen = 0;
 276         scan = 1;                       /* First BRANCH. */
 277         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 278                 scan = OPERAND(scan);
 279
 280                 /* Starting-point info. */
 281                 if (OP(preg, scan) == EXACTLY) {
 282                         preg->regstart = preg->program[OPERAND(scan)];
 283                 }
 284                 else if (OP(preg, scan) == BOL)
 285                         preg->reganch++;
 286
 287                 /*
 288                  * If there's something expensive in the r.e., find the
 289                  * longest literal string that must appear and make it the
 290                  * regmust.  Resolve ties in favor of later strings, since
 291                  * the regstart check works with the beginning of the r.e.
 292                  * and avoiding duplication strengthens checking.  Not a
 293                  * strong reason, but sufficient in the absence of others.
 294                  */
 295                 if (flags&SPSTART) {
 296                         longest = 0;
 297                         len = 0;
 298                         for (; scan != 0; scan = regnext(preg, scan)) {
 299                                 if (OP(preg, scan) == EXACTLY) {
 300                                         int plen = str_int_len(preg->program + OPERAND(scan));
 301                                         if (plen >= len) {
 302                                                 longest = OPERAND(scan);
 303                                                 len = plen;
 304                                         }
 305                                 }
 306                         }
 307                         preg->regmust = longest;
 308                         preg->regmlen = len;
 309                 }
 310         }
 311
 312 #ifdef DEBUG
 313         regdump(preg);
 314 #endif
 315
 316         return 0;
 317 }
 318
 319 /*
 320  - reg - regular expression, i.e. main body or parenthesized thing
 321  *
 322  * Caller must absorb opening parenthesis.
 323  *
 324  * Combining parenthesis handling with the base level of regular expression
 325  * is a trifle forced, but the need to tie the tails of the branches to what
 326  * follows makes it hard to avoid.
 327  */
 328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 329 {
 330         int ret;
 331         int br;
 332         int ender;
 333         int parno = 0;
 334         int flags;
 335
 336         *flagp = HASWIDTH;      /* Tentatively. */
 337
 338         /* Make an OPEN node, if parenthesized. */
 339         if (paren) {
 340                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 341                         /* non-capturing paren */
 342                         preg->regparse += 2;
 343                         parno = -1;
 344                 }
 345                 else {
 346                         parno = ++preg->re_nsub;
 347                 }
 348                 ret = regnode(preg, OPEN+parno);
 349         } else
 350                 ret = 0;
 351
 352         /* Pick up the branches, linking them together. */
 353         br = regbranch(preg, &flags);
 354         if (br == 0)
 355                 return 0;
 356         if (ret != 0)
 357                 regtail(preg, ret, br); /* OPEN -> first. */
 358         else
 359                 ret = br;
 360         if (!(flags&HASWIDTH))
 361                 *flagp &= ~HASWIDTH;
 362         *flagp |= flags&SPSTART;
 363         while (*preg->regparse == '|') {
 364                 preg->regparse++;
 365                 br = regbranch(preg, &flags);
 366                 if (br == 0)
 367                         return 0;
 368                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 369                 if (!(flags&HASWIDTH))
 370                         *flagp &= ~HASWIDTH;
 371                 *flagp |= flags&SPSTART;
 372         }
 373
 374         /* Make a closing node, and hook it on the end. */
 375         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 376         regtail(preg, ret, ender);
 377
 378         /* Hook the tails of the branches to the closing node. */
 379         for (br = ret; br != 0; br = regnext(preg, br))
 380                 regoptail(preg, br, ender);
 381
 382         /* Check for proper termination. */
 383         if (paren && *preg->regparse++ != ')') {
 384                 preg->err = REG_ERR_UNMATCHED_PAREN;
 385                 return 0;
 386         } else if (!paren && *preg->regparse != '\0') {
 387                 if (*preg->regparse == ')') {
 388                         preg->err = REG_ERR_UNMATCHED_PAREN;
 389                         return 0;
 390                 } else {
 391                         preg->err = REG_ERR_JUNK_ON_END;
 392                         return 0;
 393                 }
 394         }
 395
 396         return(ret);
 397 }
 398
 399 /*
 400  - regbranch - one alternative of an | operator
 401  *
 402  * Implements the concatenation operator.
 403  */
 404 static int regbranch(regex_t *preg, int *flagp )
 405 {
 406         int ret;
 407         int chain;
 408         int latest;
 409         int flags;
 410
 411         *flagp = WORST;         /* Tentatively. */
 412
 413         ret = regnode(preg, BRANCH);
 414         chain = 0;
 415         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 416                *preg->regparse != '|') {
 417                 latest = regpiece(preg, &flags);
 418                 if (latest == 0)
 419                         return 0;
 420                 *flagp |= flags&HASWIDTH;
 421                 if (chain == 0) {/* First piece. */
 422                         *flagp |= flags&SPSTART;
 423                 }
 424                 else {
 425                         regtail(preg, chain, latest);
 426                 }
 427                 chain = latest;
 428         }
 429         if (chain == 0) /* Loop ran zero times. */
 430                 (void) regnode(preg, NOTHING);
 431
 432         return(ret);
 433 }
 434
 435 /*
 436  - regpiece - something followed by possible [*+?]
 437  *
 438  * Note that the branching code sequences used for ? and the general cases
 439  * of * and + are somewhat optimized:  they use the same NOTHING node as
 440  * both the endmarker for their branch list and the body of the last branch.
 441  * It might seem that this node could be dispensed with entirely, but the
 442  * endmarker role is not redundant.
 443  */
 444 static int regpiece(regex_t *preg, int *flagp)
 445 {
 446         int ret;
 447         char op;
 448         int next;
 449         int flags;
 450         int min;
 451         int max;
 452
 453         ret = regatom(preg, &flags);
 454         if (ret == 0)
 455                 return 0;
 456
 457         op = *preg->regparse;
 458         if (!ISMULT(op)) {
 459                 *flagp = flags;
 460                 return(ret);
 461         }
 462
 463         if (!(flags&HASWIDTH) && op != '?') {
 464                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 465                 return 0;
 466         }
 467
 468         /* Handle braces (counted repetition) by expansion */
 469         if (op == '{') {
 470                 char *end;
 471
 472                 min = strtoul(preg->regparse + 1, &end, 10);
 473                 if (end == preg->regparse + 1) {
 474                         preg->err = REG_ERR_BAD_COUNT;
 475                         return 0;
 476                 }
 477                 if (*end == '}') {
 478                         max = min;
 479                 }
 480                 else if (*end == '\0') {
 481                         preg->err = REG_ERR_UNMATCHED_BRACES;
 482                         return 0;
 483                 }
 484                 else {
 485                         preg->regparse = end;
 486                         max = strtoul(preg->regparse + 1, &end, 10);
 487                         if (*end != '}') {
 488                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 489                                 return 0;
 490                         }
 491                 }
 492                 if (end == preg->regparse + 1) {
 493                         max = MAX_REP_COUNT;
 494                 }
 495                 else if (max < min || max >= 100) {
 496                         preg->err = REG_ERR_BAD_COUNT;
 497                         return 0;
 498                 }
 499                 if (min >= 100) {
 500                         preg->err = REG_ERR_BAD_COUNT;
 501                         return 0;
 502                 }
 503
 504                 preg->regparse = strchr(preg->regparse, '}');
 505         }
 506         else {
 507                 min = (op == '+');
 508                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 509         }
 510
 511         if (preg->regparse[1] == '?') {
 512                 preg->regparse++;
 513                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 514         }
 515         else {
 516                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 517         }
 518         preg->program[ret + 2] = max;
 519         preg->program[ret + 3] = min;
 520         preg->program[ret + 4] = 0;
 521
 522         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 523
 524         if (!(flags & SIMPLE)) {
 525                 int back = regnode(preg, BACK);
 526                 regtail(preg, back, ret);
 527                 regtail(preg, next, back);
 528         }
 529
 530         preg->regparse++;
 531         if (ISMULT(*preg->regparse)) {
 532                 preg->err = REG_ERR_NESTED_COUNT;
 533                 return 0;
 534         }
 535
 536         return ret;
 537 }
 538
 539 /**
 540  * Add all characters in the inclusive range between lower and upper.
 541  *
 542  * Handles a swapped range (upper < lower).
 543  */
 544 static void reg_addrange(regex_t *preg, int lower, int upper)
 545 {
 546         if (lower > upper) {
 547                 reg_addrange(preg, upper, lower);
 548         }
 549         /* Add a range as length, start */
 550         regc(preg, upper - lower + 1);
 551         regc(preg, lower);
 552 }
 553
 554 /**
 555  * Add a null-terminated literal string as a set of ranges.
 556  */
 557 static void reg_addrange_str(regex_t *preg, const char *str)
 558 {
 559         while (*str) {
 560                 reg_addrange(preg, *str, *str);
 561                 str++;
 562         }
 563 }
 564
 565 /**
 566  * Extracts the next unicode char from utf8.
 567  *
 568  * If 'upper' is set, converts the char to uppercase.
 569  */
 570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 571 {
 572         int l = utf8_tounicode(s, uc);
 573         if (upper) {
 574                 *uc = utf8_upper(*uc);
 575         }
 576         return l;
 577 }
 578
 579 /**
 580  * Converts a hex digit to decimal.
 581  *
 582  * Returns -1 for an invalid hex digit.
 583  */
 584 static int hexdigitval(int c)
 585 {
 586         if (c >= '0' && c <= '9')
 587                 return c - '0';
 588         if (c >= 'a' && c <= 'f')
 589                 return c - 'a' + 10;
 590         if (c >= 'A' && c <= 'F')
 591                 return c - 'A' + 10;
 592         return -1;
 593 }
 594
 595 /**
 596  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 597  *
 598  * Returns the number of hex digits parsed.
 599  * If there are no hex digits, returns 0 and stores nothing.
 600  */
 601 static int parse_hex(const char *s, int n, int *uc)
 602 {
 603         int val = 0;
 604         int k;
 605
 606         for (k = 0; k < n; k++) {
 607                 int c = hexdigitval(*s++);
 608                 if (c == -1) {
 609                         break;
 610                 }
 611                 val = (val << 4) | c;
 612         }
 613         if (k) {
 614                 *uc = val;
 615         }
 616         return k;
 617 }
 618
 619 /**
 620  * Call for chars after a backlash to decode the escape sequence.
 621  *
 622  * Stores the result in *ch.
 623  *
 624  * Returns the number of bytes consumed.
 625  */
 626 static int reg_decode_escape(const char *s, int *ch)
 627 {
 628         int n;
 629         const char *s0 = s;
 630
 631         *ch = *s++;
 632
 633         switch (*ch) {
 634                 case 'b': *ch = '\b'; break;
 635                 case 'e': *ch = 27; break;
 636                 case 'f': *ch = '\f'; break;
 637                 case 'n': *ch = '\n'; break;
 638                 case 'r': *ch = '\r'; break;
 639                 case 't': *ch = '\t'; break;
 640                 case 'v': *ch = '\v'; break;
 641                 case 'u':
 642                         if (*s == '{') {
 643                                 /* Expect \u{NNNN} */
 644                                 n = parse_hex(s + 1, 6, ch);
 645                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 646                                         s += n + 2;
 647                                 }
 648                                 else {
 649                                         /* Invalid, so just treat as an escaped 'u' */
 650                                         *ch = 'u';
 651                                 }
 652                         }
 653                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 654                                 s += n;
 655                         }
 656                         break;
 657                 case 'U':
 658                         if ((n = parse_hex(s, 8, ch)) > 0) {
 659                                 s += n;
 660                         }
 661                         break;
 662                 case 'x':
 663                         if ((n = parse_hex(s, 2, ch)) > 0) {
 664                                 s += n;
 665                         }
 666                         break;
 667                 case '\0':
 668                         s--;
 669                         *ch = '\\';
 670                         break;
 671         }
 672         return s - s0;
 673 }
 674
 675 /*
 676  - regatom - the lowest level
 677  *
 678  * Optimization:  gobbles an entire sequence of ordinary characters so that
 679  * it can turn them into a single node, which is smaller to store and
 680  * faster to run.  Backslashed characters are exceptions, each becoming a
 681  * separate node; the code is simpler that way and it's not worth fixing.
 682  */
 683 static int regatom(regex_t *preg, int *flagp)
 684 {
 685         int ret;
 686         int flags;
 687         int nocase = (preg->cflags & REG_ICASE);
 688
 689         int ch;
 690         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 691
 692         *flagp = WORST;         /* Tentatively. */
 693
 694         preg->regparse += n;
 695         switch (ch) {
 696         /* FIXME: these chars only have meaning at beg/end of pat? */
 697         case '^':
 698                 ret = regnode(preg, BOL);
 699                 break;
 700         case '$':
 701                 ret = regnode(preg, EOL);
 702                 break;
 703         case '.':
 704                 ret = regnode(preg, ANY);
 705                 *flagp |= HASWIDTH|SIMPLE;
 706                 break;
 707         case '[': {
 708                         const char *pattern = preg->regparse;
 709
 710                         if (*pattern == '^') {  /* Complement of range. */
 711                                 ret = regnode(preg, ANYBUT);
 712                                 pattern++;
 713                         } else
 714                                 ret = regnode(preg, ANYOF);
 715
 716                         /* Special case. If the first char is ']' or '-', it is part of the set */
 717                         if (*pattern == ']' || *pattern == '-') {
 718                                 reg_addrange(preg, *pattern, *pattern);
 719                                 pattern++;
 720                         }
 721
 722                         while (*pattern && *pattern != ']') {
 723                                 /* Is this a range? a-z */
 724                                 int start;
 725                                 int end;
 726
 727                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 728                                 if (start == '\\') {
 729                                         pattern += reg_decode_escape(pattern, &start);
 730                                         if (start == 0) {
 731                                                 preg->err = REG_ERR_NULL_CHAR;
 732                                                 return 0;
 733                                         }
 734                                 }
 735                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 736                                         /* skip '-' */
 737                                         pattern += utf8_tounicode(pattern, &end);
 738                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 739                                         if (end == '\\') {
 740                                                 pattern += reg_decode_escape(pattern, &end);
 741                                                 if (end == 0) {
 742                                                         preg->err = REG_ERR_NULL_CHAR;
 743                                                         return 0;
 744                                                 }
 745                                         }
 746
 747                                         reg_addrange(preg, start, end);
 748                                         continue;
 749                                 }
 750                                 if (start == '[' && pattern[0] == ':') {
 751                                         static const char *character_class[] = {
 752                                                 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
 753                                                 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
 754                                         };
 755                                         enum {
 756                                                 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
 757                                                 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
 758                                                 CC_NUM
 759                                         };
 760                                         int i;
 761
 762                                         for (i = 0; i < CC_NUM; i++) {
 763                                                 n = strlen(character_class[i]);
 764                                                 if (strncmp(pattern, character_class[i], n) == 0) {
 765                                                         /* Found a character class */
 766                                                         pattern += n + 1;
 767                                                         break;
 768                                                 }
 769                                         }
 770                                         if (i != CC_NUM) {
 771                                                 switch (i) {
 772                                                         case CC_ALNUM:
 773                                                                 reg_addrange(preg, '0', '9');
 774                                                                 /* Fall through */
 775                                                         case CC_ALPHA:
 776                                                                 if ((preg->cflags & REG_ICASE) == 0) {
 777                                                                         reg_addrange(preg, 'a', 'z');
 778                                                                 }
 779                                                                 reg_addrange(preg, 'A', 'Z');
 780                                                                 break;
 781                                                         case CC_SPACE:
 782                                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 783                                                                 break;
 784                                                         case CC_BLANK:
 785                                                                 reg_addrange_str(preg, " \t");
 786                                                                 break;
 787                                                         case CC_UPPER:
 788                                                                 reg_addrange(preg, 'A', 'Z');
 789                                                                 break;
 790                                                         case CC_LOWER:
 791                                                                 reg_addrange(preg, 'a', 'z');
 792                                                                 break;
 793                                                         case CC_XDIGIT:
 794                                                                 reg_addrange(preg, 'a', 'f');
 795                                                                 reg_addrange(preg, 'A', 'F');
 796                                                                 /* Fall through */
 797                                                         case CC_DIGIT:
 798                                                                 reg_addrange(preg, '0', '9');
 799                                                                 break;
 800                                                         case CC_CNTRL:
 801                                                                 reg_addrange(preg, 0, 31);
 802                                                                 reg_addrange(preg, 127, 127);
 803                                                                 break;
 804                                                         case CC_PRINT:
 805                                                                 reg_addrange(preg, ' ', '~');
 806                                                                 break;
 807                                                         case CC_GRAPH:
 808                                                                 reg_addrange(preg, '!', '~');
 809                                                                 break;
 810                                                         case CC_PUNCT:
 811                                                                 reg_addrange(preg, '!', '/');
 812                                                                 reg_addrange(preg, ':', '@');
 813                                                                 reg_addrange(preg, '[', '`');
 814                                                                 reg_addrange(preg, '{', '~');
 815                                                                 break;
 816                                                 }
 817                                                 continue;
 818                                         }
 819                                 }
 820                                 /* Not a range, so just add the char */
 821                                 reg_addrange(preg, start, start);
 822                         }
 823                         regc(preg, '\0');
 824
 825                         if (*pattern) {
 826                                 pattern++;
 827                         }
 828                         preg->regparse = pattern;
 829
 830                         *flagp |= HASWIDTH|SIMPLE;
 831                 }
 832                 break;
 833         case '(':
 834                 ret = reg(preg, 1, &flags);
 835                 if (ret == 0)
 836                         return 0;
 837                 *flagp |= flags&(HASWIDTH|SPSTART);
 838                 break;
 839         case '\0':
 840         case '|':
 841         case ')':
 842                 preg->err = REG_ERR_INTERNAL;
 843                 return 0;       /* Supposed to be caught earlier. */
 844         case '?':
 845         case '+':
 846         case '*':
 847         case '{':
 848                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 849                 return 0;
 850         case '\\':
 851                 ch = *preg->regparse++;
 852                 switch (ch) {
 853                 case '\0':
 854                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 855                         return 0;
 856                 case 'A':
 857                         ret = regnode(preg, BOLX);
 858                         break;
 859                 case 'Z':
 860                         ret = regnode(preg, EOLX);
 861                         break;
 862                 case '<':
 863                 case 'm':
 864                         ret = regnode(preg, WORDA);
 865                         break;
 866                 case '>':
 867                 case 'M':
 868                         ret = regnode(preg, WORDZ);
 869                         break;
 870                 case 'd':
 871                 case 'D':
 872                         ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
 873                         reg_addrange(preg, '0', '9');
 874                         regc(preg, '\0');
 875                         *flagp |= HASWIDTH|SIMPLE;
 876                         break;
 877                 case 'w':
 878                 case 'W':
 879                         ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
 880                         if ((preg->cflags & REG_ICASE) == 0) {
 881                                 reg_addrange(preg, 'a', 'z');
 882                         }
 883                         reg_addrange(preg, 'A', 'Z');
 884                         reg_addrange(preg, '0', '9');
 885                         reg_addrange(preg, '_', '_');
 886                         regc(preg, '\0');
 887                         *flagp |= HASWIDTH|SIMPLE;
 888                         break;
 889                 case 's':
 890                 case 'S':
 891                         ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
 892                         reg_addrange_str(preg," \t\r\n\f\v");
 893                         regc(preg, '\0');
 894                         *flagp |= HASWIDTH|SIMPLE;
 895                         break;
 896                 /* FIXME: Someday handle \1, \2, ... */
 897                 default:
 898                         /* Handle general quoted chars in exact-match routine */
 899                         /* Back up to include the backslash */
 900                         preg->regparse--;
 901                         goto de_fault;
 902                 }
 903                 break;
 904         de_fault:
 905         default: {
 906                         /*
 907                          * Encode a string of characters to be matched exactly.
 908                          */
 909                         int added = 0;
 910
 911                         /* Back up to pick up the first char of interest */
 912                         preg->regparse -= n;
 913
 914                         ret = regnode(preg, EXACTLY);
 915
 916                         /* Note that a META operator such as ? or * consumes the
 917                          * preceding char.
 918                          * Thus we must be careful to look ahead by 2 and add the
 919                          * last char as it's own EXACTLY if necessary
 920                          */
 921
 922                         /* Until end of string or a META char is reached */
 923                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 924                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 925                                 if (ch == '\\' && preg->regparse[n]) {
 926                                         /* Non-trailing backslash.
 927                                          * Is this a special escape, or a regular escape?
 928                                          */
 929                                         if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
 930                                                 /* A special escape. All done with EXACTLY */
 931                                                 break;
 932                                         }
 933                                         /* Decode it. Note that we add the length for the escape
 934                                          * sequence to the length for the backlash so we can skip
 935                                          * the entire sequence, or not as required.
 936                                          */
 937                                         n += reg_decode_escape(preg->regparse + n, &ch);
 938                                         if (ch == 0) {
 939                                                 preg->err = REG_ERR_NULL_CHAR;
 940                                                 return 0;
 941                                         }
 942                                 }
 943
 944                                 /* Now we have one char 'ch' of length 'n'.
 945                                  * Check to see if the following char is a MULT
 946                                  */
 947
 948                                 if (ISMULT(preg->regparse[n])) {
 949                                         /* Yes. But do we already have some EXACTLY chars? */
 950                                         if (added) {
 951                                                 /* Yes, so return what we have and pick up the current char next time around */
 952                                                 break;
 953                                         }
 954                                         /* No, so add this single char and finish */
 955                                         regc(preg, ch);
 956                                         added++;
 957                                         preg->regparse += n;
 958                                         break;
 959                                 }
 960
 961                                 /* No, so just add this char normally */
 962                                 regc(preg, ch);
 963                                 added++;
 964                                 preg->regparse += n;
 965                         }
 966                         regc(preg, '\0');
 967
 968                         *flagp |= HASWIDTH;
 969                         if (added == 1)
 970                                 *flagp |= SIMPLE;
 971                         break;
 972                 }
 973                 break;
 974         }
 975
 976         return(ret);
 977 }
 978
 979 static void reg_grow(regex_t *preg, int n)
 980 {
 981         if (preg->p + n >= preg->proglen) {
 982                 preg->proglen = (preg->p + n) * 2;
 983                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 984         }
 985 }
 986
 987 /*
 988  - regnode - emit a node
 989  */
 990 /* Location. */
 991 static int regnode(regex_t *preg, int op)
 992 {
 993         reg_grow(preg, 2);
 994
 995         /* The OP followed by a next pointer */
 996         preg->program[preg->p++] = op;
 997         preg->program[preg->p++] = 0;
 998
 999         /* Return the start of the node */
1000         return preg->p - 2;
1001 }
1002
1003 /*
1004  - regc - emit (if appropriate) a byte of code
1005  */
1006 static void regc(regex_t *preg, int b )
1007 {
1008         reg_grow(preg, 1);
1009         preg->program[preg->p++] = b;
1010 }
1011
1012 /*
1013  - reginsert - insert an operator in front of already-emitted operand
1014  *
1015  * Means relocating the operand.
1016  * Returns the new location of the original operand.
1017  */
1018 static int reginsert(regex_t *preg, int op, int size, int opnd )
1019 {
1020         reg_grow(preg, size);
1021
1022         /* Move everything from opnd up */
1023         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1024         /* Zero out the new space */
1025         memset(preg->program + opnd, 0, sizeof(int) * size);
1026
1027         preg->program[opnd] = op;
1028
1029         preg->p += size;
1030
1031         return opnd + size;
1032 }
1033
1034 /*
1035  - regtail - set the next-pointer at the end of a node chain
1036  */
1037 static void regtail(regex_t *preg, int p, int val)
1038 {
1039         int scan;
1040         int temp;
1041         int offset;
1042
1043         /* Find last node. */
1044         scan = p;
1045         for (;;) {
1046                 temp = regnext(preg, scan);
1047                 if (temp == 0)
1048                         break;
1049                 scan = temp;
1050         }
1051
1052         if (OP(preg, scan) == BACK)
1053                 offset = scan - val;
1054         else
1055                 offset = val - scan;
1056
1057         preg->program[scan + 1] = offset;
1058 }
1059
1060 /*
1061  - regoptail - regtail on operand of first argument; nop if operandless
1062  */
1063
1064 static void regoptail(regex_t *preg, int p, int val )
1065 {
1066         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1067         if (p != 0 && OP(preg, p) == BRANCH) {
1068                 regtail(preg, OPERAND(p), val);
1069         }
1070 }
1071
1072 /*
1073  * regexec and friends
1074  */
1075
1076 /*
1077  * Forwards.
1078  */
1079 static int regtry(regex_t *preg, const char *string );
1080 static int regmatch(regex_t *preg, int prog);
1081 static int regrepeat(regex_t *preg, int p, int max);
1082
1083 /*
1084  - regexec - match a regexp against a string
1085  */
1086 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1087 {
1088         const char *s;
1089         int scan;
1090
1091         /* Be paranoid... */
1092         if (preg == NULL || preg->program == NULL || string == NULL) {
1093                 return REG_ERR_NULL_ARGUMENT;
1094         }
1095
1096         /* Check validity of program. */
1097         if (*preg->program != REG_MAGIC) {
1098                 return REG_ERR_CORRUPTED;
1099         }
1100
1101 #ifdef DEBUG
1102         fprintf(stderr, "regexec: %s\n", string);
1103         regdump(preg);
1104 #endif
1105
1106         preg->eflags = eflags;
1107         preg->pmatch = pmatch;
1108         preg->nmatch = nmatch;
1109         preg->start = string;   /* All offsets are computed from here */
1110
1111         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1112         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1113                 int op = OP(preg, scan);
1114                 if (op == END)
1115                         break;
1116                 if (op == REPX || op == REPXMIN)
1117                         preg->program[scan + 4] = 0;
1118         }
1119
1120         /* If there is a "must appear" string, look for it. */
1121         if (preg->regmust != 0) {
1122                 s = string;
1123                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1124                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1125                                 break;
1126                         }
1127                         s++;
1128                 }
1129                 if (s == NULL)  /* Not present. */
1130                         return REG_NOMATCH;
1131         }
1132
1133         /* Mark beginning of line for ^ . */
1134         preg->regbol = string;
1135
1136         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1137         if (preg->reganch) {
1138                 if (eflags & REG_NOTBOL) {
1139                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1140                         goto nextline;
1141                 }
1142                 while (1) {
1143                         if (regtry(preg, string)) {
1144                                 return REG_NOERROR;
1145                         }
1146                         if (*string) {
1147 nextline:
1148                                 if (preg->cflags & REG_NEWLINE) {
1149                                         /* Try the next anchor? */
1150                                         string = strchr(string, '\n');
1151                                         if (string) {
1152                                                 preg->regbol = ++string;
1153                                                 continue;
1154                                         }
1155                                 }
1156                         }
1157                         return REG_NOMATCH;
1158                 }
1159         }
1160
1161         /* Messy cases:  unanchored match. */
1162         s = string;
1163         if (preg->regstart != '\0') {
1164                 /* We know what char it must start with. */
1165                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1166                         if (regtry(preg, s))
1167                                 return REG_NOERROR;
1168                         s++;
1169                 }
1170         }
1171         else
1172                 /* We don't -- general case. */
1173                 while (1) {
1174                         if (regtry(preg, s))
1175                                 return REG_NOERROR;
1176                         if (*s == '\0') {
1177                                 break;
1178                         }
1179                         else {
1180                                 int c;
1181                                 s += utf8_tounicode(s, &c);
1182                         }
1183                 }
1184
1185         /* Failure. */
1186         return REG_NOMATCH;
1187 }
1188
1189 /*
1190  - regtry - try match at specific point
1191  */
1192                         /* 0 failure, 1 success */
1193 static int regtry( regex_t *preg, const char *string )
1194 {
1195         int i;
1196
1197         preg->reginput = string;
1198
1199         for (i = 0; i < preg->nmatch; i++) {
1200                 preg->pmatch[i].rm_so = -1;
1201                 preg->pmatch[i].rm_eo = -1;
1202         }
1203         if (regmatch(preg, 1)) {
1204                 preg->pmatch[0].rm_so = string - preg->start;
1205                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1206                 return(1);
1207         } else
1208                 return(0);
1209 }
1210
1211 /**
1212  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1213  *
1214  * If 'nocase' is non-zero, does a case-insensitive match.
1215  *
1216  * Returns -1 on not found.
1217  */
1218 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1219 {
1220         const char *s = string;
1221         while (proglen && *s) {
1222                 int ch;
1223                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1224                 if (ch != *prog) {
1225                         return -1;
1226                 }
1227                 prog++;
1228                 s += n;
1229                 proglen--;
1230         }
1231         if (proglen == 0) {
1232                 return s - string;
1233         }
1234         return -1;
1235 }
1236
1237 /**
1238  * Searchs for 'c' in the range 'range'.
1239  *
1240  * Returns 1 if found, or 0 if not.
1241  */
1242 static int reg_range_find(const int *range, int c)
1243 {
1244         while (*range) {
1245                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1246                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1247                         return 1;
1248                 }
1249                 range += 2;
1250         }
1251         return 0;
1252 }
1253
1254 /**
1255  * Search for the character 'c' in the utf-8 string 'string'.
1256  *
1257  * If 'nocase' is set, the 'string' is assumed to be uppercase
1258  * and 'c' is converted to uppercase before matching.
1259  *
1260  * Returns the byte position in the string where the 'c' was found, or
1261  * NULL if not found.
1262  */
1263 static const char *str_find(const char *string, int c, int nocase)
1264 {
1265         if (nocase) {
1266                 /* The "string" should already be converted to uppercase */
1267                 c = utf8_upper(c);
1268         }
1269         while (*string) {
1270                 int ch;
1271                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1272                 if (c == ch) {
1273                         return string;
1274                 }
1275                 string += n;
1276         }
1277         return NULL;
1278 }
1279
1280 /**
1281  * Returns true if 'ch' is an end-of-line char.
1282  *
1283  * In REG_NEWLINE mode, \n is considered EOL in
1284  * addition to \0
1285  */
1286 static int reg_iseol(regex_t *preg, int ch)
1287 {
1288         if (preg->cflags & REG_NEWLINE) {
1289                 return ch == '\0' || ch == '\n';
1290         }
1291         else {
1292                 return ch == '\0';
1293         }
1294 }
1295
1296 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1297 {
1298         int nextch = '\0';
1299         const char *save;
1300         int no;
1301         int c;
1302
1303         int max = preg->program[scan + 2];
1304         int min = preg->program[scan + 3];
1305         int next = regnext(preg, scan);
1306
1307         /*
1308          * Lookahead to avoid useless match attempts
1309          * when we know what character comes next.
1310          */
1311         if (OP(preg, next) == EXACTLY) {
1312                 nextch = preg->program[OPERAND(next)];
1313         }
1314         save = preg->reginput;
1315         no = regrepeat(preg, scan + 5, max);
1316         if (no < min) {
1317                 return 0;
1318         }
1319         if (matchmin) {
1320                 /* from min up to no */
1321                 max = no;
1322                 no = min;
1323         }
1324         /* else from no down to min */
1325         while (1) {
1326                 if (matchmin) {
1327                         if (no > max) {
1328                                 break;
1329                         }
1330                 }
1331                 else {
1332                         if (no < min) {
1333                                 break;
1334                         }
1335                 }
1336                 preg->reginput = save + utf8_index(save, no);
1337                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1338                 /* If it could work, try it. */
1339                 if (reg_iseol(preg, nextch) || c == nextch) {
1340                         if (regmatch(preg, next)) {
1341                                 return(1);
1342                         }
1343                 }
1344                 if (matchmin) {
1345                         /* Couldn't or didn't, add one more */
1346                         no++;
1347                 }
1348                 else {
1349                         /* Couldn't or didn't -- back up. */
1350                         no--;
1351                 }
1352         }
1353         return(0);
1354 }
1355
1356 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1357 {
1358         int *scanpt = preg->program + scan;
1359
1360         int max = scanpt[2];
1361         int min = scanpt[3];
1362
1363         /* Have we reached min? */
1364         if (scanpt[4] < min) {
1365                 /* No, so get another one */
1366                 scanpt[4]++;
1367                 if (regmatch(preg, scan + 5)) {
1368                         return 1;
1369                 }
1370                 scanpt[4]--;
1371                 return 0;
1372         }
1373         if (scanpt[4] > max) {
1374                 return 0;
1375         }
1376
1377         if (matchmin) {
1378                 /* minimal, so try other branch first */
1379                 if (regmatch(preg, regnext(preg, scan))) {
1380                         return 1;
1381                 }
1382                 /* No, so try one more */
1383                 scanpt[4]++;
1384                 if (regmatch(preg, scan + 5)) {
1385                         return 1;
1386                 }
1387                 scanpt[4]--;
1388                 return 0;
1389         }
1390         /* maximal, so try this branch again */
1391         if (scanpt[4] < max) {
1392                 scanpt[4]++;
1393                 if (regmatch(preg, scan + 5)) {
1394                         return 1;
1395                 }
1396                 scanpt[4]--;
1397         }
1398         /* At this point we are at max with no match. Try the other branch */
1399         return regmatch(preg, regnext(preg, scan));
1400 }
1401
1402 /*
1403  - regmatch - main matching routine
1404  *
1405  * Conceptually the strategy is simple:  check to see whether the current
1406  * node matches, call self recursively to see whether the rest matches,
1407  * and then act accordingly.  In practice we make some effort to avoid
1408  * recursion, in particular by going through "ordinary" nodes (that don't
1409  * need to know whether the rest of the match failed) by a loop instead of
1410  * by recursion.
1411  */
1412 /* 0 failure, 1 success */
1413 static int regmatch(regex_t *preg, int prog)
1414 {
1415         int scan;       /* Current node. */
1416         int next;               /* Next node. */
1417         const char *save;
1418
1419         scan = prog;
1420
1421 #ifdef DEBUG
1422         if (scan != 0 && regnarrate)
1423                 fprintf(stderr, "%s(\n", regprop(scan));
1424 #endif
1425         while (scan != 0) {
1426                 int n;
1427                 int c;
1428 #ifdef DEBUG
1429                 if (regnarrate) {
1430                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1431                 }
1432 #endif
1433                 next = regnext(preg, scan);
1434                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1435
1436                 switch (OP(preg, scan)) {
1437                 case BOLX:
1438                         if ((preg->eflags & REG_NOTBOL)) {
1439                                 return(0);
1440                         }
1441                         /* Fall through */
1442                 case BOL:
1443                         if (preg->reginput != preg->regbol) {
1444                                 return(0);
1445                         }
1446                         break;
1447                 case EOLX:
1448                         if (c != 0) {
1449                                 /* For EOLX, only match real end of line, not newline */
1450                                 return 0;
1451                         }
1452                         break;
1453                 case EOL:
1454                         if (!reg_iseol(preg, c)) {
1455                                 return(0);
1456                         }
1457                         break;
1458                 case WORDA:
1459                         /* Must be looking at a letter, digit, or _ */
1460                         if ((!isalnum(UCHAR(c))) && c != '_')
1461                                 return(0);
1462                         /* Prev must be BOL or nonword */
1463                         if (preg->reginput > preg->regbol &&
1464                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1465                                 return(0);
1466                         break;
1467                 case WORDZ:
1468                         /* Can't match at BOL */
1469                         if (preg->reginput > preg->regbol) {
1470                                 /* Current must be EOL or nonword */
1471                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1472                                         c = preg->reginput[-1];
1473                                         /* Previous must be word */
1474                                         if (isalnum(UCHAR(c)) || c == '_') {
1475                                                 break;
1476                                         }
1477                                 }
1478                         }
1479                         /* No */
1480                         return(0);
1481
1482                 case ANY:
1483                         if (reg_iseol(preg, c))
1484                                 return 0;
1485                         preg->reginput += n;
1486                         break;
1487                 case EXACTLY: {
1488                                 int opnd;
1489                                 int len;
1490                                 int slen;
1491
1492                                 opnd = OPERAND(scan);
1493                                 len = str_int_len(preg->program + opnd);
1494
1495                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1496                                 if (slen < 0) {
1497                                         return(0);
1498                                 }
1499                                 preg->reginput += slen;
1500                         }
1501                         break;
1502                 case ANYOF:
1503                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1504                                 return(0);
1505                         }
1506                         preg->reginput += n;
1507                         break;
1508                 case ANYBUT:
1509                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1510                                 return(0);
1511                         }
1512                         preg->reginput += n;
1513                         break;
1514                 case NOTHING:
1515                         break;
1516                 case BACK:
1517                         break;
1518                 case BRANCH:
1519                         if (OP(preg, next) != BRANCH)           /* No choice. */
1520                                 next = OPERAND(scan);   /* Avoid recursion. */
1521                         else {
1522                                 do {
1523                                         save = preg->reginput;
1524                                         if (regmatch(preg, OPERAND(scan))) {
1525                                                 return(1);
1526                                         }
1527                                         preg->reginput = save;
1528                                         scan = regnext(preg, scan);
1529                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1530                                 return(0);
1531                                 /* NOTREACHED */
1532                         }
1533                         break;
1534                 case REP:
1535                 case REPMIN:
1536                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1537
1538                 case REPX:
1539                 case REPXMIN:
1540                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1541
1542                 case END:
1543                         return 1;       /* Success! */
1544
1545                 case OPENNC:
1546                 case CLOSENC:
1547                         return regmatch(preg, next);
1548
1549                 default:
1550                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1551                                 save = preg->reginput;
1552                                 if (regmatch(preg, next)) {
1553                                         if (OP(preg, scan) < CLOSE) {
1554                                                 int no = OP(preg, scan) - OPEN;
1555                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1556                                                         preg->pmatch[no].rm_so = save - preg->start;
1557                                                 }
1558                                         }
1559                                         else {
1560                                                 int no = OP(preg, scan) - CLOSE;
1561                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1562                                                         preg->pmatch[no].rm_eo = save - preg->start;
1563                                                 }
1564                                         }
1565                                         return(1);
1566                                 }
1567                                 return(0);
1568                         }
1569                         return REG_ERR_INTERNAL;
1570                 }
1571
1572                 scan = next;
1573         }
1574
1575         /*
1576          * We get here only if there's trouble -- normally "case END" is
1577          * the terminating point.
1578          */
1579         return REG_ERR_INTERNAL;
1580 }
1581
1582 /*
1583  - regrepeat - repeatedly match something simple, report how many
1584  */
1585 static int regrepeat(regex_t *preg, int p, int max)
1586 {
1587         int count = 0;
1588         const char *scan;
1589         int opnd;
1590         int ch;
1591         int n;
1592
1593         scan = preg->reginput;
1594         opnd = OPERAND(p);
1595         switch (OP(preg, p)) {
1596         case ANY:
1597                 /* No need to handle utf8 specially here */
1598                 while (!reg_iseol(preg, *scan) && count < max) {
1599                         count++;
1600                         scan++;
1601                 }
1602                 break;
1603         case EXACTLY:
1604                 while (count < max) {
1605                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1606                         if (preg->program[opnd] != ch) {
1607                                 break;
1608                         }
1609                         count++;
1610                         scan += n;
1611                 }
1612                 break;
1613         case ANYOF:
1614                 while (count < max) {
1615                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1616                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1617                                 break;
1618                         }
1619                         count++;
1620                         scan += n;
1621                 }
1622                 break;
1623         case ANYBUT:
1624                 while (count < max) {
1625                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1626                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1627                                 break;
1628                         }
1629                         count++;
1630                         scan += n;
1631                 }
1632                 break;
1633         default:                /* Oh dear.  Called inappropriately. */
1634                 preg->err = REG_ERR_INTERNAL;
1635                 count = 0;      /* Best compromise. */
1636                 break;
1637         }
1638         preg->reginput = scan;
1639
1640         return(count);
1641 }
1642
1643 /*
1644  - regnext - dig the "next" pointer out of a node
1645  */
1646 static int regnext(regex_t *preg, int p )
1647 {
1648         int offset;
1649
1650         offset = NEXT(preg, p);
1651
1652         if (offset == 0)
1653                 return 0;
1654
1655         if (OP(preg, p) == BACK)
1656                 return(p-offset);
1657         else
1658                 return(p+offset);
1659 }
1660
1661 /*
1662  - regopsize - returns the size of opcode + operands at 'p' in words
1663  */
1664 static int regopsize(regex_t *preg, int p )
1665 {
1666         /* Almost all opcodes are 2 words, but some are more */
1667         switch (OP(preg, p)) {
1668                 case REP:
1669                 case REPMIN:
1670                 case REPX:
1671                 case REPXMIN:
1672                         return 5;
1673
1674                 case ANYOF:
1675                 case ANYBUT:
1676                 case EXACTLY: {
1677                         int s = p + 2;
1678                         while (preg->program[s++]) {
1679                         }
1680                         return s - p;
1681                 }
1682         }
1683         return 2;
1684 }
1685
1686 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1687
1688 /*
1689  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1690  */
1691 static void regdump(regex_t *preg)
1692 {
1693         int s;
1694         int op = EXACTLY;       /* Arbitrary non-END op. */
1695         int next;
1696         char buf[MAX_UTF8_LEN + 1];
1697
1698         int i;
1699         for (i = 1; i < preg->p; i++) {
1700                 printf("%02x ", (unsigned char)preg->program[i]);
1701                 if (i % 16 == 0) {
1702                         printf("\n");
1703                 }
1704         }
1705         printf("\n");
1706
1707         s = 1;
1708         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1709                 op = OP(preg, s);
1710                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1711                 next = regnext(preg, s);
1712                 if (next == 0)          /* Next ptr. */
1713                         printf("(0)");
1714                 else
1715                         printf("(%d)", next);
1716                 s += 2;
1717                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1718                         int max = preg->program[s];
1719                         int min = preg->program[s + 1];
1720                         if (max == 65535) {
1721                                 printf("{%d,*}", min);
1722                         }
1723                         else {
1724                                 printf("{%d,%d}", min, max);
1725                         }
1726                         printf(" %d", preg->program[s + 2]);
1727                         s += 3;
1728                 }
1729                 else if (op == ANYOF || op == ANYBUT) {
1730                         /* set of ranges */
1731
1732                         while (preg->program[s]) {
1733                                 int len = preg->program[s++];
1734                                 int first = preg->program[s++];
1735                                 buf[utf8_getchars(buf, first)] = 0;
1736                                 printf("%s", buf);
1737                                 if (len > 1) {
1738                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1739                                         printf("-%s", buf);
1740                                 }
1741                         }
1742                         s++;
1743                 }
1744                 else if (op == EXACTLY) {
1745                         /* Literal string, where present. */
1746
1747                         while (preg->program[s]) {
1748                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1749                                 printf("%s", buf);
1750                                 s++;
1751                         }
1752                         s++;
1753                 }
1754                 putchar('\n');
1755         }
1756
1757         if (op == END) {
1758                 /* Header fields of interest. */
1759                 if (preg->regstart) {
1760                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1761                         printf("start '%s' ", buf);
1762                 }
1763                 if (preg->reganch)
1764                         printf("anchored ");
1765                 if (preg->regmust != 0) {
1766                         int i;
1767                         printf("must have:");
1768                         for (i = 0; i < preg->regmlen; i++) {
1769                                 putchar(preg->program[preg->regmust + i]);
1770                         }
1771                         putchar('\n');
1772                 }
1773         }
1774         printf("\n");
1775 }
1776
1777 /*
1778  - regprop - printable representation of opcode
1779  */
1780 static const char *regprop( int op )
1781 {
1782         static char buf[50];
1783
1784         switch (op) {
1785         case BOL:
1786                 return "BOL";
1787         case EOL:
1788                 return "EOL";
1789         case BOLX:
1790                 return "BOLX";
1791         case EOLX:
1792                 return "EOLX";
1793         case ANY:
1794                 return "ANY";
1795         case ANYOF:
1796                 return "ANYOF";
1797         case ANYBUT:
1798                 return "ANYBUT";
1799         case BRANCH:
1800                 return "BRANCH";
1801         case EXACTLY:
1802                 return "EXACTLY";
1803         case NOTHING:
1804                 return "NOTHING";
1805         case BACK:
1806                 return "BACK";
1807         case END:
1808                 return "END";
1809         case REP:
1810                 return "REP";
1811         case REPMIN:
1812                 return "REPMIN";
1813         case REPX:
1814                 return "REPX";
1815         case REPXMIN:
1816                 return "REPXMIN";
1817         case WORDA:
1818                 return "WORDA";
1819         case WORDZ:
1820                 return "WORDZ";
1821         case OPENNC:
1822                 return "OPEN";
1823         case CLOSENC:
1824                 return "CLOSE";
1825         default:
1826                 if (op >= OPEN && op < CLOSE) {
1827                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1828                 }
1829                 else if (op >= CLOSE && op < CLOSE_END) {
1830                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1831                 }
1832                 else {
1833                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1834                 }
1835                 return(buf);
1836         }
1837 }
1838 #endif /* JIM_BOOTSTRAP */
1839
1840 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1841 {
1842         static const char *error_strings[] = {
1843                 "success",
1844                 "no match",
1845                 "bad pattern",
1846                 "null argument",
1847                 "unknown error",
1848                 "too big",
1849                 "out of memory",
1850                 "too many ()",
1851                 "parentheses () not balanced",
1852                 "braces {} not balanced",
1853                 "invalid repetition count(s)",
1854                 "extra characters",
1855                 "*+ of empty atom",
1856                 "nested count",
1857                 "internal error",
1858                 "count follows nothing",
1859                 "trailing backslash",
1860                 "corrupted program",
1861                 "contains null char",
1862         };
1863         const char *err;
1864
1865         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1866                 err = "Bad error code";
1867         }
1868         else {
1869                 err = error_strings[errcode];
1870         }
1871
1872         return snprintf(errbuf, errbuf_size, "%s", err);
1873 }
1874
1875 void regfree(regex_t *preg)
1876 {
1877         free(preg->program);
1878 }
1879
1880 #endif