jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101 #define BOLX    14      /* no   Match "" at beginning of input. */
 102 #define EOLX    15      /* no   Match "" at end of input. */
 103 #define WORDA   16      /* no   Match "" at wordchar, where prev is nonword */
 104 #define WORDZ   17      /* no   Match "" at nonwordchar, where prev is word */
 105
 106 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 107 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 108                         /*      OPEN+1 is number 1, etc. */
 109
 110 /* must not be any other opts between OPEN and CLOSE */
 111
 112 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 113 #define CLOSE   2001    /* no   Analogous to OPEN. */
 114 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 115
 116 /*
 117  * The first word of the regexp internal "program" is actually this magic
 118  * number; the start node begins in the second word.
 119  */
 120 #define REG_MAGIC       0xFADED00D
 121
 122 /*
 123  * Opcode notes:
 124  *
 125  * BRANCH       The set of branches constituting a single choice are hooked
 126  *              together with their "next" pointers, since precedence prevents
 127  *              anything being concatenated to any individual branch.  The
 128  *              "next" pointer of the last BRANCH in a choice points to the
 129  *              thing following the whole choice.  This is also where the
 130  *              final "next" pointer of each individual branch points; each
 131  *              branch starts with the operand node of a BRANCH node.
 132  *
 133  * BACK         Normal "next" pointers all implicitly point forward; BACK
 134  *              exists to make loop structures possible.
 135  *
 136  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 137  *              as either simple repeats (REP) or complex repeats (REPX).
 138  *              These opcodes include a "min" and "max" count after the opcode.
 139  *              This is followed by a fourth "current count" word that is
 140  *              only used by REPX, as it implements a recursive match.
 141  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 142  *
 143  * OPEN,CLOSE   ...are numbered at compile time.
 144  */
 145
 146 /*
 147  * A node is one word of opcode followed by one word of "next" pointer.
 148  * The "next" pointer value is a positive offset from the opcode of the node
 149  * containing it.
 150  * An operand, if any, simply follows the node.  (Note that much of the
 151  * code generation knows about this implicit relationship.)
 152  */
 153 #define OP(preg, p)     (preg->program[p])
 154 #define NEXT(preg, p)   (preg->program[p + 1])
 155 #define OPERAND(p)      ((p) + 2)
 156
 157 /*
 158  * See regmagic.h for one further detail of program structure.
 159  */
 160
 161
 162 /*
 163  * Utility definitions.
 164  */
 165
 166 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 167 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 168 #define META            "^$.[()|?{+*"
 169
 170 /*
 171  * Flags to be passed up and down.
 172  */
 173 #define HASWIDTH        1       /* Known never to match null string. */
 174 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 175 #define SPSTART         4       /* Starts with * or +. */
 176 #define WORST           0       /* Worst case. */
 177
 178 #define MAX_REP_COUNT 1000000
 179
 180 /*
 181  * Forward declarations for regcomp()'s friends.
 182  */
 183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 184 static int regpiece(regex_t *preg, int *flagp );
 185 static int regbranch(regex_t *preg, int *flagp );
 186 static int regatom(regex_t *preg, int *flagp );
 187 static int regnode(regex_t *preg, int op );
 188 static int regnext(regex_t *preg, int p );
 189 static void regc(regex_t *preg, int b );
 190 static int reginsert(regex_t *preg, int op, int size, int opnd );
 191 static void regtail(regex_t *preg, int p, int val);
 192 static void regoptail(regex_t *preg, int p, int val );
 193 static int regopsize(regex_t *preg, int p );
 194
 195 static int reg_range_find(const int *string, int c);
 196 static const char *str_find(const char *string, int c, int nocase);
 197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 198
 199 /*#define DEBUG*/
 200 #ifdef DEBUG
 201 static int regnarrate = 0;
 202 static void regdump(regex_t *preg);
 203 static const char *regprop( int op );
 204 #endif
 205
 206
 207 /**
 208  * Returns the length of the null-terminated integer sequence.
 209  */
 210 static int str_int_len(const int *seq)
 211 {
 212         int n = 0;
 213         while (*seq++) {
 214                 n++;
 215         }
 216         return n;
 217 }
 218
 219 /*
 220  - regcomp - compile a regular expression into internal code
 221  *
 222  * We can't allocate space until we know how big the compiled form will be,
 223  * but we can't compile it (and thus know how big it is) until we've got a
 224  * place to put the code.  So we cheat:  we compile it twice, once with code
 225  * generation turned off and size counting turned on, and once "for real".
 226  * This also means that we don't allocate space until we are sure that the
 227  * thing really will compile successfully, and we never have to move the
 228  * code and thus invalidate pointers into it.  (Note that it has to be in
 229  * one piece because free() must be able to free it all.)
 230  *
 231  * Beware that the optimization-preparation code in here knows about some
 232  * of the structure of the compiled regexp.
 233  */
 234 int regcomp(regex_t *preg, const char *exp, int cflags)
 235 {
 236         int scan;
 237         int longest;
 238         unsigned len;
 239         int flags;
 240
 241 #ifdef DEBUG
 242         fprintf(stderr, "Compiling: '%s'\n", exp);
 243 #endif
 244         memset(preg, 0, sizeof(*preg));
 245
 246         if (exp == NULL)
 247                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 248
 249         /* First pass: determine size, legality. */
 250         preg->cflags = cflags;
 251         preg->regparse = exp;
 252
 253         /* Allocate space. */
 254         preg->proglen = (strlen(exp) + 1) * 5;
 255         preg->program = malloc(preg->proglen * sizeof(int));
 256         if (preg->program == NULL)
 257                 FAIL(preg, REG_ERR_NOMEM);
 258
 259         /* Note that since we store a magic value as the first item in the program,
 260          * program offsets will never be 0
 261          */
 262         regc(preg, REG_MAGIC);
 263         if (reg(preg, 0, &flags) == 0) {
 264                 return preg->err;
 265         }
 266
 267         /* Small enough for pointer-storage convention? */
 268         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 269                 FAIL(preg,REG_ERR_TOO_BIG);
 270
 271         /* Dig out information for optimizations. */
 272         preg->regstart = 0;     /* Worst-case defaults. */
 273         preg->reganch = 0;
 274         preg->regmust = 0;
 275         preg->regmlen = 0;
 276         scan = 1;                       /* First BRANCH. */
 277         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 278                 scan = OPERAND(scan);
 279
 280                 /* Starting-point info. */
 281                 if (OP(preg, scan) == EXACTLY) {
 282                         preg->regstart = preg->program[OPERAND(scan)];
 283                 }
 284                 else if (OP(preg, scan) == BOL)
 285                         preg->reganch++;
 286
 287                 /*
 288                  * If there's something expensive in the r.e., find the
 289                  * longest literal string that must appear and make it the
 290                  * regmust.  Resolve ties in favor of later strings, since
 291                  * the regstart check works with the beginning of the r.e.
 292                  * and avoiding duplication strengthens checking.  Not a
 293                  * strong reason, but sufficient in the absence of others.
 294                  */
 295                 if (flags&SPSTART) {
 296                         longest = 0;
 297                         len = 0;
 298                         for (; scan != 0; scan = regnext(preg, scan)) {
 299                                 if (OP(preg, scan) == EXACTLY) {
 300                                         int plen = str_int_len(preg->program + OPERAND(scan));
 301                                         if (plen >= len) {
 302                                                 longest = OPERAND(scan);
 303                                                 len = plen;
 304                                         }
 305                                 }
 306                         }
 307                         preg->regmust = longest;
 308                         preg->regmlen = len;
 309                 }
 310         }
 311
 312 #ifdef DEBUG
 313         regdump(preg);
 314 #endif
 315
 316         return 0;
 317 }
 318
 319 /*
 320  - reg - regular expression, i.e. main body or parenthesized thing
 321  *
 322  * Caller must absorb opening parenthesis.
 323  *
 324  * Combining parenthesis handling with the base level of regular expression
 325  * is a trifle forced, but the need to tie the tails of the branches to what
 326  * follows makes it hard to avoid.
 327  */
 328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 329 {
 330         int ret;
 331         int br;
 332         int ender;
 333         int parno = 0;
 334         int flags;
 335
 336         *flagp = HASWIDTH;      /* Tentatively. */
 337
 338         /* Make an OPEN node, if parenthesized. */
 339         if (paren) {
 340                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 341                         /* non-capturing paren */
 342                         preg->regparse += 2;
 343                         parno = -1;
 344                 }
 345                 else {
 346                         parno = ++preg->re_nsub;
 347                 }
 348                 ret = regnode(preg, OPEN+parno);
 349         } else
 350                 ret = 0;
 351
 352         /* Pick up the branches, linking them together. */
 353         br = regbranch(preg, &flags);
 354         if (br == 0)
 355                 return 0;
 356         if (ret != 0)
 357                 regtail(preg, ret, br); /* OPEN -> first. */
 358         else
 359                 ret = br;
 360         if (!(flags&HASWIDTH))
 361                 *flagp &= ~HASWIDTH;
 362         *flagp |= flags&SPSTART;
 363         while (*preg->regparse == '|') {
 364                 preg->regparse++;
 365                 br = regbranch(preg, &flags);
 366                 if (br == 0)
 367                         return 0;
 368                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 369                 if (!(flags&HASWIDTH))
 370                         *flagp &= ~HASWIDTH;
 371                 *flagp |= flags&SPSTART;
 372         }
 373
 374         /* Make a closing node, and hook it on the end. */
 375         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 376         regtail(preg, ret, ender);
 377
 378         /* Hook the tails of the branches to the closing node. */
 379         for (br = ret; br != 0; br = regnext(preg, br))
 380                 regoptail(preg, br, ender);
 381
 382         /* Check for proper termination. */
 383         if (paren && *preg->regparse++ != ')') {
 384                 preg->err = REG_ERR_UNMATCHED_PAREN;
 385                 return 0;
 386         } else if (!paren && *preg->regparse != '\0') {
 387                 if (*preg->regparse == ')') {
 388                         preg->err = REG_ERR_UNMATCHED_PAREN;
 389                         return 0;
 390                 } else {
 391                         preg->err = REG_ERR_JUNK_ON_END;
 392                         return 0;
 393                 }
 394         }
 395
 396         return(ret);
 397 }
 398
 399 /*
 400  - regbranch - one alternative of an | operator
 401  *
 402  * Implements the concatenation operator.
 403  */
 404 static int regbranch(regex_t *preg, int *flagp )
 405 {
 406         int ret;
 407         int chain;
 408         int latest;
 409         int flags;
 410
 411         *flagp = WORST;         /* Tentatively. */
 412
 413         ret = regnode(preg, BRANCH);
 414         chain = 0;
 415         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 416                *preg->regparse != '|') {
 417                 latest = regpiece(preg, &flags);
 418                 if (latest == 0)
 419                         return 0;
 420                 *flagp |= flags&HASWIDTH;
 421                 if (chain == 0) {/* First piece. */
 422                         *flagp |= flags&SPSTART;
 423                 }
 424                 else {
 425                         regtail(preg, chain, latest);
 426                 }
 427                 chain = latest;
 428         }
 429         if (chain == 0) /* Loop ran zero times. */
 430                 (void) regnode(preg, NOTHING);
 431
 432         return(ret);
 433 }
 434
 435 /*
 436  - regpiece - something followed by possible [*+?]
 437  *
 438  * Note that the branching code sequences used for ? and the general cases
 439  * of * and + are somewhat optimized:  they use the same NOTHING node as
 440  * both the endmarker for their branch list and the body of the last branch.
 441  * It might seem that this node could be dispensed with entirely, but the
 442  * endmarker role is not redundant.
 443  */
 444 static int regpiece(regex_t *preg, int *flagp)
 445 {
 446         int ret;
 447         char op;
 448         int next;
 449         int flags;
 450         int min;
 451         int max;
 452
 453         ret = regatom(preg, &flags);
 454         if (ret == 0)
 455                 return 0;
 456
 457         op = *preg->regparse;
 458         if (!ISMULT(op)) {
 459                 *flagp = flags;
 460                 return(ret);
 461         }
 462
 463         if (!(flags&HASWIDTH) && op != '?') {
 464                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 465                 return 0;
 466         }
 467
 468         /* Handle braces (counted repetition) by expansion */
 469         if (op == '{') {
 470                 char *end;
 471
 472                 min = strtoul(preg->regparse + 1, &end, 10);
 473                 if (end == preg->regparse + 1) {
 474                         preg->err = REG_ERR_BAD_COUNT;
 475                         return 0;
 476                 }
 477                 if (*end == '}') {
 478                         max = min;
 479                 }
 480                 else {
 481                         preg->regparse = end;
 482                         max = strtoul(preg->regparse + 1, &end, 10);
 483                         if (*end != '}') {
 484                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 485                                 return 0;
 486                         }
 487                 }
 488                 if (end == preg->regparse + 1) {
 489                         max = MAX_REP_COUNT;
 490                 }
 491                 else if (max < min || max >= 100) {
 492                         preg->err = REG_ERR_BAD_COUNT;
 493                         return 0;
 494                 }
 495                 if (min >= 100) {
 496                         preg->err = REG_ERR_BAD_COUNT;
 497                         return 0;
 498                 }
 499
 500                 preg->regparse = strchr(preg->regparse, '}');
 501         }
 502         else {
 503                 min = (op == '+');
 504                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 505         }
 506
 507         if (preg->regparse[1] == '?') {
 508                 preg->regparse++;
 509                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 510         }
 511         else {
 512                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 513         }
 514         preg->program[ret + 2] = max;
 515         preg->program[ret + 3] = min;
 516         preg->program[ret + 4] = 0;
 517
 518         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 519
 520         if (!(flags & SIMPLE)) {
 521                 int back = regnode(preg, BACK);
 522                 regtail(preg, back, ret);
 523                 regtail(preg, next, back);
 524         }
 525
 526         preg->regparse++;
 527         if (ISMULT(*preg->regparse)) {
 528                 preg->err = REG_ERR_NESTED_COUNT;
 529                 return 0;
 530         }
 531
 532         return ret;
 533 }
 534
 535 /**
 536  * Add all characters in the inclusive range between lower and upper.
 537  *
 538  * Handles a swapped range (upper < lower).
 539  */
 540 static void reg_addrange(regex_t *preg, int lower, int upper)
 541 {
 542         if (lower > upper) {
 543                 reg_addrange(preg, upper, lower);
 544         }
 545         /* Add a range as length, start */
 546         regc(preg, upper - lower + 1);
 547         regc(preg, lower);
 548 }
 549
 550 /**
 551  * Add a null-terminated literal string as a set of ranges.
 552  */
 553 static void reg_addrange_str(regex_t *preg, const char *str)
 554 {
 555         while (*str) {
 556                 reg_addrange(preg, *str, *str);
 557                 str++;
 558         }
 559 }
 560
 561 /**
 562  * Extracts the next unicode char from utf8.
 563  *
 564  * If 'upper' is set, converts the char to uppercase.
 565  */
 566 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 567 {
 568         int l = utf8_tounicode(s, uc);
 569         if (upper) {
 570                 *uc = utf8_upper(*uc);
 571         }
 572         return l;
 573 }
 574
 575 /**
 576  * Converts a hex digit to decimal.
 577  *
 578  * Returns -1 for an invalid hex digit.
 579  */
 580 static int hexdigitval(int c)
 581 {
 582         if (c >= '0' && c <= '9')
 583                 return c - '0';
 584         if (c >= 'a' && c <= 'f')
 585                 return c - 'a' + 10;
 586         if (c >= 'A' && c <= 'F')
 587                 return c - 'A' + 10;
 588         return -1;
 589 }
 590
 591 /**
 592  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 593  *
 594  * Returns the number of hex digits parsed.
 595  * If there are no hex digits, returns 0 and stores nothing.
 596  */
 597 static int parse_hex(const char *s, int n, int *uc)
 598 {
 599         int val = 0;
 600         int k;
 601
 602         for (k = 0; k < n; k++) {
 603                 int c = hexdigitval(*s++);
 604                 if (c == -1) {
 605                         break;
 606                 }
 607                 val = (val << 4) | c;
 608         }
 609         if (k) {
 610                 *uc = val;
 611         }
 612         return k;
 613 }
 614
 615 /**
 616  * Call for chars after a backlash to decode the escape sequence.
 617  *
 618  * Stores the result in *ch.
 619  *
 620  * Returns the number of bytes consumed.
 621  */
 622 static int reg_decode_escape(const char *s, int *ch)
 623 {
 624         int n;
 625         const char *s0 = s;
 626
 627         *ch = *s++;
 628
 629         switch (*ch) {
 630                 case 'b': *ch = '\b'; break;
 631                 case 'e': *ch = 27; break;
 632                 case 'f': *ch = '\f'; break;
 633                 case 'n': *ch = '\n'; break;
 634                 case 'r': *ch = '\r'; break;
 635                 case 't': *ch = '\t'; break;
 636                 case 'v': *ch = '\v'; break;
 637                 case 'u':
 638                         if (*s == '{') {
 639                                 /* Expect \u{NNNN} */
 640                                 n = parse_hex(s + 1, 6, ch);
 641                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 642                                         s += n + 2;
 643                                 }
 644                                 else {
 645                                         /* Invalid, so just treat as an escaped 'u' */
 646                                         *ch = 'u';
 647                                 }
 648                         }
 649                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 650                                 s += n;
 651                         }
 652                         break;
 653                 case 'U':
 654                         if ((n = parse_hex(s, 8, ch)) > 0) {
 655                                 s += n;
 656                         }
 657                         break;
 658                 case 'x':
 659                         if ((n = parse_hex(s, 2, ch)) > 0) {
 660                                 s += n;
 661                         }
 662                         break;
 663                 case '\0':
 664                         s--;
 665                         *ch = '\\';
 666                         break;
 667         }
 668         return s - s0;
 669 }
 670
 671 /*
 672  - regatom - the lowest level
 673  *
 674  * Optimization:  gobbles an entire sequence of ordinary characters so that
 675  * it can turn them into a single node, which is smaller to store and
 676  * faster to run.  Backslashed characters are exceptions, each becoming a
 677  * separate node; the code is simpler that way and it's not worth fixing.
 678  */
 679 static int regatom(regex_t *preg, int *flagp)
 680 {
 681         int ret;
 682         int flags;
 683         int nocase = (preg->cflags & REG_ICASE);
 684
 685         int ch;
 686         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 687
 688         *flagp = WORST;         /* Tentatively. */
 689
 690         preg->regparse += n;
 691         switch (ch) {
 692         /* FIXME: these chars only have meaning at beg/end of pat? */
 693         case '^':
 694                 ret = regnode(preg, BOL);
 695                 break;
 696         case '$':
 697                 ret = regnode(preg, EOL);
 698                 break;
 699         case '.':
 700                 ret = regnode(preg, ANY);
 701                 *flagp |= HASWIDTH|SIMPLE;
 702                 break;
 703         case '[': {
 704                         const char *pattern = preg->regparse;
 705
 706                         if (*pattern == '^') {  /* Complement of range. */
 707                                 ret = regnode(preg, ANYBUT);
 708                                 pattern++;
 709                         } else
 710                                 ret = regnode(preg, ANYOF);
 711
 712                         /* Special case. If the first char is ']' or '-', it is part of the set */
 713                         if (*pattern == ']' || *pattern == '-') {
 714                                 reg_addrange(preg, *pattern, *pattern);
 715                                 pattern++;
 716                         }
 717
 718                         while (*pattern && *pattern != ']') {
 719                                 /* Is this a range? a-z */
 720                                 int start;
 721                                 int end;
 722
 723                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 724                                 if (start == '\\') {
 725                                         pattern += reg_decode_escape(pattern, &start);
 726                                         if (start == 0) {
 727                                                 preg->err = REG_ERR_NULL_CHAR;
 728                                                 return 0;
 729                                         }
 730                                 }
 731                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 732                                         /* skip '-' */
 733                                         pattern += utf8_tounicode(pattern, &end);
 734                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 735                                         if (end == '\\') {
 736                                                 pattern += reg_decode_escape(pattern, &end);
 737                                                 if (end == 0) {
 738                                                         preg->err = REG_ERR_NULL_CHAR;
 739                                                         return 0;
 740                                                 }
 741                                         }
 742
 743                                         reg_addrange(preg, start, end);
 744                                         continue;
 745                                 }
 746                                 if (start == '[') {
 747                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 748                                                 if ((preg->cflags & REG_ICASE) == 0) {
 749                                                         reg_addrange(preg, 'a', 'z');
 750                                                 }
 751                                                 reg_addrange(preg, 'A', 'Z');
 752                                                 pattern += 8;
 753                                                 continue;
 754                                         }
 755                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 756                                                 if ((preg->cflags & REG_ICASE) == 0) {
 757                                                         reg_addrange(preg, 'a', 'z');
 758                                                 }
 759                                                 reg_addrange(preg, 'A', 'Z');
 760                                                 reg_addrange(preg, '0', '9');
 761                                                 pattern += 8;
 762                                                 continue;
 763                                         }
 764                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 765                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 766                                                 pattern += 8;
 767                                                 continue;
 768                                         }
 769                                 }
 770                                 /* Not a range, so just add the char */
 771                                 reg_addrange(preg, start, start);
 772                         }
 773                         regc(preg, '\0');
 774
 775                         if (*pattern) {
 776                                 pattern++;
 777                         }
 778                         preg->regparse = pattern;
 779
 780                         *flagp |= HASWIDTH|SIMPLE;
 781                 }
 782                 break;
 783         case '(':
 784                 ret = reg(preg, 1, &flags);
 785                 if (ret == 0)
 786                         return 0;
 787                 *flagp |= flags&(HASWIDTH|SPSTART);
 788                 break;
 789         case '\0':
 790         case '|':
 791         case ')':
 792                 preg->err = REG_ERR_INTERNAL;
 793                 return 0;       /* Supposed to be caught earlier. */
 794         case '?':
 795         case '+':
 796         case '*':
 797         case '{':
 798                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 799                 return 0;
 800         case '\\':
 801                 ch = *preg->regparse++;
 802                 switch (ch) {
 803                 case '\0':
 804                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 805                         return 0;
 806                 case 'A':
 807                         ret = regnode(preg, BOLX);
 808                         break;
 809                 case 'Z':
 810                         ret = regnode(preg, EOLX);
 811                         break;
 812                 case '<':
 813                 case 'm':
 814                         ret = regnode(preg, WORDA);
 815                         break;
 816                 case '>':
 817                 case 'M':
 818                         ret = regnode(preg, WORDZ);
 819                         break;
 820                 case 'd':
 821                 case 'D':
 822                         ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
 823                         reg_addrange(preg, '0', '9');
 824                         regc(preg, '\0');
 825                         *flagp |= HASWIDTH|SIMPLE;
 826                         break;
 827                 case 'w':
 828                 case 'W':
 829                         ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
 830                         if ((preg->cflags & REG_ICASE) == 0) {
 831                                 reg_addrange(preg, 'a', 'z');
 832                         }
 833                         reg_addrange(preg, 'A', 'Z');
 834                         reg_addrange(preg, '0', '9');
 835                         reg_addrange(preg, '_', '_');
 836                         regc(preg, '\0');
 837                         *flagp |= HASWIDTH|SIMPLE;
 838                         break;
 839                 case 's':
 840                 case 'S':
 841                         ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
 842                         reg_addrange_str(preg," \t\r\n\f\v");
 843                         regc(preg, '\0');
 844                         *flagp |= HASWIDTH|SIMPLE;
 845                         break;
 846                 /* FIXME: Someday handle \1, \2, ... */
 847                 default:
 848                         /* Handle general quoted chars in exact-match routine */
 849                         /* Back up to include the backslash */
 850                         preg->regparse--;
 851                         goto de_fault;
 852                 }
 853                 break;
 854         de_fault:
 855         default: {
 856                         /*
 857                          * Encode a string of characters to be matched exactly.
 858                          */
 859                         int added = 0;
 860
 861                         /* Back up to pick up the first char of interest */
 862                         preg->regparse -= n;
 863
 864                         ret = regnode(preg, EXACTLY);
 865
 866                         /* Note that a META operator such as ? or * consumes the
 867                          * preceding char.
 868                          * Thus we must be careful to look ahead by 2 and add the
 869                          * last char as it's own EXACTLY if necessary
 870                          */
 871
 872                         /* Until end of string or a META char is reached */
 873                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 874                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 875                                 if (ch == '\\' && preg->regparse[n]) {
 876                                         /* Non-trailing backslash.
 877                                          * Is this a special escape, or a regular escape?
 878                                          */
 879                                         if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
 880                                                 /* A special escape. All done with EXACTLY */
 881                                                 break;
 882                                         }
 883                                         /* Decode it. Note that we add the length for the escape
 884                                          * sequence to the length for the backlash so we can skip
 885                                          * the entire sequence, or not as required.
 886                                          */
 887                                         n += reg_decode_escape(preg->regparse + n, &ch);
 888                                         if (ch == 0) {
 889                                                 preg->err = REG_ERR_NULL_CHAR;
 890                                                 return 0;
 891                                         }
 892                                 }
 893
 894                                 /* Now we have one char 'ch' of length 'n'.
 895                                  * Check to see if the following char is a MULT
 896                                  */
 897
 898                                 if (ISMULT(preg->regparse[n])) {
 899                                         /* Yes. But do we already have some EXACTLY chars? */
 900                                         if (added) {
 901                                                 /* Yes, so return what we have and pick up the current char next time around */
 902                                                 break;
 903                                         }
 904                                         /* No, so add this single char and finish */
 905                                         regc(preg, ch);
 906                                         added++;
 907                                         preg->regparse += n;
 908                                         break;
 909                                 }
 910
 911                                 /* No, so just add this char normally */
 912                                 regc(preg, ch);
 913                                 added++;
 914                                 preg->regparse += n;
 915                         }
 916                         regc(preg, '\0');
 917
 918                         *flagp |= HASWIDTH;
 919                         if (added == 1)
 920                                 *flagp |= SIMPLE;
 921                         break;
 922                 }
 923                 break;
 924         }
 925
 926         return(ret);
 927 }
 928
 929 static void reg_grow(regex_t *preg, int n)
 930 {
 931         if (preg->p + n >= preg->proglen) {
 932                 preg->proglen = (preg->p + n) * 2;
 933                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 934         }
 935 }
 936
 937 /*
 938  - regnode - emit a node
 939  */
 940 /* Location. */
 941 static int regnode(regex_t *preg, int op)
 942 {
 943         reg_grow(preg, 2);
 944
 945         /* The OP followed by a next pointer */
 946         preg->program[preg->p++] = op;
 947         preg->program[preg->p++] = 0;
 948
 949         /* Return the start of the node */
 950         return preg->p - 2;
 951 }
 952
 953 /*
 954  - regc - emit (if appropriate) a byte of code
 955  */
 956 static void regc(regex_t *preg, int b )
 957 {
 958         reg_grow(preg, 1);
 959         preg->program[preg->p++] = b;
 960 }
 961
 962 /*
 963  - reginsert - insert an operator in front of already-emitted operand
 964  *
 965  * Means relocating the operand.
 966  * Returns the new location of the original operand.
 967  */
 968 static int reginsert(regex_t *preg, int op, int size, int opnd )
 969 {
 970         reg_grow(preg, size);
 971
 972         /* Move everything from opnd up */
 973         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
 974         /* Zero out the new space */
 975         memset(preg->program + opnd, 0, sizeof(int) * size);
 976
 977         preg->program[opnd] = op;
 978
 979         preg->p += size;
 980
 981         return opnd + size;
 982 }
 983
 984 /*
 985  - regtail - set the next-pointer at the end of a node chain
 986  */
 987 static void regtail(regex_t *preg, int p, int val)
 988 {
 989         int scan;
 990         int temp;
 991         int offset;
 992
 993         /* Find last node. */
 994         scan = p;
 995         for (;;) {
 996                 temp = regnext(preg, scan);
 997                 if (temp == 0)
 998                         break;
 999                 scan = temp;
1000         }
1001
1002         if (OP(preg, scan) == BACK)
1003                 offset = scan - val;
1004         else
1005                 offset = val - scan;
1006
1007         preg->program[scan + 1] = offset;
1008 }
1009
1010 /*
1011  - regoptail - regtail on operand of first argument; nop if operandless
1012  */
1013
1014 static void regoptail(regex_t *preg, int p, int val )
1015 {
1016         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1017         if (p != 0 && OP(preg, p) == BRANCH) {
1018                 regtail(preg, OPERAND(p), val);
1019         }
1020 }
1021
1022 /*
1023  * regexec and friends
1024  */
1025
1026 /*
1027  * Forwards.
1028  */
1029 static int regtry(regex_t *preg, const char *string );
1030 static int regmatch(regex_t *preg, int prog);
1031 static int regrepeat(regex_t *preg, int p, int max);
1032
1033 /*
1034  - regexec - match a regexp against a string
1035  */
1036 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1037 {
1038         const char *s;
1039         int scan;
1040
1041         /* Be paranoid... */
1042         if (preg == NULL || preg->program == NULL || string == NULL) {
1043                 return REG_ERR_NULL_ARGUMENT;
1044         }
1045
1046         /* Check validity of program. */
1047         if (*preg->program != REG_MAGIC) {
1048                 return REG_ERR_CORRUPTED;
1049         }
1050
1051 #ifdef DEBUG
1052         fprintf(stderr, "regexec: %s\n", string);
1053         regdump(preg);
1054 #endif
1055
1056         preg->eflags = eflags;
1057         preg->pmatch = pmatch;
1058         preg->nmatch = nmatch;
1059         preg->start = string;   /* All offsets are computed from here */
1060
1061         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1062         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1063                 int op = OP(preg, scan);
1064                 if (op == END)
1065                         break;
1066                 if (op == REPX || op == REPXMIN)
1067                         preg->program[scan + 4] = 0;
1068         }
1069
1070         /* If there is a "must appear" string, look for it. */
1071         if (preg->regmust != 0) {
1072                 s = string;
1073                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1074                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1075                                 break;
1076                         }
1077                         s++;
1078                 }
1079                 if (s == NULL)  /* Not present. */
1080                         return REG_NOMATCH;
1081         }
1082
1083         /* Mark beginning of line for ^ . */
1084         preg->regbol = string;
1085
1086         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1087         if (preg->reganch) {
1088                 if (eflags & REG_NOTBOL) {
1089                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1090                         goto nextline;
1091                 }
1092                 while (1) {
1093                         if (regtry(preg, string)) {
1094                                 return REG_NOERROR;
1095                         }
1096                         if (*string) {
1097 nextline:
1098                                 if (preg->cflags & REG_NEWLINE) {
1099                                         /* Try the next anchor? */
1100                                         string = strchr(string, '\n');
1101                                         if (string) {
1102                                                 preg->regbol = ++string;
1103                                                 continue;
1104                                         }
1105                                 }
1106                         }
1107                         return REG_NOMATCH;
1108                 }
1109         }
1110
1111         /* Messy cases:  unanchored match. */
1112         s = string;
1113         if (preg->regstart != '\0') {
1114                 /* We know what char it must start with. */
1115                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1116                         if (regtry(preg, s))
1117                                 return REG_NOERROR;
1118                         s++;
1119                 }
1120         }
1121         else
1122                 /* We don't -- general case. */
1123                 while (1) {
1124                         if (regtry(preg, s))
1125                                 return REG_NOERROR;
1126                         if (*s == '\0') {
1127                                 break;
1128                         }
1129                         else {
1130                                 int c;
1131                                 s += utf8_tounicode(s, &c);
1132                         }
1133                 }
1134
1135         /* Failure. */
1136         return REG_NOMATCH;
1137 }
1138
1139 /*
1140  - regtry - try match at specific point
1141  */
1142                         /* 0 failure, 1 success */
1143 static int regtry( regex_t *preg, const char *string )
1144 {
1145         int i;
1146
1147         preg->reginput = string;
1148
1149         for (i = 0; i < preg->nmatch; i++) {
1150                 preg->pmatch[i].rm_so = -1;
1151                 preg->pmatch[i].rm_eo = -1;
1152         }
1153         if (regmatch(preg, 1)) {
1154                 preg->pmatch[0].rm_so = string - preg->start;
1155                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1156                 return(1);
1157         } else
1158                 return(0);
1159 }
1160
1161 /**
1162  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1163  *
1164  * If 'nocase' is non-zero, does a case-insensitive match.
1165  *
1166  * Returns -1 on not found.
1167  */
1168 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1169 {
1170         const char *s = string;
1171         while (proglen && *s) {
1172                 int ch;
1173                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1174                 if (ch != *prog) {
1175                         return -1;
1176                 }
1177                 prog++;
1178                 s += n;
1179                 proglen--;
1180         }
1181         if (proglen == 0) {
1182                 return s - string;
1183         }
1184         return -1;
1185 }
1186
1187 /**
1188  * Searchs for 'c' in the range 'range'.
1189  *
1190  * Returns 1 if found, or 0 if not.
1191  */
1192 static int reg_range_find(const int *range, int c)
1193 {
1194         while (*range) {
1195                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1196                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1197                         return 1;
1198                 }
1199                 range += 2;
1200         }
1201         return 0;
1202 }
1203
1204 /**
1205  * Search for the character 'c' in the utf-8 string 'string'.
1206  *
1207  * If 'nocase' is set, the 'string' is assumed to be uppercase
1208  * and 'c' is converted to uppercase before matching.
1209  *
1210  * Returns the byte position in the string where the 'c' was found, or
1211  * NULL if not found.
1212  */
1213 static const char *str_find(const char *string, int c, int nocase)
1214 {
1215         if (nocase) {
1216                 /* The "string" should already be converted to uppercase */
1217                 c = utf8_upper(c);
1218         }
1219         while (*string) {
1220                 int ch;
1221                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1222                 if (c == ch) {
1223                         return string;
1224                 }
1225                 string += n;
1226         }
1227         return NULL;
1228 }
1229
1230 /**
1231  * Returns true if 'ch' is an end-of-line char.
1232  *
1233  * In REG_NEWLINE mode, \n is considered EOL in
1234  * addition to \0
1235  */
1236 static int reg_iseol(regex_t *preg, int ch)
1237 {
1238         if (preg->cflags & REG_NEWLINE) {
1239                 return ch == '\0' || ch == '\n';
1240         }
1241         else {
1242                 return ch == '\0';
1243         }
1244 }
1245
1246 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1247 {
1248         int nextch = '\0';
1249         const char *save;
1250         int no;
1251         int c;
1252
1253         int max = preg->program[scan + 2];
1254         int min = preg->program[scan + 3];
1255         int next = regnext(preg, scan);
1256
1257         /*
1258          * Lookahead to avoid useless match attempts
1259          * when we know what character comes next.
1260          */
1261         if (OP(preg, next) == EXACTLY) {
1262                 nextch = preg->program[OPERAND(next)];
1263         }
1264         save = preg->reginput;
1265         no = regrepeat(preg, scan + 5, max);
1266         if (no < min) {
1267                 return 0;
1268         }
1269         if (matchmin) {
1270                 /* from min up to no */
1271                 max = no;
1272                 no = min;
1273         }
1274         /* else from no down to min */
1275         while (1) {
1276                 if (matchmin) {
1277                         if (no > max) {
1278                                 break;
1279                         }
1280                 }
1281                 else {
1282                         if (no < min) {
1283                                 break;
1284                         }
1285                 }
1286                 preg->reginput = save + utf8_index(save, no);
1287                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1288                 /* If it could work, try it. */
1289                 if (reg_iseol(preg, nextch) || c == nextch) {
1290                         if (regmatch(preg, next)) {
1291                                 return(1);
1292                         }
1293                 }
1294                 if (matchmin) {
1295                         /* Couldn't or didn't, add one more */
1296                         no++;
1297                 }
1298                 else {
1299                         /* Couldn't or didn't -- back up. */
1300                         no--;
1301                 }
1302         }
1303         return(0);
1304 }
1305
1306 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1307 {
1308         int *scanpt = preg->program + scan;
1309
1310         int max = scanpt[2];
1311         int min = scanpt[3];
1312
1313         /* Have we reached min? */
1314         if (scanpt[4] < min) {
1315                 /* No, so get another one */
1316                 scanpt[4]++;
1317                 if (regmatch(preg, scan + 5)) {
1318                         return 1;
1319                 }
1320                 scanpt[4]--;
1321                 return 0;
1322         }
1323         if (scanpt[4] > max) {
1324                 return 0;
1325         }
1326
1327         if (matchmin) {
1328                 /* minimal, so try other branch first */
1329                 if (regmatch(preg, regnext(preg, scan))) {
1330                         return 1;
1331                 }
1332                 /* No, so try one more */
1333                 scanpt[4]++;
1334                 if (regmatch(preg, scan + 5)) {
1335                         return 1;
1336                 }
1337                 scanpt[4]--;
1338                 return 0;
1339         }
1340         /* maximal, so try this branch again */
1341         if (scanpt[4] < max) {
1342                 scanpt[4]++;
1343                 if (regmatch(preg, scan + 5)) {
1344                         return 1;
1345                 }
1346                 scanpt[4]--;
1347         }
1348         /* At this point we are at max with no match. Try the other branch */
1349         return regmatch(preg, regnext(preg, scan));
1350 }
1351
1352 /*
1353  - regmatch - main matching routine
1354  *
1355  * Conceptually the strategy is simple:  check to see whether the current
1356  * node matches, call self recursively to see whether the rest matches,
1357  * and then act accordingly.  In practice we make some effort to avoid
1358  * recursion, in particular by going through "ordinary" nodes (that don't
1359  * need to know whether the rest of the match failed) by a loop instead of
1360  * by recursion.
1361  */
1362 /* 0 failure, 1 success */
1363 static int regmatch(regex_t *preg, int prog)
1364 {
1365         int scan;       /* Current node. */
1366         int next;               /* Next node. */
1367         const char *save;
1368
1369         scan = prog;
1370
1371 #ifdef DEBUG
1372         if (scan != 0 && regnarrate)
1373                 fprintf(stderr, "%s(\n", regprop(scan));
1374 #endif
1375         while (scan != 0) {
1376                 int n;
1377                 int c;
1378 #ifdef DEBUG
1379                 if (regnarrate) {
1380                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1381                 }
1382 #endif
1383                 next = regnext(preg, scan);
1384                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1385
1386                 switch (OP(preg, scan)) {
1387                 case BOLX:
1388                         if ((preg->eflags & REG_NOTBOL)) {
1389                                 return(0);
1390                         }
1391                         /* Fall through */
1392                 case BOL:
1393                         if (preg->reginput != preg->regbol) {
1394                                 return(0);
1395                         }
1396                         break;
1397                 case EOLX:
1398                         if (c != 0) {
1399                                 /* For EOLX, only match real end of line, not newline */
1400                                 return 0;
1401                         }
1402                         break;
1403                 case EOL:
1404                         if (!reg_iseol(preg, c)) {
1405                                 return(0);
1406                         }
1407                         break;
1408                 case WORDA:
1409                         /* Must be looking at a letter, digit, or _ */
1410                         if ((!isalnum(UCHAR(c))) && c != '_')
1411                                 return(0);
1412                         /* Prev must be BOL or nonword */
1413                         if (preg->reginput > preg->regbol &&
1414                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1415                                 return(0);
1416                         break;
1417                 case WORDZ:
1418                         /* Can't match at BOL */
1419                         if (preg->reginput > preg->regbol) {
1420                                 /* Current must be EOL or nonword */
1421                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1422                                         c = preg->reginput[-1];
1423                                         /* Previous must be word */
1424                                         if (isalnum(UCHAR(c)) || c == '_') {
1425                                                 break;
1426                                         }
1427                                 }
1428                         }
1429                         /* No */
1430                         return(0);
1431
1432                 case ANY:
1433                         if (reg_iseol(preg, c))
1434                                 return 0;
1435                         preg->reginput += n;
1436                         break;
1437                 case EXACTLY: {
1438                                 int opnd;
1439                                 int len;
1440                                 int slen;
1441
1442                                 opnd = OPERAND(scan);
1443                                 len = str_int_len(preg->program + opnd);
1444
1445                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1446                                 if (slen < 0) {
1447                                         return(0);
1448                                 }
1449                                 preg->reginput += slen;
1450                         }
1451                         break;
1452                 case ANYOF:
1453                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1454                                 return(0);
1455                         }
1456                         preg->reginput += n;
1457                         break;
1458                 case ANYBUT:
1459                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1460                                 return(0);
1461                         }
1462                         preg->reginput += n;
1463                         break;
1464                 case NOTHING:
1465                         break;
1466                 case BACK:
1467                         break;
1468                 case BRANCH:
1469                         if (OP(preg, next) != BRANCH)           /* No choice. */
1470                                 next = OPERAND(scan);   /* Avoid recursion. */
1471                         else {
1472                                 do {
1473                                         save = preg->reginput;
1474                                         if (regmatch(preg, OPERAND(scan))) {
1475                                                 return(1);
1476                                         }
1477                                         preg->reginput = save;
1478                                         scan = regnext(preg, scan);
1479                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1480                                 return(0);
1481                                 /* NOTREACHED */
1482                         }
1483                         break;
1484                 case REP:
1485                 case REPMIN:
1486                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1487
1488                 case REPX:
1489                 case REPXMIN:
1490                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1491
1492                 case END:
1493                         return 1;       /* Success! */
1494
1495                 case OPENNC:
1496                 case CLOSENC:
1497                         return regmatch(preg, next);
1498
1499                 default:
1500                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1501                                 save = preg->reginput;
1502                                 if (regmatch(preg, next)) {
1503                                         if (OP(preg, scan) < CLOSE) {
1504                                                 int no = OP(preg, scan) - OPEN;
1505                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1506                                                         preg->pmatch[no].rm_so = save - preg->start;
1507                                                 }
1508                                         }
1509                                         else {
1510                                                 int no = OP(preg, scan) - CLOSE;
1511                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1512                                                         preg->pmatch[no].rm_eo = save - preg->start;
1513                                                 }
1514                                         }
1515                                         return(1);
1516                                 }
1517                                 return(0);
1518                         }
1519                         return REG_ERR_INTERNAL;
1520                 }
1521
1522                 scan = next;
1523         }
1524
1525         /*
1526          * We get here only if there's trouble -- normally "case END" is
1527          * the terminating point.
1528          */
1529         return REG_ERR_INTERNAL;
1530 }
1531
1532 /*
1533  - regrepeat - repeatedly match something simple, report how many
1534  */
1535 static int regrepeat(regex_t *preg, int p, int max)
1536 {
1537         int count = 0;
1538         const char *scan;
1539         int opnd;
1540         int ch;
1541         int n;
1542
1543         scan = preg->reginput;
1544         opnd = OPERAND(p);
1545         switch (OP(preg, p)) {
1546         case ANY:
1547                 /* No need to handle utf8 specially here */
1548                 while (!reg_iseol(preg, *scan) && count < max) {
1549                         count++;
1550                         scan++;
1551                 }
1552                 break;
1553         case EXACTLY:
1554                 while (count < max) {
1555                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1556                         if (preg->program[opnd] != ch) {
1557                                 break;
1558                         }
1559                         count++;
1560                         scan += n;
1561                 }
1562                 break;
1563         case ANYOF:
1564                 while (count < max) {
1565                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1566                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1567                                 break;
1568                         }
1569                         count++;
1570                         scan += n;
1571                 }
1572                 break;
1573         case ANYBUT:
1574                 while (count < max) {
1575                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1576                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1577                                 break;
1578                         }
1579                         count++;
1580                         scan += n;
1581                 }
1582                 break;
1583         default:                /* Oh dear.  Called inappropriately. */
1584                 preg->err = REG_ERR_INTERNAL;
1585                 count = 0;      /* Best compromise. */
1586                 break;
1587         }
1588         preg->reginput = scan;
1589
1590         return(count);
1591 }
1592
1593 /*
1594  - regnext - dig the "next" pointer out of a node
1595  */
1596 static int regnext(regex_t *preg, int p )
1597 {
1598         int offset;
1599
1600         offset = NEXT(preg, p);
1601
1602         if (offset == 0)
1603                 return 0;
1604
1605         if (OP(preg, p) == BACK)
1606                 return(p-offset);
1607         else
1608                 return(p+offset);
1609 }
1610
1611 /*
1612  - regopsize - returns the size of opcode + operands at 'p' in words
1613  */
1614 static int regopsize(regex_t *preg, int p )
1615 {
1616         /* Almost all opcodes are 2 words, but some are more */
1617         switch (OP(preg, p)) {
1618                 case REP:
1619                 case REPMIN:
1620                 case REPX:
1621                 case REPXMIN:
1622                         return 5;
1623
1624                 case ANYOF:
1625                 case ANYBUT:
1626                 case EXACTLY: {
1627                         int s = p + 2;
1628                         while (preg->program[s++]) {
1629                         }
1630                         return s - p;
1631                 }
1632         }
1633         return 2;
1634 }
1635
1636 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1637
1638 /*
1639  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1640  */
1641 static void regdump(regex_t *preg)
1642 {
1643         int s;
1644         int op = EXACTLY;       /* Arbitrary non-END op. */
1645         int next;
1646         char buf[MAX_UTF8_LEN + 1];
1647
1648         int i;
1649         for (i = 1; i < preg->p; i++) {
1650                 printf("%02x ", (unsigned char)preg->program[i]);
1651                 if (i % 16 == 0) {
1652                         printf("\n");
1653                 }
1654         }
1655         printf("\n");
1656
1657         s = 1;
1658         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1659                 op = OP(preg, s);
1660                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1661                 next = regnext(preg, s);
1662                 if (next == 0)          /* Next ptr. */
1663                         printf("(0)");
1664                 else
1665                         printf("(%d)", next);
1666                 s += 2;
1667                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1668                         int max = preg->program[s];
1669                         int min = preg->program[s + 1];
1670                         if (max == 65535) {
1671                                 printf("{%d,*}", min);
1672                         }
1673                         else {
1674                                 printf("{%d,%d}", min, max);
1675                         }
1676                         printf(" %d", preg->program[s + 2]);
1677                         s += 3;
1678                 }
1679                 else if (op == ANYOF || op == ANYBUT) {
1680                         /* set of ranges */
1681
1682                         while (preg->program[s]) {
1683                                 int len = preg->program[s++];
1684                                 int first = preg->program[s++];
1685                                 buf[utf8_getchars(buf, first)] = 0;
1686                                 printf("%s", buf);
1687                                 if (len > 1) {
1688                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1689                                         printf("-%s", buf);
1690                                 }
1691                         }
1692                         s++;
1693                 }
1694                 else if (op == EXACTLY) {
1695                         /* Literal string, where present. */
1696
1697                         while (preg->program[s]) {
1698                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1699                                 printf("%s", buf);
1700                                 s++;
1701                         }
1702                         s++;
1703                 }
1704                 putchar('\n');
1705         }
1706
1707         if (op == END) {
1708                 /* Header fields of interest. */
1709                 if (preg->regstart) {
1710                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1711                         printf("start '%s' ", buf);
1712                 }
1713                 if (preg->reganch)
1714                         printf("anchored ");
1715                 if (preg->regmust != 0) {
1716                         int i;
1717                         printf("must have:");
1718                         for (i = 0; i < preg->regmlen; i++) {
1719                                 putchar(preg->program[preg->regmust + i]);
1720                         }
1721                         putchar('\n');
1722                 }
1723         }
1724         printf("\n");
1725 }
1726
1727 /*
1728  - regprop - printable representation of opcode
1729  */
1730 static const char *regprop( int op )
1731 {
1732         static char buf[50];
1733
1734         switch (op) {
1735         case BOL:
1736                 return "BOL";
1737         case EOL:
1738                 return "EOL";
1739         case BOLX:
1740                 return "BOLX";
1741         case EOLX:
1742                 return "EOLX";
1743         case ANY:
1744                 return "ANY";
1745         case ANYOF:
1746                 return "ANYOF";
1747         case ANYBUT:
1748                 return "ANYBUT";
1749         case BRANCH:
1750                 return "BRANCH";
1751         case EXACTLY:
1752                 return "EXACTLY";
1753         case NOTHING:
1754                 return "NOTHING";
1755         case BACK:
1756                 return "BACK";
1757         case END:
1758                 return "END";
1759         case REP:
1760                 return "REP";
1761         case REPMIN:
1762                 return "REPMIN";
1763         case REPX:
1764                 return "REPX";
1765         case REPXMIN:
1766                 return "REPXMIN";
1767         case WORDA:
1768                 return "WORDA";
1769         case WORDZ:
1770                 return "WORDZ";
1771         case OPENNC:
1772                 return "OPEN";
1773         case CLOSENC:
1774                 return "CLOSE";
1775         default:
1776                 if (op >= OPEN && op < CLOSE) {
1777                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1778                 }
1779                 else if (op >= CLOSE && op < CLOSE_END) {
1780                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1781                 }
1782                 else {
1783                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1784                 }
1785                 return(buf);
1786         }
1787 }
1788 #endif /* JIM_BOOTSTRAP */
1789
1790 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1791 {
1792         static const char *error_strings[] = {
1793                 "success",
1794                 "no match",
1795                 "bad pattern",
1796                 "null argument",
1797                 "unknown error",
1798                 "too big",
1799                 "out of memory",
1800                 "too many ()",
1801                 "parentheses () not balanced",
1802                 "braces {} not balanced",
1803                 "invalid repetition count(s)",
1804                 "extra characters",
1805                 "*+ of empty atom",
1806                 "nested count",
1807                 "internal error",
1808                 "count follows nothing",
1809                 "trailing backslash",
1810                 "corrupted program",
1811                 "contains null char",
1812         };
1813         const char *err;
1814
1815         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1816                 err = "Bad error code";
1817         }
1818         else {
1819                 err = error_strings[errcode];
1820         }
1821
1822         return snprintf(errbuf, errbuf_size, "%s", err);
1823 }
1824
1825 void regfree(regex_t *preg)
1826 {
1827         free(preg->program);
1828 }
1829
1830 #endif