jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101 #define BOLX    14      /* no   Match "" at beginning of input. */
 102 #define EOLX    15      /* no   Match "" at end of input. */
 103 #define WORDA   16      /* no   Match "" at wordchar, where prev is nonword */
 104 #define WORDZ   17      /* no   Match "" at nonwordchar, where prev is word */
 105
 106 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 107 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 108                         /*      OPEN+1 is number 1, etc. */
 109
 110 /* must not be any other opts between OPEN and CLOSE */
 111
 112 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 113 #define CLOSE   2001    /* no   Analogous to OPEN. */
 114 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 115
 116 /*
 117  * The first word of the regexp internal "program" is actually this magic
 118  * number; the start node begins in the second word.
 119  */
 120 #define REG_MAGIC       0xFADED00D
 121
 122 /*
 123  * Opcode notes:
 124  *
 125  * BRANCH       The set of branches constituting a single choice are hooked
 126  *              together with their "next" pointers, since precedence prevents
 127  *              anything being concatenated to any individual branch.  The
 128  *              "next" pointer of the last BRANCH in a choice points to the
 129  *              thing following the whole choice.  This is also where the
 130  *              final "next" pointer of each individual branch points; each
 131  *              branch starts with the operand node of a BRANCH node.
 132  *
 133  * BACK         Normal "next" pointers all implicitly point forward; BACK
 134  *              exists to make loop structures possible.
 135  *
 136  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 137  *              as either simple repeats (REP) or complex repeats (REPX).
 138  *              These opcodes include a "min" and "max" count after the opcode.
 139  *              This is followed by a fourth "current count" word that is
 140  *              only used by REPX, as it implements a recursive match.
 141  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 142  *
 143  * OPEN,CLOSE   ...are numbered at compile time.
 144  */
 145
 146 /*
 147  * A node is one word of opcode followed by one word of "next" pointer.
 148  * The "next" pointer value is a positive offset from the opcode of the node
 149  * containing it.
 150  * An operand, if any, simply follows the node.  (Note that much of the
 151  * code generation knows about this implicit relationship.)
 152  */
 153 #define OP(preg, p)     (preg->program[p])
 154 #define NEXT(preg, p)   (preg->program[p + 1])
 155 #define OPERAND(p)      ((p) + 2)
 156
 157 /*
 158  * See regmagic.h for one further detail of program structure.
 159  */
 160
 161
 162 /*
 163  * Utility definitions.
 164  */
 165
 166 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 167 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 168 #define META            "^$.[()|?{+*"
 169
 170 /*
 171  * Flags to be passed up and down.
 172  */
 173 #define HASWIDTH        1       /* Known never to match null string. */
 174 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 175 #define SPSTART         4       /* Starts with * or +. */
 176 #define WORST           0       /* Worst case. */
 177
 178 #define MAX_REP_COUNT 1000000
 179
 180 /*
 181  * Forward declarations for regcomp()'s friends.
 182  */
 183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 184 static int regpiece(regex_t *preg, int *flagp );
 185 static int regbranch(regex_t *preg, int *flagp );
 186 static int regatom(regex_t *preg, int *flagp );
 187 static int regnode(regex_t *preg, int op );
 188 static int regnext(regex_t *preg, int p );
 189 static void regc(regex_t *preg, int b );
 190 static int reginsert(regex_t *preg, int op, int size, int opnd );
 191 static void regtail(regex_t *preg, int p, int val);
 192 static void regoptail(regex_t *preg, int p, int val );
 193 static int regopsize(regex_t *preg, int p );
 194
 195 static int reg_range_find(const int *string, int c);
 196 static const char *str_find(const char *string, int c, int nocase);
 197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 198
 199 /*#define DEBUG*/
 200 #ifdef DEBUG
 201 static int regnarrate = 0;
 202 static void regdump(regex_t *preg);
 203 static const char *regprop( int op );
 204 #endif
 205
 206
 207 /**
 208  * Returns the length of the null-terminated integer sequence.
 209  */
 210 static int str_int_len(const int *seq)
 211 {
 212         int n = 0;
 213         while (*seq++) {
 214                 n++;
 215         }
 216         return n;
 217 }
 218
 219 /*
 220  - regcomp - compile a regular expression into internal code
 221  *
 222  * We can't allocate space until we know how big the compiled form will be,
 223  * but we can't compile it (and thus know how big it is) until we've got a
 224  * place to put the code.  So we cheat:  we compile it twice, once with code
 225  * generation turned off and size counting turned on, and once "for real".
 226  * This also means that we don't allocate space until we are sure that the
 227  * thing really will compile successfully, and we never have to move the
 228  * code and thus invalidate pointers into it.  (Note that it has to be in
 229  * one piece because free() must be able to free it all.)
 230  *
 231  * Beware that the optimization-preparation code in here knows about some
 232  * of the structure of the compiled regexp.
 233  */
 234 int regcomp(regex_t *preg, const char *exp, int cflags)
 235 {
 236         int scan;
 237         int longest;
 238         unsigned len;
 239         int flags;
 240
 241 #ifdef DEBUG
 242         fprintf(stderr, "Compiling: '%s'\n", exp);
 243 #endif
 244         memset(preg, 0, sizeof(*preg));
 245
 246         if (exp == NULL)
 247                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 248
 249         /* First pass: determine size, legality. */
 250         preg->cflags = cflags;
 251         preg->regparse = exp;
 252
 253         /* Allocate space. */
 254         preg->proglen = (strlen(exp) + 1) * 5;
 255         preg->program = malloc(preg->proglen * sizeof(int));
 256         if (preg->program == NULL)
 257                 FAIL(preg, REG_ERR_NOMEM);
 258
 259         /* Note that since we store a magic value as the first item in the program,
 260          * program offsets will never be 0
 261          */
 262         regc(preg, REG_MAGIC);
 263         if (reg(preg, 0, &flags) == 0) {
 264                 return preg->err;
 265         }
 266
 267         /* Small enough for pointer-storage convention? */
 268         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 269                 FAIL(preg,REG_ERR_TOO_BIG);
 270
 271         /* Dig out information for optimizations. */
 272         preg->regstart = 0;     /* Worst-case defaults. */
 273         preg->reganch = 0;
 274         preg->regmust = 0;
 275         preg->regmlen = 0;
 276         scan = 1;                       /* First BRANCH. */
 277         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 278                 scan = OPERAND(scan);
 279
 280                 /* Starting-point info. */
 281                 if (OP(preg, scan) == EXACTLY) {
 282                         preg->regstart = preg->program[OPERAND(scan)];
 283                 }
 284                 else if (OP(preg, scan) == BOL)
 285                         preg->reganch++;
 286
 287                 /*
 288                  * If there's something expensive in the r.e., find the
 289                  * longest literal string that must appear and make it the
 290                  * regmust.  Resolve ties in favor of later strings, since
 291                  * the regstart check works with the beginning of the r.e.
 292                  * and avoiding duplication strengthens checking.  Not a
 293                  * strong reason, but sufficient in the absence of others.
 294                  */
 295                 if (flags&SPSTART) {
 296                         longest = 0;
 297                         len = 0;
 298                         for (; scan != 0; scan = regnext(preg, scan)) {
 299                                 if (OP(preg, scan) == EXACTLY) {
 300                                         int plen = str_int_len(preg->program + OPERAND(scan));
 301                                         if (plen >= len) {
 302                                                 longest = OPERAND(scan);
 303                                                 len = plen;
 304                                         }
 305                                 }
 306                         }
 307                         preg->regmust = longest;
 308                         preg->regmlen = len;
 309                 }
 310         }
 311
 312 #ifdef DEBUG
 313         regdump(preg);
 314 #endif
 315
 316         return 0;
 317 }
 318
 319 /*
 320  - reg - regular expression, i.e. main body or parenthesized thing
 321  *
 322  * Caller must absorb opening parenthesis.
 323  *
 324  * Combining parenthesis handling with the base level of regular expression
 325  * is a trifle forced, but the need to tie the tails of the branches to what
 326  * follows makes it hard to avoid.
 327  */
 328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 329 {
 330         int ret;
 331         int br;
 332         int ender;
 333         int parno = 0;
 334         int flags;
 335
 336         *flagp = HASWIDTH;      /* Tentatively. */
 337
 338         /* Make an OPEN node, if parenthesized. */
 339         if (paren) {
 340                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 341                         /* non-capturing paren */
 342                         preg->regparse += 2;
 343                         parno = -1;
 344                 }
 345                 else {
 346                         parno = ++preg->re_nsub;
 347                 }
 348                 ret = regnode(preg, OPEN+parno);
 349         } else
 350                 ret = 0;
 351
 352         /* Pick up the branches, linking them together. */
 353         br = regbranch(preg, &flags);
 354         if (br == 0)
 355                 return 0;
 356         if (ret != 0)
 357                 regtail(preg, ret, br); /* OPEN -> first. */
 358         else
 359                 ret = br;
 360         if (!(flags&HASWIDTH))
 361                 *flagp &= ~HASWIDTH;
 362         *flagp |= flags&SPSTART;
 363         while (*preg->regparse == '|') {
 364                 preg->regparse++;
 365                 br = regbranch(preg, &flags);
 366                 if (br == 0)
 367                         return 0;
 368                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 369                 if (!(flags&HASWIDTH))
 370                         *flagp &= ~HASWIDTH;
 371                 *flagp |= flags&SPSTART;
 372         }
 373
 374         /* Make a closing node, and hook it on the end. */
 375         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 376         regtail(preg, ret, ender);
 377
 378         /* Hook the tails of the branches to the closing node. */
 379         for (br = ret; br != 0; br = regnext(preg, br))
 380                 regoptail(preg, br, ender);
 381
 382         /* Check for proper termination. */
 383         if (paren && *preg->regparse++ != ')') {
 384                 preg->err = REG_ERR_UNMATCHED_PAREN;
 385                 return 0;
 386         } else if (!paren && *preg->regparse != '\0') {
 387                 if (*preg->regparse == ')') {
 388                         preg->err = REG_ERR_UNMATCHED_PAREN;
 389                         return 0;
 390                 } else {
 391                         preg->err = REG_ERR_JUNK_ON_END;
 392                         return 0;
 393                 }
 394         }
 395
 396         return(ret);
 397 }
 398
 399 /*
 400  - regbranch - one alternative of an | operator
 401  *
 402  * Implements the concatenation operator.
 403  */
 404 static int regbranch(regex_t *preg, int *flagp )
 405 {
 406         int ret;
 407         int chain;
 408         int latest;
 409         int flags;
 410
 411         *flagp = WORST;         /* Tentatively. */
 412
 413         ret = regnode(preg, BRANCH);
 414         chain = 0;
 415         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 416                *preg->regparse != '|') {
 417                 latest = regpiece(preg, &flags);
 418                 if (latest == 0)
 419                         return 0;
 420                 *flagp |= flags&HASWIDTH;
 421                 if (chain == 0) {/* First piece. */
 422                         *flagp |= flags&SPSTART;
 423                 }
 424                 else {
 425                         regtail(preg, chain, latest);
 426                 }
 427                 chain = latest;
 428         }
 429         if (chain == 0) /* Loop ran zero times. */
 430                 (void) regnode(preg, NOTHING);
 431
 432         return(ret);
 433 }
 434
 435 /*
 436  - regpiece - something followed by possible [*+?]
 437  *
 438  * Note that the branching code sequences used for ? and the general cases
 439  * of * and + are somewhat optimized:  they use the same NOTHING node as
 440  * both the endmarker for their branch list and the body of the last branch.
 441  * It might seem that this node could be dispensed with entirely, but the
 442  * endmarker role is not redundant.
 443  */
 444 static int regpiece(regex_t *preg, int *flagp)
 445 {
 446         int ret;
 447         char op;
 448         int next;
 449         int flags;
 450         int min;
 451         int max;
 452
 453         ret = regatom(preg, &flags);
 454         if (ret == 0)
 455                 return 0;
 456
 457         op = *preg->regparse;
 458         if (!ISMULT(op)) {
 459                 *flagp = flags;
 460                 return(ret);
 461         }
 462
 463         if (!(flags&HASWIDTH) && op != '?') {
 464                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 465                 return 0;
 466         }
 467
 468         /* Handle braces (counted repetition) by expansion */
 469         if (op == '{') {
 470                 char *end;
 471
 472                 min = strtoul(preg->regparse + 1, &end, 10);
 473                 if (end == preg->regparse + 1) {
 474                         preg->err = REG_ERR_BAD_COUNT;
 475                         return 0;
 476                 }
 477                 if (*end == '}') {
 478                         max = min;
 479                 }
 480                 else {
 481                         preg->regparse = end;
 482                         max = strtoul(preg->regparse + 1, &end, 10);
 483                         if (*end != '}') {
 484                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 485                                 return 0;
 486                         }
 487                 }
 488                 if (end == preg->regparse + 1) {
 489                         max = MAX_REP_COUNT;
 490                 }
 491                 else if (max < min || max >= 100) {
 492                         preg->err = REG_ERR_BAD_COUNT;
 493                         return 0;
 494                 }
 495                 if (min >= 100) {
 496                         preg->err = REG_ERR_BAD_COUNT;
 497                         return 0;
 498                 }
 499
 500                 preg->regparse = strchr(preg->regparse, '}');
 501         }
 502         else {
 503                 min = (op == '+');
 504                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 505         }
 506
 507         if (preg->regparse[1] == '?') {
 508                 preg->regparse++;
 509                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 510         }
 511         else {
 512                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 513         }
 514         preg->program[ret + 2] = max;
 515         preg->program[ret + 3] = min;
 516         preg->program[ret + 4] = 0;
 517
 518         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 519
 520         if (!(flags & SIMPLE)) {
 521                 int back = regnode(preg, BACK);
 522                 regtail(preg, back, ret);
 523                 regtail(preg, next, back);
 524         }
 525
 526         preg->regparse++;
 527         if (ISMULT(*preg->regparse)) {
 528                 preg->err = REG_ERR_NESTED_COUNT;
 529                 return 0;
 530         }
 531
 532         return ret;
 533 }
 534
 535 /**
 536  * Add all characters in the inclusive range between lower and upper.
 537  *
 538  * Handles a swapped range (upper < lower).
 539  */
 540 static void reg_addrange(regex_t *preg, int lower, int upper)
 541 {
 542         if (lower > upper) {
 543                 reg_addrange(preg, upper, lower);
 544         }
 545         /* Add a range as length, start */
 546         regc(preg, upper - lower + 1);
 547         regc(preg, lower);
 548 }
 549
 550 /**
 551  * Add a null-terminated literal string as a set of ranges.
 552  */
 553 static void reg_addrange_str(regex_t *preg, const char *str)
 554 {
 555         while (*str) {
 556                 reg_addrange(preg, *str, *str);
 557                 str++;
 558         }
 559 }
 560
 561 /**
 562  * Extracts the next unicode char from utf8.
 563  *
 564  * If 'upper' is set, converts the char to uppercase.
 565  */
 566 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 567 {
 568         int l = utf8_tounicode(s, uc);
 569         if (upper) {
 570                 *uc = utf8_upper(*uc);
 571         }
 572         return l;
 573 }
 574
 575 /**
 576  * Converts a hex digit to decimal.
 577  *
 578  * Returns -1 for an invalid hex digit.
 579  */
 580 static int hexdigitval(int c)
 581 {
 582         if (c >= '0' && c <= '9')
 583                 return c - '0';
 584         if (c >= 'a' && c <= 'f')
 585                 return c - 'a' + 10;
 586         if (c >= 'A' && c <= 'F')
 587                 return c - 'A' + 10;
 588         return -1;
 589 }
 590
 591 /**
 592  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 593  *
 594  * Returns the number of hex digits parsed.
 595  * If there are no hex digits, returns 0 and stores nothing.
 596  */
 597 static int parse_hex(const char *s, int n, int *uc)
 598 {
 599         int val = 0;
 600         int k;
 601
 602         for (k = 0; k < n; k++) {
 603                 int c = hexdigitval(*s++);
 604                 if (c == -1) {
 605                         break;
 606                 }
 607                 val = (val << 4) | c;
 608         }
 609         if (k) {
 610                 *uc = val;
 611         }
 612         return k;
 613 }
 614
 615 /**
 616  * Call for chars after a backlash to decode the escape sequence.
 617  *
 618  * Stores the result in *ch.
 619  *
 620  * Returns the number of bytes consumed.
 621  */
 622 static int reg_decode_escape(const char *s, int *ch)
 623 {
 624         int n;
 625         const char *s0 = s;
 626
 627         *ch = *s++;
 628
 629         switch (*ch) {
 630                 case 'b': *ch = '\b'; break;
 631                 case 'e': *ch = 27; break;
 632                 case 'f': *ch = '\f'; break;
 633                 case 'n': *ch = '\n'; break;
 634                 case 'r': *ch = '\r'; break;
 635                 case 't': *ch = '\t'; break;
 636                 case 'v': *ch = '\v'; break;
 637                 case 'u':
 638                         if (*s == '{') {
 639                                 /* Expect \u{NNNN} */
 640                                 n = parse_hex(s + 1, 6, ch);
 641                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 642                                         s += n + 2;
 643                                 }
 644                                 else {
 645                                         /* Invalid, so just treat as an escaped 'u' */
 646                                         *ch = 'u';
 647                                 }
 648                         }
 649                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 650                                 s += n;
 651                         }
 652                         break;
 653                 case 'U':
 654                         if ((n = parse_hex(s, 8, ch)) > 0) {
 655                                 s += n;
 656                         }
 657                         break;
 658                 case 'x':
 659                         if ((n = parse_hex(s, 2, ch)) > 0) {
 660                                 s += n;
 661                         }
 662                         break;
 663                 case '\0':
 664                         s--;
 665                         *ch = '\\';
 666                         break;
 667         }
 668         return s - s0;
 669 }
 670
 671 /*
 672  - regatom - the lowest level
 673  *
 674  * Optimization:  gobbles an entire sequence of ordinary characters so that
 675  * it can turn them into a single node, which is smaller to store and
 676  * faster to run.  Backslashed characters are exceptions, each becoming a
 677  * separate node; the code is simpler that way and it's not worth fixing.
 678  */
 679 static int regatom(regex_t *preg, int *flagp)
 680 {
 681         int ret;
 682         int flags;
 683         int nocase = (preg->cflags & REG_ICASE);
 684
 685         int ch;
 686         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 687
 688         *flagp = WORST;         /* Tentatively. */
 689
 690         preg->regparse += n;
 691         switch (ch) {
 692         /* FIXME: these chars only have meaning at beg/end of pat? */
 693         case '^':
 694                 ret = regnode(preg, BOL);
 695                 break;
 696         case '$':
 697                 ret = regnode(preg, EOL);
 698                 break;
 699         case '.':
 700                 ret = regnode(preg, ANY);
 701                 *flagp |= HASWIDTH|SIMPLE;
 702                 break;
 703         case '[': {
 704                         const char *pattern = preg->regparse;
 705
 706                         if (*pattern == '^') {  /* Complement of range. */
 707                                 ret = regnode(preg, ANYBUT);
 708                                 pattern++;
 709                         } else
 710                                 ret = regnode(preg, ANYOF);
 711
 712                         /* Special case. If the first char is ']' or '-', it is part of the set */
 713                         if (*pattern == ']' || *pattern == '-') {
 714                                 reg_addrange(preg, *pattern, *pattern);
 715                                 pattern++;
 716                         }
 717
 718                         while (*pattern && *pattern != ']') {
 719                                 /* Is this a range? a-z */
 720                                 int start;
 721                                 int end;
 722
 723                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 724                                 if (start == '\\') {
 725                                         pattern += reg_decode_escape(pattern, &start);
 726                                         if (start == 0) {
 727                                                 preg->err = REG_ERR_NULL_CHAR;
 728                                                 return 0;
 729                                         }
 730                                 }
 731                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 732                                         /* skip '-' */
 733                                         pattern += utf8_tounicode(pattern, &end);
 734                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 735                                         if (end == '\\') {
 736                                                 pattern += reg_decode_escape(pattern, &end);
 737                                                 if (end == 0) {
 738                                                         preg->err = REG_ERR_NULL_CHAR;
 739                                                         return 0;
 740                                                 }
 741                                         }
 742
 743                                         reg_addrange(preg, start, end);
 744                                         continue;
 745                                 }
 746                                 if (start == '[' && pattern[0] == ':') {
 747                                         static const char *character_class[] = {
 748                                                 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
 749                                                 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
 750                                         };
 751                                         enum {
 752                                                 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
 753                                                 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
 754                                                 CC_NUM
 755                                         };
 756                                         int i;
 757
 758                                         for (i = 0; i < CC_NUM; i++) {
 759                                                 int n = strlen(character_class[i]);
 760                                                 if (strncmp(pattern, character_class[i], n) == 0) {
 761                                                         /* Found a character class */
 762                                                         pattern += n + 1;
 763                                                         break;
 764                                                 }
 765                                         }
 766                                         if (i != CC_NUM) {
 767                                                 switch (i) {
 768                                                         case CC_ALNUM:
 769                                                                 reg_addrange(preg, '0', '9');
 770                                                                 /* Fall through */
 771                                                         case CC_ALPHA:
 772                                                                 if ((preg->cflags & REG_ICASE) == 0) {
 773                                                                         reg_addrange(preg, 'a', 'z');
 774                                                                 }
 775                                                                 reg_addrange(preg, 'A', 'Z');
 776                                                                 break;
 777                                                         case CC_SPACE:
 778                                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 779                                                                 break;
 780                                                         case CC_BLANK:
 781                                                                 reg_addrange_str(preg, " \t");
 782                                                                 break;
 783                                                         case CC_UPPER:
 784                                                                 reg_addrange(preg, 'A', 'Z');
 785                                                                 break;
 786                                                         case CC_LOWER:
 787                                                                 reg_addrange(preg, 'a', 'z');
 788                                                                 break;
 789                                                         case CC_XDIGIT:
 790                                                                 reg_addrange(preg, 'a', 'f');
 791                                                                 reg_addrange(preg, 'A', 'F');
 792                                                                 /* Fall through */
 793                                                         case CC_DIGIT:
 794                                                                 reg_addrange(preg, '0', '9');
 795                                                                 break;
 796                                                         case CC_CNTRL:
 797                                                                 reg_addrange(preg, 0, 31);
 798                                                                 reg_addrange(preg, 127, 127);
 799                                                                 break;
 800                                                         case CC_PRINT:
 801                                                                 reg_addrange(preg, ' ', '~');
 802                                                                 break;
 803                                                         case CC_GRAPH:
 804                                                                 reg_addrange(preg, '!', '~');
 805                                                                 break;
 806                                                         case CC_PUNCT:
 807                                                                 reg_addrange(preg, '!', '/');
 808                                                                 reg_addrange(preg, ':', '@');
 809                                                                 reg_addrange(preg, '[', '`');
 810                                                                 reg_addrange(preg, '{', '~');
 811                                                                 break;
 812                                                 }
 813                                                 continue;
 814                                         }
 815                                 }
 816                                 /* Not a range, so just add the char */
 817                                 reg_addrange(preg, start, start);
 818                         }
 819                         regc(preg, '\0');
 820
 821                         if (*pattern) {
 822                                 pattern++;
 823                         }
 824                         preg->regparse = pattern;
 825
 826                         *flagp |= HASWIDTH|SIMPLE;
 827                 }
 828                 break;
 829         case '(':
 830                 ret = reg(preg, 1, &flags);
 831                 if (ret == 0)
 832                         return 0;
 833                 *flagp |= flags&(HASWIDTH|SPSTART);
 834                 break;
 835         case '\0':
 836         case '|':
 837         case ')':
 838                 preg->err = REG_ERR_INTERNAL;
 839                 return 0;       /* Supposed to be caught earlier. */
 840         case '?':
 841         case '+':
 842         case '*':
 843         case '{':
 844                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 845                 return 0;
 846         case '\\':
 847                 ch = *preg->regparse++;
 848                 switch (ch) {
 849                 case '\0':
 850                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 851                         return 0;
 852                 case 'A':
 853                         ret = regnode(preg, BOLX);
 854                         break;
 855                 case 'Z':
 856                         ret = regnode(preg, EOLX);
 857                         break;
 858                 case '<':
 859                 case 'm':
 860                         ret = regnode(preg, WORDA);
 861                         break;
 862                 case '>':
 863                 case 'M':
 864                         ret = regnode(preg, WORDZ);
 865                         break;
 866                 case 'd':
 867                 case 'D':
 868                         ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
 869                         reg_addrange(preg, '0', '9');
 870                         regc(preg, '\0');
 871                         *flagp |= HASWIDTH|SIMPLE;
 872                         break;
 873                 case 'w':
 874                 case 'W':
 875                         ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
 876                         if ((preg->cflags & REG_ICASE) == 0) {
 877                                 reg_addrange(preg, 'a', 'z');
 878                         }
 879                         reg_addrange(preg, 'A', 'Z');
 880                         reg_addrange(preg, '0', '9');
 881                         reg_addrange(preg, '_', '_');
 882                         regc(preg, '\0');
 883                         *flagp |= HASWIDTH|SIMPLE;
 884                         break;
 885                 case 's':
 886                 case 'S':
 887                         ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
 888                         reg_addrange_str(preg," \t\r\n\f\v");
 889                         regc(preg, '\0');
 890                         *flagp |= HASWIDTH|SIMPLE;
 891                         break;
 892                 /* FIXME: Someday handle \1, \2, ... */
 893                 default:
 894                         /* Handle general quoted chars in exact-match routine */
 895                         /* Back up to include the backslash */
 896                         preg->regparse--;
 897                         goto de_fault;
 898                 }
 899                 break;
 900         de_fault:
 901         default: {
 902                         /*
 903                          * Encode a string of characters to be matched exactly.
 904                          */
 905                         int added = 0;
 906
 907                         /* Back up to pick up the first char of interest */
 908                         preg->regparse -= n;
 909
 910                         ret = regnode(preg, EXACTLY);
 911
 912                         /* Note that a META operator such as ? or * consumes the
 913                          * preceding char.
 914                          * Thus we must be careful to look ahead by 2 and add the
 915                          * last char as it's own EXACTLY if necessary
 916                          */
 917
 918                         /* Until end of string or a META char is reached */
 919                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 920                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 921                                 if (ch == '\\' && preg->regparse[n]) {
 922                                         /* Non-trailing backslash.
 923                                          * Is this a special escape, or a regular escape?
 924                                          */
 925                                         if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
 926                                                 /* A special escape. All done with EXACTLY */
 927                                                 break;
 928                                         }
 929                                         /* Decode it. Note that we add the length for the escape
 930                                          * sequence to the length for the backlash so we can skip
 931                                          * the entire sequence, or not as required.
 932                                          */
 933                                         n += reg_decode_escape(preg->regparse + n, &ch);
 934                                         if (ch == 0) {
 935                                                 preg->err = REG_ERR_NULL_CHAR;
 936                                                 return 0;
 937                                         }
 938                                 }
 939
 940                                 /* Now we have one char 'ch' of length 'n'.
 941                                  * Check to see if the following char is a MULT
 942                                  */
 943
 944                                 if (ISMULT(preg->regparse[n])) {
 945                                         /* Yes. But do we already have some EXACTLY chars? */
 946                                         if (added) {
 947                                                 /* Yes, so return what we have and pick up the current char next time around */
 948                                                 break;
 949                                         }
 950                                         /* No, so add this single char and finish */
 951                                         regc(preg, ch);
 952                                         added++;
 953                                         preg->regparse += n;
 954                                         break;
 955                                 }
 956
 957                                 /* No, so just add this char normally */
 958                                 regc(preg, ch);
 959                                 added++;
 960                                 preg->regparse += n;
 961                         }
 962                         regc(preg, '\0');
 963
 964                         *flagp |= HASWIDTH;
 965                         if (added == 1)
 966                                 *flagp |= SIMPLE;
 967                         break;
 968                 }
 969                 break;
 970         }
 971
 972         return(ret);
 973 }
 974
 975 static void reg_grow(regex_t *preg, int n)
 976 {
 977         if (preg->p + n >= preg->proglen) {
 978                 preg->proglen = (preg->p + n) * 2;
 979                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 980         }
 981 }
 982
 983 /*
 984  - regnode - emit a node
 985  */
 986 /* Location. */
 987 static int regnode(regex_t *preg, int op)
 988 {
 989         reg_grow(preg, 2);
 990
 991         /* The OP followed by a next pointer */
 992         preg->program[preg->p++] = op;
 993         preg->program[preg->p++] = 0;
 994
 995         /* Return the start of the node */
 996         return preg->p - 2;
 997 }
 998
 999 /*
1000  - regc - emit (if appropriate) a byte of code
1001  */
1002 static void regc(regex_t *preg, int b )
1003 {
1004         reg_grow(preg, 1);
1005         preg->program[preg->p++] = b;
1006 }
1007
1008 /*
1009  - reginsert - insert an operator in front of already-emitted operand
1010  *
1011  * Means relocating the operand.
1012  * Returns the new location of the original operand.
1013  */
1014 static int reginsert(regex_t *preg, int op, int size, int opnd )
1015 {
1016         reg_grow(preg, size);
1017
1018         /* Move everything from opnd up */
1019         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1020         /* Zero out the new space */
1021         memset(preg->program + opnd, 0, sizeof(int) * size);
1022
1023         preg->program[opnd] = op;
1024
1025         preg->p += size;
1026
1027         return opnd + size;
1028 }
1029
1030 /*
1031  - regtail - set the next-pointer at the end of a node chain
1032  */
1033 static void regtail(regex_t *preg, int p, int val)
1034 {
1035         int scan;
1036         int temp;
1037         int offset;
1038
1039         /* Find last node. */
1040         scan = p;
1041         for (;;) {
1042                 temp = regnext(preg, scan);
1043                 if (temp == 0)
1044                         break;
1045                 scan = temp;
1046         }
1047
1048         if (OP(preg, scan) == BACK)
1049                 offset = scan - val;
1050         else
1051                 offset = val - scan;
1052
1053         preg->program[scan + 1] = offset;
1054 }
1055
1056 /*
1057  - regoptail - regtail on operand of first argument; nop if operandless
1058  */
1059
1060 static void regoptail(regex_t *preg, int p, int val )
1061 {
1062         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1063         if (p != 0 && OP(preg, p) == BRANCH) {
1064                 regtail(preg, OPERAND(p), val);
1065         }
1066 }
1067
1068 /*
1069  * regexec and friends
1070  */
1071
1072 /*
1073  * Forwards.
1074  */
1075 static int regtry(regex_t *preg, const char *string );
1076 static int regmatch(regex_t *preg, int prog);
1077 static int regrepeat(regex_t *preg, int p, int max);
1078
1079 /*
1080  - regexec - match a regexp against a string
1081  */
1082 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1083 {
1084         const char *s;
1085         int scan;
1086
1087         /* Be paranoid... */
1088         if (preg == NULL || preg->program == NULL || string == NULL) {
1089                 return REG_ERR_NULL_ARGUMENT;
1090         }
1091
1092         /* Check validity of program. */
1093         if (*preg->program != REG_MAGIC) {
1094                 return REG_ERR_CORRUPTED;
1095         }
1096
1097 #ifdef DEBUG
1098         fprintf(stderr, "regexec: %s\n", string);
1099         regdump(preg);
1100 #endif
1101
1102         preg->eflags = eflags;
1103         preg->pmatch = pmatch;
1104         preg->nmatch = nmatch;
1105         preg->start = string;   /* All offsets are computed from here */
1106
1107         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1108         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1109                 int op = OP(preg, scan);
1110                 if (op == END)
1111                         break;
1112                 if (op == REPX || op == REPXMIN)
1113                         preg->program[scan + 4] = 0;
1114         }
1115
1116         /* If there is a "must appear" string, look for it. */
1117         if (preg->regmust != 0) {
1118                 s = string;
1119                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1120                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1121                                 break;
1122                         }
1123                         s++;
1124                 }
1125                 if (s == NULL)  /* Not present. */
1126                         return REG_NOMATCH;
1127         }
1128
1129         /* Mark beginning of line for ^ . */
1130         preg->regbol = string;
1131
1132         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1133         if (preg->reganch) {
1134                 if (eflags & REG_NOTBOL) {
1135                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1136                         goto nextline;
1137                 }
1138                 while (1) {
1139                         if (regtry(preg, string)) {
1140                                 return REG_NOERROR;
1141                         }
1142                         if (*string) {
1143 nextline:
1144                                 if (preg->cflags & REG_NEWLINE) {
1145                                         /* Try the next anchor? */
1146                                         string = strchr(string, '\n');
1147                                         if (string) {
1148                                                 preg->regbol = ++string;
1149                                                 continue;
1150                                         }
1151                                 }
1152                         }
1153                         return REG_NOMATCH;
1154                 }
1155         }
1156
1157         /* Messy cases:  unanchored match. */
1158         s = string;
1159         if (preg->regstart != '\0') {
1160                 /* We know what char it must start with. */
1161                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1162                         if (regtry(preg, s))
1163                                 return REG_NOERROR;
1164                         s++;
1165                 }
1166         }
1167         else
1168                 /* We don't -- general case. */
1169                 while (1) {
1170                         if (regtry(preg, s))
1171                                 return REG_NOERROR;
1172                         if (*s == '\0') {
1173                                 break;
1174                         }
1175                         else {
1176                                 int c;
1177                                 s += utf8_tounicode(s, &c);
1178                         }
1179                 }
1180
1181         /* Failure. */
1182         return REG_NOMATCH;
1183 }
1184
1185 /*
1186  - regtry - try match at specific point
1187  */
1188                         /* 0 failure, 1 success */
1189 static int regtry( regex_t *preg, const char *string )
1190 {
1191         int i;
1192
1193         preg->reginput = string;
1194
1195         for (i = 0; i < preg->nmatch; i++) {
1196                 preg->pmatch[i].rm_so = -1;
1197                 preg->pmatch[i].rm_eo = -1;
1198         }
1199         if (regmatch(preg, 1)) {
1200                 preg->pmatch[0].rm_so = string - preg->start;
1201                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1202                 return(1);
1203         } else
1204                 return(0);
1205 }
1206
1207 /**
1208  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1209  *
1210  * If 'nocase' is non-zero, does a case-insensitive match.
1211  *
1212  * Returns -1 on not found.
1213  */
1214 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1215 {
1216         const char *s = string;
1217         while (proglen && *s) {
1218                 int ch;
1219                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1220                 if (ch != *prog) {
1221                         return -1;
1222                 }
1223                 prog++;
1224                 s += n;
1225                 proglen--;
1226         }
1227         if (proglen == 0) {
1228                 return s - string;
1229         }
1230         return -1;
1231 }
1232
1233 /**
1234  * Searchs for 'c' in the range 'range'.
1235  *
1236  * Returns 1 if found, or 0 if not.
1237  */
1238 static int reg_range_find(const int *range, int c)
1239 {
1240         while (*range) {
1241                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1242                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1243                         return 1;
1244                 }
1245                 range += 2;
1246         }
1247         return 0;
1248 }
1249
1250 /**
1251  * Search for the character 'c' in the utf-8 string 'string'.
1252  *
1253  * If 'nocase' is set, the 'string' is assumed to be uppercase
1254  * and 'c' is converted to uppercase before matching.
1255  *
1256  * Returns the byte position in the string where the 'c' was found, or
1257  * NULL if not found.
1258  */
1259 static const char *str_find(const char *string, int c, int nocase)
1260 {
1261         if (nocase) {
1262                 /* The "string" should already be converted to uppercase */
1263                 c = utf8_upper(c);
1264         }
1265         while (*string) {
1266                 int ch;
1267                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1268                 if (c == ch) {
1269                         return string;
1270                 }
1271                 string += n;
1272         }
1273         return NULL;
1274 }
1275
1276 /**
1277  * Returns true if 'ch' is an end-of-line char.
1278  *
1279  * In REG_NEWLINE mode, \n is considered EOL in
1280  * addition to \0
1281  */
1282 static int reg_iseol(regex_t *preg, int ch)
1283 {
1284         if (preg->cflags & REG_NEWLINE) {
1285                 return ch == '\0' || ch == '\n';
1286         }
1287         else {
1288                 return ch == '\0';
1289         }
1290 }
1291
1292 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1293 {
1294         int nextch = '\0';
1295         const char *save;
1296         int no;
1297         int c;
1298
1299         int max = preg->program[scan + 2];
1300         int min = preg->program[scan + 3];
1301         int next = regnext(preg, scan);
1302
1303         /*
1304          * Lookahead to avoid useless match attempts
1305          * when we know what character comes next.
1306          */
1307         if (OP(preg, next) == EXACTLY) {
1308                 nextch = preg->program[OPERAND(next)];
1309         }
1310         save = preg->reginput;
1311         no = regrepeat(preg, scan + 5, max);
1312         if (no < min) {
1313                 return 0;
1314         }
1315         if (matchmin) {
1316                 /* from min up to no */
1317                 max = no;
1318                 no = min;
1319         }
1320         /* else from no down to min */
1321         while (1) {
1322                 if (matchmin) {
1323                         if (no > max) {
1324                                 break;
1325                         }
1326                 }
1327                 else {
1328                         if (no < min) {
1329                                 break;
1330                         }
1331                 }
1332                 preg->reginput = save + utf8_index(save, no);
1333                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1334                 /* If it could work, try it. */
1335                 if (reg_iseol(preg, nextch) || c == nextch) {
1336                         if (regmatch(preg, next)) {
1337                                 return(1);
1338                         }
1339                 }
1340                 if (matchmin) {
1341                         /* Couldn't or didn't, add one more */
1342                         no++;
1343                 }
1344                 else {
1345                         /* Couldn't or didn't -- back up. */
1346                         no--;
1347                 }
1348         }
1349         return(0);
1350 }
1351
1352 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1353 {
1354         int *scanpt = preg->program + scan;
1355
1356         int max = scanpt[2];
1357         int min = scanpt[3];
1358
1359         /* Have we reached min? */
1360         if (scanpt[4] < min) {
1361                 /* No, so get another one */
1362                 scanpt[4]++;
1363                 if (regmatch(preg, scan + 5)) {
1364                         return 1;
1365                 }
1366                 scanpt[4]--;
1367                 return 0;
1368         }
1369         if (scanpt[4] > max) {
1370                 return 0;
1371         }
1372
1373         if (matchmin) {
1374                 /* minimal, so try other branch first */
1375                 if (regmatch(preg, regnext(preg, scan))) {
1376                         return 1;
1377                 }
1378                 /* No, so try one more */
1379                 scanpt[4]++;
1380                 if (regmatch(preg, scan + 5)) {
1381                         return 1;
1382                 }
1383                 scanpt[4]--;
1384                 return 0;
1385         }
1386         /* maximal, so try this branch again */
1387         if (scanpt[4] < max) {
1388                 scanpt[4]++;
1389                 if (regmatch(preg, scan + 5)) {
1390                         return 1;
1391                 }
1392                 scanpt[4]--;
1393         }
1394         /* At this point we are at max with no match. Try the other branch */
1395         return regmatch(preg, regnext(preg, scan));
1396 }
1397
1398 /*
1399  - regmatch - main matching routine
1400  *
1401  * Conceptually the strategy is simple:  check to see whether the current
1402  * node matches, call self recursively to see whether the rest matches,
1403  * and then act accordingly.  In practice we make some effort to avoid
1404  * recursion, in particular by going through "ordinary" nodes (that don't
1405  * need to know whether the rest of the match failed) by a loop instead of
1406  * by recursion.
1407  */
1408 /* 0 failure, 1 success */
1409 static int regmatch(regex_t *preg, int prog)
1410 {
1411         int scan;       /* Current node. */
1412         int next;               /* Next node. */
1413         const char *save;
1414
1415         scan = prog;
1416
1417 #ifdef DEBUG
1418         if (scan != 0 && regnarrate)
1419                 fprintf(stderr, "%s(\n", regprop(scan));
1420 #endif
1421         while (scan != 0) {
1422                 int n;
1423                 int c;
1424 #ifdef DEBUG
1425                 if (regnarrate) {
1426                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1427                 }
1428 #endif
1429                 next = regnext(preg, scan);
1430                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1431
1432                 switch (OP(preg, scan)) {
1433                 case BOLX:
1434                         if ((preg->eflags & REG_NOTBOL)) {
1435                                 return(0);
1436                         }
1437                         /* Fall through */
1438                 case BOL:
1439                         if (preg->reginput != preg->regbol) {
1440                                 return(0);
1441                         }
1442                         break;
1443                 case EOLX:
1444                         if (c != 0) {
1445                                 /* For EOLX, only match real end of line, not newline */
1446                                 return 0;
1447                         }
1448                         break;
1449                 case EOL:
1450                         if (!reg_iseol(preg, c)) {
1451                                 return(0);
1452                         }
1453                         break;
1454                 case WORDA:
1455                         /* Must be looking at a letter, digit, or _ */
1456                         if ((!isalnum(UCHAR(c))) && c != '_')
1457                                 return(0);
1458                         /* Prev must be BOL or nonword */
1459                         if (preg->reginput > preg->regbol &&
1460                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1461                                 return(0);
1462                         break;
1463                 case WORDZ:
1464                         /* Can't match at BOL */
1465                         if (preg->reginput > preg->regbol) {
1466                                 /* Current must be EOL or nonword */
1467                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1468                                         c = preg->reginput[-1];
1469                                         /* Previous must be word */
1470                                         if (isalnum(UCHAR(c)) || c == '_') {
1471                                                 break;
1472                                         }
1473                                 }
1474                         }
1475                         /* No */
1476                         return(0);
1477
1478                 case ANY:
1479                         if (reg_iseol(preg, c))
1480                                 return 0;
1481                         preg->reginput += n;
1482                         break;
1483                 case EXACTLY: {
1484                                 int opnd;
1485                                 int len;
1486                                 int slen;
1487
1488                                 opnd = OPERAND(scan);
1489                                 len = str_int_len(preg->program + opnd);
1490
1491                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1492                                 if (slen < 0) {
1493                                         return(0);
1494                                 }
1495                                 preg->reginput += slen;
1496                         }
1497                         break;
1498                 case ANYOF:
1499                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1500                                 return(0);
1501                         }
1502                         preg->reginput += n;
1503                         break;
1504                 case ANYBUT:
1505                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1506                                 return(0);
1507                         }
1508                         preg->reginput += n;
1509                         break;
1510                 case NOTHING:
1511                         break;
1512                 case BACK:
1513                         break;
1514                 case BRANCH:
1515                         if (OP(preg, next) != BRANCH)           /* No choice. */
1516                                 next = OPERAND(scan);   /* Avoid recursion. */
1517                         else {
1518                                 do {
1519                                         save = preg->reginput;
1520                                         if (regmatch(preg, OPERAND(scan))) {
1521                                                 return(1);
1522                                         }
1523                                         preg->reginput = save;
1524                                         scan = regnext(preg, scan);
1525                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1526                                 return(0);
1527                                 /* NOTREACHED */
1528                         }
1529                         break;
1530                 case REP:
1531                 case REPMIN:
1532                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1533
1534                 case REPX:
1535                 case REPXMIN:
1536                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1537
1538                 case END:
1539                         return 1;       /* Success! */
1540
1541                 case OPENNC:
1542                 case CLOSENC:
1543                         return regmatch(preg, next);
1544
1545                 default:
1546                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1547                                 save = preg->reginput;
1548                                 if (regmatch(preg, next)) {
1549                                         if (OP(preg, scan) < CLOSE) {
1550                                                 int no = OP(preg, scan) - OPEN;
1551                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1552                                                         preg->pmatch[no].rm_so = save - preg->start;
1553                                                 }
1554                                         }
1555                                         else {
1556                                                 int no = OP(preg, scan) - CLOSE;
1557                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1558                                                         preg->pmatch[no].rm_eo = save - preg->start;
1559                                                 }
1560                                         }
1561                                         return(1);
1562                                 }
1563                                 return(0);
1564                         }
1565                         return REG_ERR_INTERNAL;
1566                 }
1567
1568                 scan = next;
1569         }
1570
1571         /*
1572          * We get here only if there's trouble -- normally "case END" is
1573          * the terminating point.
1574          */
1575         return REG_ERR_INTERNAL;
1576 }
1577
1578 /*
1579  - regrepeat - repeatedly match something simple, report how many
1580  */
1581 static int regrepeat(regex_t *preg, int p, int max)
1582 {
1583         int count = 0;
1584         const char *scan;
1585         int opnd;
1586         int ch;
1587         int n;
1588
1589         scan = preg->reginput;
1590         opnd = OPERAND(p);
1591         switch (OP(preg, p)) {
1592         case ANY:
1593                 /* No need to handle utf8 specially here */
1594                 while (!reg_iseol(preg, *scan) && count < max) {
1595                         count++;
1596                         scan++;
1597                 }
1598                 break;
1599         case EXACTLY:
1600                 while (count < max) {
1601                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1602                         if (preg->program[opnd] != ch) {
1603                                 break;
1604                         }
1605                         count++;
1606                         scan += n;
1607                 }
1608                 break;
1609         case ANYOF:
1610                 while (count < max) {
1611                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1612                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1613                                 break;
1614                         }
1615                         count++;
1616                         scan += n;
1617                 }
1618                 break;
1619         case ANYBUT:
1620                 while (count < max) {
1621                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1622                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1623                                 break;
1624                         }
1625                         count++;
1626                         scan += n;
1627                 }
1628                 break;
1629         default:                /* Oh dear.  Called inappropriately. */
1630                 preg->err = REG_ERR_INTERNAL;
1631                 count = 0;      /* Best compromise. */
1632                 break;
1633         }
1634         preg->reginput = scan;
1635
1636         return(count);
1637 }
1638
1639 /*
1640  - regnext - dig the "next" pointer out of a node
1641  */
1642 static int regnext(regex_t *preg, int p )
1643 {
1644         int offset;
1645
1646         offset = NEXT(preg, p);
1647
1648         if (offset == 0)
1649                 return 0;
1650
1651         if (OP(preg, p) == BACK)
1652                 return(p-offset);
1653         else
1654                 return(p+offset);
1655 }
1656
1657 /*
1658  - regopsize - returns the size of opcode + operands at 'p' in words
1659  */
1660 static int regopsize(regex_t *preg, int p )
1661 {
1662         /* Almost all opcodes are 2 words, but some are more */
1663         switch (OP(preg, p)) {
1664                 case REP:
1665                 case REPMIN:
1666                 case REPX:
1667                 case REPXMIN:
1668                         return 5;
1669
1670                 case ANYOF:
1671                 case ANYBUT:
1672                 case EXACTLY: {
1673                         int s = p + 2;
1674                         while (preg->program[s++]) {
1675                         }
1676                         return s - p;
1677                 }
1678         }
1679         return 2;
1680 }
1681
1682 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1683
1684 /*
1685  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1686  */
1687 static void regdump(regex_t *preg)
1688 {
1689         int s;
1690         int op = EXACTLY;       /* Arbitrary non-END op. */
1691         int next;
1692         char buf[MAX_UTF8_LEN + 1];
1693
1694         int i;
1695         for (i = 1; i < preg->p; i++) {
1696                 printf("%02x ", (unsigned char)preg->program[i]);
1697                 if (i % 16 == 0) {
1698                         printf("\n");
1699                 }
1700         }
1701         printf("\n");
1702
1703         s = 1;
1704         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1705                 op = OP(preg, s);
1706                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1707                 next = regnext(preg, s);
1708                 if (next == 0)          /* Next ptr. */
1709                         printf("(0)");
1710                 else
1711                         printf("(%d)", next);
1712                 s += 2;
1713                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1714                         int max = preg->program[s];
1715                         int min = preg->program[s + 1];
1716                         if (max == 65535) {
1717                                 printf("{%d,*}", min);
1718                         }
1719                         else {
1720                                 printf("{%d,%d}", min, max);
1721                         }
1722                         printf(" %d", preg->program[s + 2]);
1723                         s += 3;
1724                 }
1725                 else if (op == ANYOF || op == ANYBUT) {
1726                         /* set of ranges */
1727
1728                         while (preg->program[s]) {
1729                                 int len = preg->program[s++];
1730                                 int first = preg->program[s++];
1731                                 buf[utf8_getchars(buf, first)] = 0;
1732                                 printf("%s", buf);
1733                                 if (len > 1) {
1734                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1735                                         printf("-%s", buf);
1736                                 }
1737                         }
1738                         s++;
1739                 }
1740                 else if (op == EXACTLY) {
1741                         /* Literal string, where present. */
1742
1743                         while (preg->program[s]) {
1744                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1745                                 printf("%s", buf);
1746                                 s++;
1747                         }
1748                         s++;
1749                 }
1750                 putchar('\n');
1751         }
1752
1753         if (op == END) {
1754                 /* Header fields of interest. */
1755                 if (preg->regstart) {
1756                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1757                         printf("start '%s' ", buf);
1758                 }
1759                 if (preg->reganch)
1760                         printf("anchored ");
1761                 if (preg->regmust != 0) {
1762                         int i;
1763                         printf("must have:");
1764                         for (i = 0; i < preg->regmlen; i++) {
1765                                 putchar(preg->program[preg->regmust + i]);
1766                         }
1767                         putchar('\n');
1768                 }
1769         }
1770         printf("\n");
1771 }
1772
1773 /*
1774  - regprop - printable representation of opcode
1775  */
1776 static const char *regprop( int op )
1777 {
1778         static char buf[50];
1779
1780         switch (op) {
1781         case BOL:
1782                 return "BOL";
1783         case EOL:
1784                 return "EOL";
1785         case BOLX:
1786                 return "BOLX";
1787         case EOLX:
1788                 return "EOLX";
1789         case ANY:
1790                 return "ANY";
1791         case ANYOF:
1792                 return "ANYOF";
1793         case ANYBUT:
1794                 return "ANYBUT";
1795         case BRANCH:
1796                 return "BRANCH";
1797         case EXACTLY:
1798                 return "EXACTLY";
1799         case NOTHING:
1800                 return "NOTHING";
1801         case BACK:
1802                 return "BACK";
1803         case END:
1804                 return "END";
1805         case REP:
1806                 return "REP";
1807         case REPMIN:
1808                 return "REPMIN";
1809         case REPX:
1810                 return "REPX";
1811         case REPXMIN:
1812                 return "REPXMIN";
1813         case WORDA:
1814                 return "WORDA";
1815         case WORDZ:
1816                 return "WORDZ";
1817         case OPENNC:
1818                 return "OPEN";
1819         case CLOSENC:
1820                 return "CLOSE";
1821         default:
1822                 if (op >= OPEN && op < CLOSE) {
1823                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1824                 }
1825                 else if (op >= CLOSE && op < CLOSE_END) {
1826                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1827                 }
1828                 else {
1829                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1830                 }
1831                 return(buf);
1832         }
1833 }
1834 #endif /* JIM_BOOTSTRAP */
1835
1836 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1837 {
1838         static const char *error_strings[] = {
1839                 "success",
1840                 "no match",
1841                 "bad pattern",
1842                 "null argument",
1843                 "unknown error",
1844                 "too big",
1845                 "out of memory",
1846                 "too many ()",
1847                 "parentheses () not balanced",
1848                 "braces {} not balanced",
1849                 "invalid repetition count(s)",
1850                 "extra characters",
1851                 "*+ of empty atom",
1852                 "nested count",
1853                 "internal error",
1854                 "count follows nothing",
1855                 "trailing backslash",
1856                 "corrupted program",
1857                 "contains null char",
1858         };
1859         const char *err;
1860
1861         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1862                 err = "Bad error code";
1863         }
1864         else {
1865                 err = error_strings[errcode];
1866         }
1867
1868         return snprintf(errbuf, errbuf_size, "%s", err);
1869 }
1870
1871 void regfree(regex_t *preg)
1872 {
1873         free(preg->program);
1874 }
1875
1876 #endif