jimregexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *      Copyright (c) 1986 by University of Toronto.
   5  *      Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *      Permission is granted to anyone to use this software for any
   8  *      purpose on any computer system, and to redistribute it freely,
   9  *      subject to the following restrictions:
  10  *
  11  *      1. The author is not responsible for the consequences of use of
  12  *              this software, no matter how awful, even if they arise
  13  *              from defects in it.
  14  *
  15  *      2. The origin of this software must not be misrepresented, either
  16  *              by explicit claim or by omission.
  17  *
  18  *      3. Altered versions must be plainly marked as such, and must not
  19  *              be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences. It also removes \n as an alternative to |.
  47  *
  48  * Beware that some of this code is subtly aware of the way operator
  49  * precedence is structured in regular expressions.  Serious changes in
  50  * regular-expression syntax might require a total rethink.
  51  */
  52 #include <stdio.h>
  53 #include <ctype.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56
  57 #include "jim.h"
  58 #include "jimautoconf.h"
  59 #include "jimregexp.h"
  60 #include "utf8.h"
  61
  62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
  63
  64 /*
  65  * Structure for regexp "program".  This is essentially a linear encoding
  66  * of a nondeterministic finite-state machine (aka syntax charts or
  67  * "railroad normal form" in parsing technology).  Each node is an opcode
  68  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  69  * all nodes except BRANCH implement concatenation; a "next" pointer with
  70  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  71  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  72  * opposed to a collection of them) is never concatenated with anything
  73  * because of operator precedence.)  The operand of some types of node is
  74  * a literal string; for others, it is a node leading into a sub-FSM.  In
  75  * particular, the operand of a BRANCH node is the first node of the branch.
  76  * (NB this is *not* a tree structure:  the tail of the branch connects
  77  * to the thing following the set of BRANCHes.)  The opcodes are:
  78  */
  79
  80 /* This *MUST* be less than (255-20-1)/2=117 */
  81 #define REG_MAX_PAREN 100
  82
  83 /* definition   number  opnd?   meaning */
  84 #define END     0       /* no   End of program. */
  85 #define BOL     1       /* no   Match "" at beginning of line. */
  86 #define EOL     2       /* no   Match "" at end of line. */
  87 #define ANY     3       /* no   Match any one character. */
  88 #define ANYOF   4       /* str  Match any character in this string. */
  89 #define ANYBUT  5       /* str  Match any character not in this string. */
  90 #define BRANCH  6       /* node Match this alternative, or the next... */
  91 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  92 #define EXACTLY 8       /* str  Match this string. */
  93 #define NOTHING 9       /* no   Match empty string. */
  94 #define REP             10      /* max,min      Match this (simple) thing [min,max] times. */
  95 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, mininal match. */
  96 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
  97 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
  98
  99 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 100 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 101 #define OPENNC  19      /* no   Non-capturing parentheses - must be OPEN-1 */
 102 #define OPEN    20      /* no   Mark this point in input as start of #n. */
 103                         /*      OPEN+1 is number 1, etc. */
 104 #define CLOSE   (OPEN+REG_MAX_PAREN+1)  /* no   Analogous to OPEN. */
 105 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 106 #define CLOSENC (CLOSE-1)       /* no   Non-capturing parentheses - must be CLOSE-1 */
 107
 108 /*
 109  * The first byte of the regexp internal "program" is actually this magic
 110  * number; the start node begins in the second byte.
 111  */
 112 #define REG_MAGIC       0xFADED00D
 113
 114 /*
 115  * Opcode notes:
 116  *
 117  * BRANCH       The set of branches constituting a single choice are hooked
 118  *              together with their "next" pointers, since precedence prevents
 119  *              anything being concatenated to any individual branch.  The
 120  *              "next" pointer of the last BRANCH in a choice points to the
 121  *              thing following the whole choice.  This is also where the
 122  *              final "next" pointer of each individual branch points; each
 123  *              branch starts with the operand node of a BRANCH node.
 124  *
 125  * BACK         Normal "next" pointers all implicitly point forward; BACK
 126  *              exists to make loop structures possible.
 127  *
 128  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 129  *              BRANCH structures using BACK.  Simple cases (one character
 130  *              per match) are implemented with STAR and PLUS for speed
 131  *              and to minimize recursive plunges.
 132  *
 133  * OPEN,CLOSE   ...are numbered at compile time.
 134  */
 135
 136 /*
 137  * A node is one char of opcode followed by two chars of "next" pointer.
 138  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 139  * value is a positive offset from the opcode of the node containing it.
 140  * An operand, if any, simply follows the node.  (Note that much of the
 141  * code generation knows about this implicit relationship.)
 142  *
 143  * Using two bytes for the "next" pointer is vast overkill for most things,
 144  * but allows patterns to get big without disasters.
 145  */
 146 #define OP(preg, p)     (preg->program[p])
 147 #define NEXT(preg, p)   (preg->program[p + 1])
 148 #define OPERAND(p)      ((p) + 2)
 149
 150 /*
 151  * See regmagic.h for one further detail of program structure.
 152  */
 153
 154
 155 /*
 156  * Utility definitions.
 157  */
 158
 159 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 160 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 161 #define META    "^$.[()|?{+*"
 162
 163 /*
 164  * Flags to be passed up and down.
 165  */
 166 #define HASWIDTH        01      /* Known never to match null string. */
 167 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
 168 #define SPSTART         04      /* Starts with * or +. */
 169 #define WORST           0       /* Worst case. */
 170
 171 #define MAX_REP_COUNT 1000000
 172
 173 /*
 174  * Forward declarations for regcomp()'s friends.
 175  */
 176 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 177 static int regpiece(regex_t *preg, int *flagp );
 178 static int regbranch(regex_t *preg, int *flagp );
 179 static int regatom(regex_t *preg, int *flagp );
 180 static int regnode(regex_t *preg, int op );
 181 static int regnext(regex_t *preg, int p );
 182 static void regc(regex_t *preg, int b );
 183 static int reginsert(regex_t *preg, int op, int size, int opnd );
 184 static void regtail_(regex_t *preg, int p, int val, int line );
 185 static void regoptail(regex_t *preg, int p, int val );
 186 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
 187
 188 static int reg_range_find(const int *string, int c);
 189 static const char *str_find(const char *string, int c, int nocase);
 190 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 191
 192 /*#define DEBUG*/
 193 #ifdef DEBUG
 194 static int regnarrate = 0;
 195 static void regdump(regex_t *preg);
 196 static const char *regprop( int op );
 197 #endif
 198
 199
 200 /**
 201  * Returns the length of the null-terminated integer sequence.
 202  */
 203 static int str_int_len(const int *seq)
 204 {
 205         int n = 0;
 206         while (*seq++) {
 207                 n++;
 208         }
 209         return n;
 210 }
 211
 212 /*
 213  - regcomp - compile a regular expression into internal code
 214  *
 215  * We can't allocate space until we know how big the compiled form will be,
 216  * but we can't compile it (and thus know how big it is) until we've got a
 217  * place to put the code.  So we cheat:  we compile it twice, once with code
 218  * generation turned off and size counting turned on, and once "for real".
 219  * This also means that we don't allocate space until we are sure that the
 220  * thing really will compile successfully, and we never have to move the
 221  * code and thus invalidate pointers into it.  (Note that it has to be in
 222  * one piece because free() must be able to free it all.)
 223  *
 224  * Beware that the optimization-preparation code in here knows about some
 225  * of the structure of the compiled regexp.
 226  */
 227 int regcomp(regex_t *preg, const char *exp, int cflags)
 228 {
 229         int scan;
 230         int longest;
 231         unsigned len;
 232         int flags;
 233
 234 #ifdef DEBUG
 235         fprintf(stderr, "Compiling: '%s'\n", exp);
 236 #endif
 237         memset(preg, 0, sizeof(*preg));
 238
 239         if (exp == NULL)
 240                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 241
 242         /* First pass: determine size, legality. */
 243         preg->cflags = cflags;
 244         preg->regparse = exp;
 245         /* XXX: For now, start unallocated */
 246         preg->program = NULL;
 247         preg->proglen = 0;
 248
 249         /* Allocate space. */
 250         preg->proglen = (strlen(exp) + 1) * 5;
 251         preg->program = malloc(preg->proglen * sizeof(int));
 252         if (preg->program == NULL)
 253                 FAIL(preg, REG_ERR_NOMEM);
 254
 255         /* Note that since we store a magic value as the first item in the program,
 256          * program offsets will never be 0
 257          */
 258         regc(preg, REG_MAGIC);
 259         if (reg(preg, 0, &flags) == 0) {
 260                 return preg->err;
 261         }
 262
 263         /* Small enough for pointer-storage convention? */
 264         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 265                 FAIL(preg,REG_ERR_TOO_BIG);
 266
 267         /* Dig out information for optimizations. */
 268         preg->regstart = 0;     /* Worst-case defaults. */
 269         preg->reganch = 0;
 270         preg->regmust = 0;
 271         preg->regmlen = 0;
 272         scan = 1;                       /* First BRANCH. */
 273         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 274                 scan = OPERAND(scan);
 275
 276                 /* Starting-point info. */
 277                 if (OP(preg, scan) == EXACTLY) {
 278                         preg->regstart = preg->program[OPERAND(scan)];
 279                 }
 280                 else if (OP(preg, scan) == BOL)
 281                         preg->reganch++;
 282
 283                 /*
 284                  * If there's something expensive in the r.e., find the
 285                  * longest literal string that must appear and make it the
 286                  * regmust.  Resolve ties in favor of later strings, since
 287                  * the regstart check works with the beginning of the r.e.
 288                  * and avoiding duplication strengthens checking.  Not a
 289                  * strong reason, but sufficient in the absence of others.
 290                  */
 291                 if (flags&SPSTART) {
 292                         longest = 0;
 293                         len = 0;
 294                         for (; scan != 0; scan = regnext(preg, scan)) {
 295                                 if (OP(preg, scan) == EXACTLY) {
 296                                         int plen = str_int_len(preg->program + OPERAND(scan));
 297                                         if (plen >= len) {
 298                                                 longest = OPERAND(scan);
 299                                                 len = plen;
 300                                         }
 301                                 }
 302                         }
 303                         preg->regmust = longest;
 304                         preg->regmlen = len;
 305                 }
 306         }
 307
 308 #ifdef DEBUG
 309         regdump(preg);
 310 #endif
 311
 312         return 0;
 313 }
 314
 315 /*
 316  - reg - regular expression, i.e. main body or parenthesized thing
 317  *
 318  * Caller must absorb opening parenthesis.
 319  *
 320  * Combining parenthesis handling with the base level of regular expression
 321  * is a trifle forced, but the need to tie the tails of the branches to what
 322  * follows makes it hard to avoid.
 323  */
 324 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 325 {
 326         int ret;
 327         int br;
 328         int ender;
 329         int parno = 0;
 330         int flags;
 331
 332         *flagp = HASWIDTH;      /* Tentatively. */
 333
 334         /* Make an OPEN node, if parenthesized. */
 335         if (paren) {
 336                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 337                         /* non-capturing paren */
 338                         preg->regparse += 2;
 339                         parno = -1;
 340                 }
 341                 else {
 342                         parno = ++preg->re_nsub;
 343                 }
 344                 ret = regnode(preg, OPEN+parno);
 345         } else
 346                 ret = 0;
 347
 348         /* Pick up the branches, linking them together. */
 349         br = regbranch(preg, &flags);
 350         if (br == 0)
 351                 return 0;
 352         if (ret != 0)
 353                 regtail(preg, ret, br); /* OPEN -> first. */
 354         else
 355                 ret = br;
 356         if (!(flags&HASWIDTH))
 357                 *flagp &= ~HASWIDTH;
 358         *flagp |= flags&SPSTART;
 359         while (*preg->regparse == '|') {
 360                 preg->regparse++;
 361                 br = regbranch(preg, &flags);
 362                 if (br == 0)
 363                         return 0;
 364                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 365                 if (!(flags&HASWIDTH))
 366                         *flagp &= ~HASWIDTH;
 367                 *flagp |= flags&SPSTART;
 368         }
 369
 370         /* Make a closing node, and hook it on the end. */
 371         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 372         regtail(preg, ret, ender);
 373
 374         /* Hook the tails of the branches to the closing node. */
 375         for (br = ret; br != 0; br = regnext(preg, br))
 376                 regoptail(preg, br, ender);
 377
 378         /* Check for proper termination. */
 379         if (paren && *preg->regparse++ != ')') {
 380                 preg->err = REG_ERR_UNMATCHED_PAREN;
 381                 return 0;
 382         } else if (!paren && *preg->regparse != '\0') {
 383                 if (*preg->regparse == ')') {
 384                         preg->err = REG_ERR_UNMATCHED_PAREN;
 385                         return 0;
 386                 } else {
 387                         preg->err = REG_ERR_JUNK_ON_END;
 388                         return 0;
 389                 }
 390         }
 391
 392         return(ret);
 393 }
 394
 395 /*
 396  - regbranch - one alternative of an | operator
 397  *
 398  * Implements the concatenation operator.
 399  */
 400 static int regbranch(regex_t *preg, int *flagp )
 401 {
 402         int ret;
 403         int chain;
 404         int latest;
 405         int flags;
 406
 407         *flagp = WORST;         /* Tentatively. */
 408
 409         ret = regnode(preg, BRANCH);
 410         chain = 0;
 411         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 412                *preg->regparse != '|') {
 413                 latest = regpiece(preg, &flags);
 414                 if (latest == 0)
 415                         return 0;
 416                 *flagp |= flags&HASWIDTH;
 417                 if (chain == 0) {/* First piece. */
 418                         *flagp |= flags&SPSTART;
 419                 }
 420                 else {
 421                         regtail(preg, chain, latest);
 422                 }
 423                 chain = latest;
 424         }
 425         if (chain == 0) /* Loop ran zero times. */
 426                 (void) regnode(preg, NOTHING);
 427
 428         return(ret);
 429 }
 430
 431 /*
 432  - regpiece - something followed by possible [*+?]
 433  *
 434  * Note that the branching code sequences used for ? and the general cases
 435  * of * and + are somewhat optimized:  they use the same NOTHING node as
 436  * both the endmarker for their branch list and the body of the last branch.
 437  * It might seem that this node could be dispensed with entirely, but the
 438  * endmarker role is not redundant.
 439  */
 440 static int regpiece(regex_t *preg, int *flagp)
 441 {
 442         int ret;
 443         char op;
 444         int next;
 445         int flags;
 446         int chain = 0;
 447         int min;
 448         int max;
 449
 450         ret = regatom(preg, &flags);
 451         if (ret == 0)
 452                 return 0;
 453
 454         op = *preg->regparse;
 455         if (!ISMULT(op)) {
 456                 *flagp = flags;
 457                 return(ret);
 458         }
 459
 460         if (!(flags&HASWIDTH) && op != '?') {
 461                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 462                 return 0;
 463         }
 464
 465         /* Handle braces (counted repetition) by expansion */
 466         if (op == '{') {
 467                 char *end;
 468
 469                 min = strtoul(preg->regparse + 1, &end, 10);
 470                 if (end == preg->regparse + 1) {
 471                         preg->err = REG_ERR_BAD_COUNT;
 472                         return 0;
 473                 }
 474                 if (*end == '}') {
 475                         max = min;
 476                 }
 477                 else {
 478                         preg->regparse = end;
 479                         max = strtoul(preg->regparse + 1, &end, 10);
 480                         if (*end != '}') {
 481                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 482                                 return 0;
 483                         }
 484                 }
 485                 if (end == preg->regparse + 1) {
 486                         max = MAX_REP_COUNT;
 487                 }
 488                 else if (max < min || max >= 100) {
 489                         preg->err = REG_ERR_BAD_COUNT;
 490                         return 0;
 491                 }
 492                 if (min >= 100) {
 493                         preg->err = REG_ERR_BAD_COUNT;
 494                         return 0;
 495                 }
 496
 497                 preg->regparse = strchr(preg->regparse, '}');
 498         }
 499         else {
 500                 min = (op == '+');
 501                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 502         }
 503
 504         if (preg->regparse[1] == '?') {
 505                 preg->regparse++;
 506                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 507         }
 508         else {
 509                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 510         }
 511         preg->program[ret + 2] = max;
 512         preg->program[ret + 3] = min;
 513         preg->program[ret + 4] = 0;
 514
 515         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 516
 517         if (!(flags & SIMPLE)) {
 518                 int back = regnode(preg, BACK);
 519                 regtail(preg, back, ret);
 520                 regtail(preg, next, back);
 521         }
 522
 523         preg->regparse++;
 524         if (ISMULT(*preg->regparse)) {
 525                 preg->err = REG_ERR_NESTED_COUNT;
 526                 return 0;
 527         }
 528
 529         return chain ? chain : ret;
 530 }
 531
 532 /**
 533  * Add all characters in the inclusive range between lower and upper.
 534  *
 535  * Handles a swapped range (upper < lower).
 536  */
 537 static void reg_addrange(regex_t *preg, int lower, int upper)
 538 {
 539         if (lower > upper) {
 540                 reg_addrange(preg, upper, lower);
 541         }
 542         /* Add a range as length, start */
 543         regc(preg, upper - lower + 1);
 544         regc(preg, lower);
 545 }
 546
 547 /**
 548  * Add a null-terminated literal string as a set of ranges.
 549  */
 550 static void reg_addrange_str(regex_t *preg, const char *str)
 551 {
 552         while (*str) {
 553                 reg_addrange(preg, *str, *str);
 554                 str++;
 555         }
 556 }
 557
 558 /**
 559  * Extracts the next unicode char from utf8.
 560  *
 561  * If 'upper' is set, converts the char to uppercase.
 562  */
 563 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 564 {
 565         int l = utf8_tounicode(s, uc);
 566         if (upper) {
 567                 *uc = utf8_upper(*uc);
 568         }
 569         return l;
 570 }
 571
 572 /**
 573  * Converts a hex digit to decimal.
 574  *
 575  * Returns -1 for an invalid hex digit.
 576  */
 577 static int hexdigitval(int c)
 578 {
 579         if (c >= '0' && c <= '9')
 580                 return c - '0';
 581         if (c >= 'a' && c <= 'f')
 582                 return c - 'a' + 10;
 583         if (c >= 'A' && c <= 'F')
 584                 return c - 'A' + 10;
 585         return -1;
 586 }
 587
 588 /**
 589  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 590  *
 591  * Returns the number of hex digits parsed.
 592  * If there are no hex digits, returns 0 and stores nothing.
 593  */
 594 static int parse_hex(const char *s, int n, int *uc)
 595 {
 596         int val = 0;
 597         int k;
 598
 599         for (k = 0; k < n; k++) {
 600                 int c = hexdigitval(*s++);
 601                 if (c == -1) {
 602                         break;
 603                 }
 604                 val = (val << 4) | c;
 605         }
 606         if (k) {
 607                 *uc = val;
 608         }
 609         return k;
 610 }
 611
 612 /**
 613  * Call for chars after a backlash to decode the escape sequence.
 614  *
 615  * Stores the result in *ch.
 616  *
 617  * Returns the number of bytes consumed.
 618  */
 619 static int reg_decode_escape(const char *s, int *ch)
 620 {
 621         int n;
 622         const char *s0 = s;
 623
 624         *ch = *s++;
 625
 626         switch (*ch) {
 627                 case 'b': *ch = '\b'; break;
 628                 case 'e': *ch = 27; break;
 629                 case 'f': *ch = '\f'; break;
 630                 case 'n': *ch = '\n'; break;
 631                 case 'r': *ch = '\r'; break;
 632                 case 't': *ch = '\t'; break;
 633                 case 'v': *ch = '\v'; break;
 634                 case 'u':
 635                         if (*s == '{') {
 636                                 /* Expect \u{NNNN} */
 637                                 n = parse_hex(s + 1, 6, ch);
 638                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 639                                         s += n + 2;
 640                                 }
 641                                 else {
 642                                         /* Invalid, so just treat as an escaped 'u' */
 643                                         *ch = 'u';
 644                                 }
 645                         }
 646                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 647                                 s += n;
 648                         }
 649                         break;
 650                 case 'U':
 651                         if ((n = parse_hex(s, 8, ch)) > 0) {
 652                                 s += n;
 653                         }
 654                 case 'x':
 655                         if ((n = parse_hex(s, 2, ch)) > 0) {
 656                                 s += n;
 657                         }
 658                         break;
 659                 case '\0':
 660                         s--;
 661                         *ch = '\\';
 662                         break;
 663         }
 664         return s - s0;
 665 }
 666
 667 /*
 668  - regatom - the lowest level
 669  *
 670  * Optimization:  gobbles an entire sequence of ordinary characters so that
 671  * it can turn them into a single node, which is smaller to store and
 672  * faster to run.  Backslashed characters are exceptions, each becoming a
 673  * separate node; the code is simpler that way and it's not worth fixing.
 674  */
 675 static int regatom(regex_t *preg, int *flagp)
 676 {
 677         int ret;
 678         int flags;
 679         int nocase = (preg->cflags & REG_ICASE);
 680
 681         int ch;
 682         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 683
 684         *flagp = WORST;         /* Tentatively. */
 685
 686         preg->regparse += n;
 687         switch (ch) {
 688         /* FIXME: these chars only have meaning at beg/end of pat? */
 689         case '^':
 690                 ret = regnode(preg, BOL);
 691                 break;
 692         case '$':
 693                 ret = regnode(preg, EOL);
 694                 break;
 695         case '.':
 696                 ret = regnode(preg, ANY);
 697                 *flagp |= HASWIDTH|SIMPLE;
 698                 break;
 699         case '[': {
 700                         const char *pattern = preg->regparse;
 701
 702                         if (*pattern == '^') {  /* Complement of range. */
 703                                 ret = regnode(preg, ANYBUT);
 704                                 pattern++;
 705                         } else
 706                                 ret = regnode(preg, ANYOF);
 707
 708                         /* Special case. If the first char is ']' or '-', it is part of the set */
 709                         if (*pattern == ']' || *pattern == '-') {
 710                                 reg_addrange(preg, *pattern, *pattern);
 711                                 pattern++;
 712                         }
 713
 714                         while (*pattern && *pattern != ']') {
 715                                 /* Is this a range? a-z */
 716                                 int start;
 717                                 int end;
 718
 719                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 720                                 if (start == '\\') {
 721                                         pattern += reg_decode_escape(pattern, &start);
 722                                         if (start == 0) {
 723                                                 preg->err = REG_ERR_NULL_CHAR;
 724                                                 return 0;
 725                                         }
 726                                 }
 727                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 728                                         /* skip '-' */
 729                                         pattern += utf8_tounicode(pattern, &end);
 730                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 731                                         if (end == '\\') {
 732                                                 pattern += reg_decode_escape(pattern, &end);
 733                                                 if (end == 0) {
 734                                                         preg->err = REG_ERR_NULL_CHAR;
 735                                                         return 0;
 736                                                 }
 737                                         }
 738
 739                                         reg_addrange(preg, start, end);
 740                                         continue;
 741                                 }
 742                                 if (start == '[') {
 743                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 744                                                 if ((preg->cflags & REG_ICASE) == 0) {
 745                                                         reg_addrange(preg, 'a', 'z');
 746                                                 }
 747                                                 reg_addrange(preg, 'A', 'Z');
 748                                                 pattern += 8;
 749                                                 continue;
 750                                         }
 751                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 752                                                 if ((preg->cflags & REG_ICASE) == 0) {
 753                                                         reg_addrange(preg, 'a', 'z');
 754                                                 }
 755                                                 reg_addrange(preg, 'A', 'Z');
 756                                                 reg_addrange(preg, '0', '9');
 757                                                 pattern += 8;
 758                                                 continue;
 759                                         }
 760                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 761                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 762                                                 pattern += 8;
 763                                                 continue;
 764                                         }
 765                                 }
 766                                 /* Not a range, so just add the char */
 767                                 reg_addrange(preg, start, start);
 768                         }
 769                         regc(preg, '\0');
 770
 771                         if (*pattern) {
 772                                 pattern++;
 773                         }
 774                         preg->regparse = pattern;
 775
 776                         *flagp |= HASWIDTH|SIMPLE;
 777                 }
 778                 break;
 779         case '(':
 780                 ret = reg(preg, 1, &flags);
 781                 if (ret == 0)
 782                         return 0;
 783                 *flagp |= flags&(HASWIDTH|SPSTART);
 784                 break;
 785         case '\0':
 786         case '|':
 787         case ')':
 788                 preg->err = REG_ERR_INTERNAL;
 789                 return 0;       /* Supposed to be caught earlier. */
 790         case '?':
 791         case '+':
 792         case '*':
 793         case '{':
 794                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 795                 return 0;
 796         case '\\':
 797                 switch (*preg->regparse++) {
 798                 case '\0':
 799                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 800                         return 0;
 801                 case '<':
 802                 case 'm':
 803                         ret = regnode(preg, WORDA);
 804                         break;
 805                 case '>':
 806                 case 'M':
 807                         ret = regnode(preg, WORDZ);
 808                         break;
 809                 case 'd':
 810                         ret = regnode(preg, ANYOF);
 811                         reg_addrange(preg, '0', '9');
 812                         regc(preg, '\0');
 813                         *flagp |= HASWIDTH|SIMPLE;
 814                         break;
 815                 case 'w':
 816                         ret = regnode(preg, ANYOF);
 817                         if ((preg->cflags & REG_ICASE) == 0) {
 818                                 reg_addrange(preg, 'a', 'z');
 819                         }
 820                         reg_addrange(preg, 'A', 'Z');
 821                         reg_addrange(preg, '0', '9');
 822                         reg_addrange(preg, '_', '_');
 823                         regc(preg, '\0');
 824                         *flagp |= HASWIDTH|SIMPLE;
 825                         break;
 826                 case 's':
 827                         ret = regnode(preg, ANYOF);
 828                         reg_addrange_str(preg," \t\r\n\f\v");
 829                         regc(preg, '\0');
 830                         *flagp |= HASWIDTH|SIMPLE;
 831                         break;
 832                 /* FIXME: Someday handle \1, \2, ... */
 833                 default:
 834                         /* Handle general quoted chars in exact-match routine */
 835                         /* Back up to include the backslash */
 836                         preg->regparse--;
 837                         goto de_fault;
 838                 }
 839                 break;
 840         de_fault:
 841         default: {
 842                         /*
 843                          * Encode a string of characters to be matched exactly.
 844                          */
 845                         int added = 0;
 846
 847                         /* Back up to pick up the first char of interest */
 848                         preg->regparse -= n;
 849
 850                         ret = regnode(preg, EXACTLY);
 851
 852                         /* Note that a META operator such as ? or * consumes the
 853                          * preceding char.
 854                          * Thus we must be careful to look ahead by 2 and add the
 855                          * last char as it's own EXACTLY if necessary
 856                          */
 857
 858                         /* Until end of string or a META char is reached */
 859                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 860                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 861                                 if (ch == '\\' && preg->regparse[n]) {
 862                                         /* Non-trailing backslash.
 863                                          * Is this a special escape, or a regular escape?
 864                                          */
 865                                         if (strchr("<>mMwds", preg->regparse[n])) {
 866                                                 /* A special escape. All done with EXACTLY */
 867                                                 break;
 868                                         }
 869                                         /* Decode it. Note that we add the length for the escape
 870                                          * sequence to the length for the backlash so we can skip
 871                                          * the entire sequence, or not as required.
 872                                          */
 873                                         n += reg_decode_escape(preg->regparse + n, &ch);
 874                                         if (ch == 0) {
 875                                                 preg->err = REG_ERR_NULL_CHAR;
 876                                                 return 0;
 877                                         }
 878                                 }
 879
 880                                 /* Now we have one char 'ch' of length 'n'.
 881                                  * Check to see if the following char is a MULT
 882                                  */
 883
 884                                 if (ISMULT(preg->regparse[n])) {
 885                                         /* Yes. But do we already have some EXACTLY chars? */
 886                                         if (added) {
 887                                                 /* Yes, so return what we have and pick up the current char next time around */
 888                                                 break;
 889                                         }
 890                                         /* No, so add this single char and finish */
 891                                         regc(preg, ch);
 892                                         added++;
 893                                         preg->regparse += n;
 894                                         break;
 895                                 }
 896
 897                                 /* No, so just add this char normally */
 898                                 regc(preg, ch);
 899                                 added++;
 900                                 preg->regparse += n;
 901                         }
 902                         regc(preg, '\0');
 903
 904                         *flagp |= HASWIDTH;
 905                         if (added == 1)
 906                                 *flagp |= SIMPLE;
 907                         break;
 908                 }
 909                 break;
 910         }
 911
 912         return(ret);
 913 }
 914
 915 static void reg_grow(regex_t *preg, int n)
 916 {
 917         if (preg->p + n >= preg->proglen) {
 918                 preg->proglen = (preg->p + n) * 2;
 919                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 920         }
 921 }
 922
 923 /*
 924  - regnode - emit a node
 925  */
 926 /* Location. */
 927 static int regnode(regex_t *preg, int op)
 928 {
 929         reg_grow(preg, 2);
 930
 931         preg->program[preg->p++] = op;
 932         preg->program[preg->p++] = 0;
 933
 934         /* Return the start of the node */
 935         return preg->p - 2;
 936 }
 937
 938 /*
 939  - regc - emit (if appropriate) a byte of code
 940  */
 941 static void regc(regex_t *preg, int b )
 942 {
 943         reg_grow(preg, 1);
 944         preg->program[preg->p++] = b;
 945 }
 946
 947 /*
 948  - reginsert - insert an operator in front of already-emitted operand
 949  *
 950  * Means relocating the operand.
 951  * Returns the new location of the original operand.
 952  */
 953 static int reginsert(regex_t *preg, int op, int size, int opnd )
 954 {
 955         reg_grow(preg, size);
 956
 957         /* Move everything from opnd up */
 958         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
 959         /* Zero out the new space */
 960         memset(preg->program + opnd, 0, sizeof(int) * size);
 961
 962         preg->program[opnd] = op;
 963
 964         preg->p += size;
 965
 966         return opnd + size;
 967 }
 968
 969 /*
 970  - regtail - set the next-pointer at the end of a node chain
 971  */
 972 static void regtail_(regex_t *preg, int p, int val, int line )
 973 {
 974         int scan;
 975         int temp;
 976         int offset;
 977
 978         /* Find last node. */
 979         scan = p;
 980         for (;;) {
 981                 temp = regnext(preg, scan);
 982                 if (temp == 0)
 983                         break;
 984                 scan = temp;
 985         }
 986
 987         if (OP(preg, scan) == BACK)
 988                 offset = scan - val;
 989         else
 990                 offset = val - scan;
 991
 992         preg->program[scan + 1] = offset;
 993 }
 994
 995 /*
 996  - regoptail - regtail on operand of first argument; nop if operandless
 997  */
 998
 999 static void regoptail(regex_t *preg, int p, int val )
1000 {
1001         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1002         if (p != 0 && OP(preg, p) == BRANCH) {
1003                 regtail(preg, OPERAND(p), val);
1004         }
1005 }
1006
1007 /*
1008  * regexec and friends
1009  */
1010
1011 /*
1012  * Forwards.
1013  */
1014 static int regtry(regex_t *preg, const char *string );
1015 static int regmatch(regex_t *preg, int prog);
1016 static int regrepeat(regex_t *preg, int p, int max);
1017
1018 /*
1019  - regexec - match a regexp against a string
1020  */
1021 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1022 {
1023         const char *s;
1024         int scan;
1025
1026         /* Be paranoid... */
1027         if (preg == NULL || preg->program == NULL || string == NULL) {
1028                 return REG_ERR_NULL_ARGUMENT;
1029         }
1030
1031         /* Check validity of program. */
1032         if (*preg->program != REG_MAGIC) {
1033                 return REG_ERR_CORRUPTED;
1034         }
1035
1036 #ifdef DEBUG
1037         fprintf(stderr, "regexec: %s\n", string);
1038         regdump(preg);
1039 #endif
1040
1041         preg->eflags = eflags;
1042         preg->pmatch = pmatch;
1043         preg->nmatch = nmatch;
1044         preg->start = string;   /* All offsets are computed from here */
1045
1046         /* Must clear out the embedded repeat counts */
1047         for (scan = OPERAND(1); scan != 0; ) {
1048                 switch (OP(preg, scan)) {
1049                 case REP:
1050                 case REPMIN:
1051                 case REPX:
1052                 case REPXMIN:
1053                         preg->program[scan + 4] = 0;
1054                         scan += 5;
1055                         break;
1056
1057                 case ANYOF:
1058                 case ANYBUT:
1059                 case EXACTLY:
1060                         scan += 2;
1061                         while (preg->program[scan++]) {
1062                         }
1063                         break;
1064
1065                 case END:
1066                         scan = 0;
1067                         break;
1068
1069                 default:
1070                         scan += 2;
1071                         break;
1072                 }
1073         }
1074
1075         /* If there is a "must appear" string, look for it. */
1076         if (preg->regmust != 0) {
1077                 s = string;
1078                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1079                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1080                                 break;
1081                         }
1082                         s++;
1083                 }
1084                 if (s == NULL)  /* Not present. */
1085                         return REG_NOMATCH;
1086         }
1087
1088         /* Mark beginning of line for ^ . */
1089         preg->regbol = string;
1090
1091         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1092         if (preg->reganch) {
1093                 if (eflags & REG_NOTBOL) {
1094                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1095                         goto nextline;
1096                 }
1097                 while (1) {
1098                         if (regtry(preg, string)) {
1099                                 return REG_NOERROR;
1100                         }
1101                         if (*string) {
1102 nextline:
1103                                 if (preg->cflags & REG_NEWLINE) {
1104                                         /* Try the next anchor? */
1105                                         string = strchr(string, '\n');
1106                                         if (string) {
1107                                                 preg->regbol = ++string;
1108                                                 continue;
1109                                         }
1110                                 }
1111                         }
1112                         return REG_NOMATCH;
1113                 }
1114         }
1115
1116         /* Messy cases:  unanchored match. */
1117         s = string;
1118         if (preg->regstart != '\0') {
1119                 /* We know what char it must start with. */
1120                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1121                         if (regtry(preg, s))
1122                                 return REG_NOERROR;
1123                         s++;
1124                 }
1125         }
1126         else
1127                 /* We don't -- general case. */
1128                 while (1) {
1129                         if (regtry(preg, s))
1130                                 return REG_NOERROR;
1131                         if (*s == '\0') {
1132                                 break;
1133                         }
1134                         else {
1135                                 int c;
1136                                 s += utf8_tounicode(s, &c);
1137                         }
1138                 }
1139
1140         /* Failure. */
1141         return REG_NOMATCH;
1142 }
1143
1144 /*
1145  - regtry - try match at specific point
1146  */
1147                         /* 0 failure, 1 success */
1148 static int regtry( regex_t *preg, const char *string )
1149 {
1150         int i;
1151
1152         preg->reginput = string;
1153
1154         for (i = 0; i < preg->nmatch; i++) {
1155                 preg->pmatch[i].rm_so = -1;
1156                 preg->pmatch[i].rm_eo = -1;
1157         }
1158         if (regmatch(preg, 1)) {
1159                 preg->pmatch[0].rm_so = string - preg->start;
1160                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1161                 return(1);
1162         } else
1163                 return(0);
1164 }
1165
1166 /**
1167  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1168  *
1169  * If 'nocase' is non-zero, does a case-insensitive match.
1170  *
1171  * Returns -1 on not found.
1172  */
1173 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1174 {
1175         const char *s = string;
1176         while (proglen && *s) {
1177                 int ch;
1178                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1179                 if (ch != *prog) {
1180                         return -1;
1181                 }
1182                 prog++;
1183                 s += n;
1184                 proglen--;
1185         }
1186         if (proglen == 0) {
1187                 return s - string;
1188         }
1189         return -1;
1190 }
1191
1192 /**
1193  * Searchs for 'c' in the range 'range'.
1194  *
1195  * Returns 1 if found, or 0 if not.
1196  */
1197 static int reg_range_find(const int *range, int c)
1198 {
1199         while (*range) {
1200                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1201                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1202                         return 1;
1203                 }
1204                 range += 2;
1205         }
1206         return 0;
1207 }
1208
1209 /**
1210  * Search for the character 'c' in the utf-8 string 'string'.
1211  *
1212  * If 'nocase' is set, the 'string' is assumed to be uppercase
1213  * and 'c' is converted to uppercase before matching.
1214  *
1215  * Returns the byte position in the string where the 'c' was found, or
1216  * NULL if not found.
1217  */
1218 static const char *str_find(const char *string, int c, int nocase)
1219 {
1220         if (nocase) {
1221                 /* The "string" should already be converted to uppercase */
1222                 c = utf8_upper(c);
1223         }
1224         while (*string) {
1225                 int ch;
1226                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1227                 if (c == ch) {
1228                         return string;
1229                 }
1230                 string += n;
1231         }
1232         return NULL;
1233 }
1234
1235 /**
1236  * Returns true if 'ch' is an end-of-line char.
1237  *
1238  * In REG_NEWLINE mode, \n is considered EOL in
1239  * addition to \0
1240  */
1241 static int reg_iseol(regex_t *preg, int ch)
1242 {
1243         if (preg->cflags & REG_NEWLINE) {
1244                 return ch == '\0' || ch == '\n';
1245         }
1246         else {
1247                 return ch == '\0';
1248         }
1249 }
1250
1251 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1252 {
1253         int nextch = '\0';
1254         const char *save;
1255         int no;
1256         int c;
1257
1258         int max = preg->program[scan + 2];
1259         int min = preg->program[scan + 3];
1260         int next = regnext(preg, scan);
1261
1262         /*
1263          * Lookahead to avoid useless match attempts
1264          * when we know what character comes next.
1265          */
1266         if (OP(preg, next) == EXACTLY) {
1267                 nextch = preg->program[OPERAND(next)];
1268         }
1269         save = preg->reginput;
1270         no = regrepeat(preg, scan + 5, max);
1271         if (no < min) {
1272                 return 0;
1273         }
1274         if (matchmin) {
1275                 /* from min up to no */
1276                 max = no;
1277                 no = min;
1278         }
1279         /* else from no down to min */
1280         while (1) {
1281                 if (matchmin) {
1282                         if (no > max) {
1283                                 break;
1284                         }
1285                 }
1286                 else {
1287                         if (no < min) {
1288                                 break;
1289                         }
1290                 }
1291                 preg->reginput = save + utf8_index(save, no);
1292                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1293                 /* If it could work, try it. */
1294                 if (reg_iseol(preg, nextch) || c == nextch) {
1295                         if (regmatch(preg, next)) {
1296                                 return(1);
1297                         }
1298                 }
1299                 if (matchmin) {
1300                         /* Couldn't or didn't, add one more */
1301                         no++;
1302                 }
1303                 else {
1304                         /* Couldn't or didn't -- back up. */
1305                         no--;
1306                 }
1307         }
1308         return(0);
1309 }
1310
1311 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1312 {
1313         int *scanpt = preg->program + scan;
1314
1315         int max = scanpt[2];
1316         int min = scanpt[3];
1317
1318         /* Have we reached min? */
1319         if (scanpt[4] < min) {
1320                 /* No, so get another one */
1321                 scanpt[4]++;
1322                 if (regmatch(preg, scan + 5)) {
1323                         return 1;
1324                 }
1325                 scanpt[4]--;
1326                 return 0;
1327         }
1328         if (scanpt[4] > max) {
1329                 return 0;
1330         }
1331
1332         if (matchmin) {
1333                 /* minimal, so try other branch first */
1334                 if (regmatch(preg, regnext(preg, scan))) {
1335                         return 1;
1336                 }
1337                 /* No, so try one more */
1338                 scanpt[4]++;
1339                 if (regmatch(preg, scan + 5)) {
1340                         return 1;
1341                 }
1342                 scanpt[4]--;
1343                 return 0;
1344         }
1345         /* maximal, so try this branch again */
1346         if (scanpt[4] < max) {
1347                 scanpt[4]++;
1348                 if (regmatch(preg, scan + 5)) {
1349                         return 1;
1350                 }
1351                 scanpt[4]--;
1352         }
1353         /* At this point we are at max with no match. Try the other branch */
1354         return regmatch(preg, regnext(preg, scan));
1355 }
1356
1357 /*
1358  - regmatch - main matching routine
1359  *
1360  * Conceptually the strategy is simple:  check to see whether the current
1361  * node matches, call self recursively to see whether the rest matches,
1362  * and then act accordingly.  In practice we make some effort to avoid
1363  * recursion, in particular by going through "ordinary" nodes (that don't
1364  * need to know whether the rest of the match failed) by a loop instead of
1365  * by recursion.
1366  */
1367 /* 0 failure, 1 success */
1368 static int regmatch(regex_t *preg, int prog)
1369 {
1370         int scan;       /* Current node. */
1371         int next;               /* Next node. */
1372
1373         scan = prog;
1374
1375 #ifdef DEBUG
1376         if (scan != 0 && regnarrate)
1377                 fprintf(stderr, "%s(\n", regprop(scan));
1378 #endif
1379         while (scan != 0) {
1380                 int n;
1381                 int c;
1382 #ifdef DEBUG
1383                 if (regnarrate) {
1384                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1385                 }
1386 #endif
1387                 next = regnext(preg, scan);
1388                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1389
1390                 switch (OP(preg, scan)) {
1391                 case BOL:
1392                         if (preg->reginput != preg->regbol)
1393                                 return(0);
1394                         break;
1395                 case EOL:
1396                         if (!reg_iseol(preg, c)) {
1397                                 return(0);
1398                         }
1399                         break;
1400                 case WORDA:
1401                         /* Must be looking at a letter, digit, or _ */
1402                         if ((!isalnum(UCHAR(c))) && c != '_')
1403                                 return(0);
1404                         /* Prev must be BOL or nonword */
1405                         if (preg->reginput > preg->regbol &&
1406                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1407                                 return(0);
1408                         break;
1409                 case WORDZ:
1410                         /* Can't match at BOL */
1411                         if (preg->reginput > preg->regbol) {
1412                                 /* Current must be EOL or nonword */
1413                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1414                                         c = preg->reginput[-1];
1415                                         /* Previous must be word */
1416                                         if (isalnum(UCHAR(c)) || c == '_') {
1417                                                 break;
1418                                         }
1419                                 }
1420                         }
1421                         /* No */
1422                         return(0);
1423
1424                 case ANY:
1425                         if (reg_iseol(preg, c))
1426                                 return 0;
1427                         preg->reginput += n;
1428                         break;
1429                 case EXACTLY: {
1430                                 int opnd;
1431                                 int len;
1432                                 int slen;
1433
1434                                 opnd = OPERAND(scan);
1435                                 len = str_int_len(preg->program + opnd);
1436
1437                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1438                                 if (slen < 0) {
1439                                         return(0);
1440                                 }
1441                                 preg->reginput += slen;
1442                         }
1443                         break;
1444                 case ANYOF:
1445                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1446                                 return(0);
1447                         }
1448                         preg->reginput += n;
1449                         break;
1450                 case ANYBUT:
1451                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1452                                 return(0);
1453                         }
1454                         preg->reginput += n;
1455                         break;
1456                 case NOTHING:
1457                         break;
1458                 case BACK:
1459                         break;
1460                 case BRANCH: {
1461                                 const char *save;
1462
1463                                 if (OP(preg, next) != BRANCH)           /* No choice. */
1464                                         next = OPERAND(scan);   /* Avoid recursion. */
1465                                 else {
1466                                         do {
1467                                                 save = preg->reginput;
1468                                                 if (regmatch(preg, OPERAND(scan))) {
1469                                                         return(1);
1470                                                 }
1471                                                 preg->reginput = save;
1472                                                 scan = regnext(preg, scan);
1473                                         } while (scan != 0 && OP(preg, scan) == BRANCH);
1474                                         return(0);
1475                                         /* NOTREACHED */
1476                                 }
1477                         }
1478                         break;
1479                 case REP:
1480                 case REPMIN:
1481                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1482
1483                 case REPX:
1484                 case REPXMIN:
1485                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1486
1487                 case END:
1488                         return(1);      /* Success! */
1489                         break;
1490
1491                 case OPENNC:
1492                 case CLOSENC:
1493                         if (regmatch(preg, next)) {
1494                                 return 1;
1495                         }
1496                         return 0;
1497
1498                 default:
1499                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1500                                 const char *save;
1501
1502                                 save = preg->reginput;
1503
1504                                 if (regmatch(preg, next)) {
1505                                         int no;
1506                                         /*
1507                                          * Don't set startp if some later
1508                                          * invocation of the same parentheses
1509                                          * already has.
1510                                          */
1511                                         if (OP(preg, scan) < CLOSE) {
1512                                                 no = OP(preg, scan) - OPEN;
1513                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1514                                                         preg->pmatch[no].rm_so = save - preg->start;
1515                                                 }
1516                                         }
1517                                         else {
1518                                                 no = OP(preg, scan) - CLOSE;
1519                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1520                                                         preg->pmatch[no].rm_eo = save - preg->start;
1521                                                 }
1522                                         }
1523                                         return(1);
1524                                 } else
1525                                         return(0);
1526                         }
1527                         return REG_ERR_INTERNAL;
1528                 }
1529
1530                 scan = next;
1531         }
1532
1533         /*
1534          * We get here only if there's trouble -- normally "case END" is
1535          * the terminating point.
1536          */
1537         return REG_ERR_INTERNAL;
1538 }
1539
1540 /*
1541  - regrepeat - repeatedly match something simple, report how many
1542  */
1543 static int regrepeat(regex_t *preg, int p, int max)
1544 {
1545         int count = 0;
1546         const char *scan;
1547         int opnd;
1548         int ch;
1549         int n;
1550
1551         scan = preg->reginput;
1552         opnd = OPERAND(p);
1553         switch (OP(preg, p)) {
1554         case ANY:
1555                 /* No need to handle utf8 specially here */
1556                 while (!reg_iseol(preg, *scan) && count < max) {
1557                         count++;
1558                         scan++;
1559                 }
1560                 break;
1561         case EXACTLY:
1562                 while (count < max) {
1563                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1564                         if (preg->program[opnd] != ch) {
1565                                 break;
1566                         }
1567                         count++;
1568                         scan += n;
1569                 }
1570                 break;
1571         case ANYOF:
1572                 while (count < max) {
1573                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1574                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1575                                 break;
1576                         }
1577                         count++;
1578                         scan += n;
1579                 }
1580                 break;
1581         case ANYBUT:
1582                 while (count < max) {
1583                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1584                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1585                                 break;
1586                         }
1587                         count++;
1588                         scan += n;
1589                 }
1590                 break;
1591         default:                /* Oh dear.  Called inappropriately. */
1592                 preg->err = REG_ERR_INTERNAL;
1593                 count = 0;      /* Best compromise. */
1594                 break;
1595         }
1596         preg->reginput = scan;
1597
1598         return(count);
1599 }
1600
1601 /*
1602  - regnext - dig the "next" pointer out of a node
1603  */
1604 static int regnext(regex_t *preg, int p )
1605 {
1606         int offset;
1607
1608         offset = NEXT(preg, p);
1609
1610         if (offset == 0)
1611                 return 0;
1612
1613         if (OP(preg, p) == BACK)
1614                 return(p-offset);
1615         else
1616                 return(p+offset);
1617 }
1618
1619 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1620
1621 /*
1622  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1623  */
1624 static void regdump(regex_t *preg)
1625 {
1626         int s;
1627         int op = EXACTLY;       /* Arbitrary non-END op. */
1628         int next;
1629         char buf[MAX_UTF8_LEN + 1];
1630
1631         int i;
1632         for (i = 1; i < preg->p; i++) {
1633                 printf("%02x ", (unsigned char)preg->program[i]);
1634                 if (i % 16 == 0) {
1635                         printf("\n");
1636                 }
1637         }
1638         printf("\n");
1639
1640         s = 1;
1641         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1642                 op = OP(preg, s);
1643                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1644                 next = regnext(preg, s);
1645                 if (next == 0)          /* Next ptr. */
1646                         printf("(0)");
1647                 else
1648                         printf("(%d)", next);
1649                 s += 2;
1650                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1651                         int max = preg->program[s];
1652                         int min = preg->program[s + 1];
1653                         if (max == 65535) {
1654                                 printf("{%d,*}", min);
1655                         }
1656                         else {
1657                                 printf("{%d,%d}", min, max);
1658                         }
1659                         printf(" %d", preg->program[s + 2]);
1660                         s += 3;
1661                 }
1662                 else if (op == ANYOF || op == ANYBUT) {
1663                         /* set of ranges */
1664
1665                         while (preg->program[s]) {
1666                                 int len = preg->program[s++];
1667                                 int first = preg->program[s++];
1668                                 buf[utf8_getchars(buf, first)] = 0;
1669                                 printf("%s", buf);
1670                                 if (len > 1) {
1671                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1672                                         printf("-%s", buf);
1673                                 }
1674                         }
1675                         s++;
1676                 }
1677                 else if (op == EXACTLY) {
1678                         /* Literal string, where present. */
1679
1680                         while (preg->program[s]) {
1681                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1682                                 printf("%s", buf);
1683                                 s++;
1684                         }
1685                         s++;
1686                 }
1687                 putchar('\n');
1688         }
1689
1690         if (op == END) {
1691                 /* Header fields of interest. */
1692                 if (preg->regstart) {
1693                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1694                         printf("start '%s' ", buf);
1695                 }
1696                 if (preg->reganch)
1697                         printf("anchored ");
1698                 if (preg->regmust != 0) {
1699                         int i;
1700                         printf("must have:");
1701                         for (i = 0; i < preg->regmlen; i++) {
1702                                 putchar(preg->program[preg->regmust + i]);
1703                         }
1704                         putchar('\n');
1705                 }
1706         }
1707         printf("\n");
1708 }
1709
1710 /*
1711  - regprop - printable representation of opcode
1712  */
1713 static const char *regprop( int op )
1714 {
1715         static char buf[50];
1716
1717         switch (op) {
1718         case BOL:
1719                 return "BOL";
1720         case EOL:
1721                 return "EOL";
1722         case ANY:
1723                 return "ANY";
1724         case ANYOF:
1725                 return "ANYOF";
1726         case ANYBUT:
1727                 return "ANYBUT";
1728         case BRANCH:
1729                 return "BRANCH";
1730         case EXACTLY:
1731                 return "EXACTLY";
1732         case NOTHING:
1733                 return "NOTHING";
1734         case BACK:
1735                 return "BACK";
1736         case END:
1737                 return "END";
1738         case REP:
1739                 return "REP";
1740         case REPMIN:
1741                 return "REPMIN";
1742         case REPX:
1743                 return "REPX";
1744         case REPXMIN:
1745                 return "REPXMIN";
1746         case WORDA:
1747                 return "WORDA";
1748         case WORDZ:
1749                 return "WORDZ";
1750         case OPENNC:
1751                 return "OPEN";
1752         case CLOSENC:
1753                 return "CLOSE";
1754         default:
1755                 if (op >= OPEN && op < CLOSE) {
1756                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1757                 }
1758                 else if (op >= CLOSE && op < CLOSE_END) {
1759                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1760                 }
1761                 else {
1762                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1763                 }
1764                 return(buf);
1765         }
1766 }
1767 #endif /* JIM_BOOTSTRAP */
1768
1769 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1770 {
1771         static const char *error_strings[] = {
1772                 "success",
1773                 "no match",
1774                 "bad pattern",
1775                 "null argument",
1776                 "unknown error",
1777                 "too big",
1778                 "out of memory",
1779                 "too many ()",
1780                 "parentheses () not balanced",
1781                 "braces {} not balanced",
1782                 "invalid repetition count(s)",
1783                 "extra characters",
1784                 "*+ of empty atom",
1785                 "nested count",
1786                 "internal error",
1787                 "count follows nothing",
1788                 "trailing backslash",
1789                 "corrupted program",
1790                 "contains null char",
1791         };
1792         const char *err;
1793
1794         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1795                 err = "Bad error code";
1796         }
1797         else {
1798                 err = error_strings[errcode];
1799         }
1800
1801         return snprintf(errbuf, errbuf_size, "%s", err);
1802 }
1803
1804 void regfree(regex_t *preg)
1805 {
1806         free(preg->program);
1807 }
1808
1809 #endif