jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54
  55 #include "jimautoconf.h"
  56
  57 #if defined(JIM_REGEXP)
  58 #include <stdio.h>
  59 #include <ctype.h>
  60 #include <stdlib.h>
  61 #include <string.h>
  62
  63 #include "jim.h"
  64 #include "jimregexp.h"
  65 #include "utf8.h"
  66
  67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  68 #define REG_MAX_PAREN 100
  69
  70 /*
  71  * Structure for regexp "program".  This is essentially a linear encoding
  72  * of a nondeterministic finite-state machine (aka syntax charts or
  73  * "railroad normal form" in parsing technology).  Each node is an opcode
  74  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  75  * all nodes except BRANCH implement concatenation; a "next" pointer with
  76  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  77  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  78  * opposed to a collection of them) is never concatenated with anything
  79  * because of operator precedence.)  The operand of some types of node is
  80  * a literal string; for others, it is a node leading into a sub-FSM.  In
  81  * particular, the operand of a BRANCH node is the first node of the branch.
  82  * (NB this is *not* a tree structure:  the tail of the branch connects
  83  * to the thing following the set of BRANCHes.)  The opcodes are:
  84  */
  85
  86 /* definition           number  opnd?   meaning */
  87 #define END     0       /* no   End of program. */
  88 #define BOL     1       /* no   Match "" at beginning of line. */
  89 #define EOL     2       /* no   Match "" at end of line. */
  90 #define ANY     3       /* no   Match any one character. */
  91 #define ANYOF   4       /* str  Match any character in this string. */
  92 #define ANYBUT  5       /* str  Match any character not in this string. */
  93 #define BRANCH  6       /* node Match this alternative, or the next... */
  94 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  95 #define EXACTLY 8       /* str  Match this string. */
  96 #define NOTHING 9       /* no   Match empty string. */
  97 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  98 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  99 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
 100 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 101
 102 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 103 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 104
 105 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 106 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 107                         /*      OPEN+1 is number 1, etc. */
 108
 109 /* must not be any other opts between OPEN and CLOSE */
 110
 111 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 112 #define CLOSE   2001    /* no   Analogous to OPEN. */
 113 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 114
 115 /*
 116  * The first word of the regexp internal "program" is actually this magic
 117  * number; the start node begins in the second word.
 118  */
 119 #define REG_MAGIC       0xFADED00D
 120
 121 /*
 122  * Opcode notes:
 123  *
 124  * BRANCH       The set of branches constituting a single choice are hooked
 125  *              together with their "next" pointers, since precedence prevents
 126  *              anything being concatenated to any individual branch.  The
 127  *              "next" pointer of the last BRANCH in a choice points to the
 128  *              thing following the whole choice.  This is also where the
 129  *              final "next" pointer of each individual branch points; each
 130  *              branch starts with the operand node of a BRANCH node.
 131  *
 132  * BACK         Normal "next" pointers all implicitly point forward; BACK
 133  *              exists to make loop structures possible.
 134  *
 135  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 136  *              as either simple repeats (REP) or complex repeats (REPX).
 137  *              These opcodes include a "min" and "max" count after the opcode.
 138  *              This is followed by a fourth "current count" word that is
 139  *              only used by REPX, as it implements a recursive match.
 140  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 141  *
 142  * OPEN,CLOSE   ...are numbered at compile time.
 143  */
 144
 145 /*
 146  * A node is one word of opcode followed by one word of "next" pointer.
 147  * The "next" pointer value is a positive offset from the opcode of the node
 148  * containing it.
 149  * An operand, if any, simply follows the node.  (Note that much of the
 150  * code generation knows about this implicit relationship.)
 151  */
 152 #define OP(preg, p)     (preg->program[p])
 153 #define NEXT(preg, p)   (preg->program[p + 1])
 154 #define OPERAND(p)      ((p) + 2)
 155
 156 /*
 157  * See regmagic.h for one further detail of program structure.
 158  */
 159
 160
 161 /*
 162  * Utility definitions.
 163  */
 164
 165 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 166 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 167 #define META            "^$.[()|?{+*"
 168
 169 /*
 170  * Flags to be passed up and down.
 171  */
 172 #define HASWIDTH        1       /* Known never to match null string. */
 173 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 174 #define SPSTART         4       /* Starts with * or +. */
 175 #define WORST           0       /* Worst case. */
 176
 177 #define MAX_REP_COUNT 1000000
 178
 179 /*
 180  * Forward declarations for regcomp()'s friends.
 181  */
 182 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 183 static int regpiece(regex_t *preg, int *flagp );
 184 static int regbranch(regex_t *preg, int *flagp );
 185 static int regatom(regex_t *preg, int *flagp );
 186 static int regnode(regex_t *preg, int op );
 187 static int regnext(regex_t *preg, int p );
 188 static void regc(regex_t *preg, int b );
 189 static int reginsert(regex_t *preg, int op, int size, int opnd );
 190 static void regtail(regex_t *preg, int p, int val);
 191 static void regoptail(regex_t *preg, int p, int val );
 192 static int regopsize(regex_t *preg, int p );
 193
 194 static int reg_range_find(const int *string, int c);
 195 static const char *str_find(const char *string, int c, int nocase);
 196 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 197
 198 /*#define DEBUG*/
 199 #ifdef DEBUG
 200 static int regnarrate = 0;
 201 static void regdump(regex_t *preg);
 202 static const char *regprop( int op );
 203 #endif
 204
 205
 206 /**
 207  * Returns the length of the null-terminated integer sequence.
 208  */
 209 static int str_int_len(const int *seq)
 210 {
 211         int n = 0;
 212         while (*seq++) {
 213                 n++;
 214         }
 215         return n;
 216 }
 217
 218 /*
 219  - regcomp - compile a regular expression into internal code
 220  *
 221  * We can't allocate space until we know how big the compiled form will be,
 222  * but we can't compile it (and thus know how big it is) until we've got a
 223  * place to put the code.  So we cheat:  we compile it twice, once with code
 224  * generation turned off and size counting turned on, and once "for real".
 225  * This also means that we don't allocate space until we are sure that the
 226  * thing really will compile successfully, and we never have to move the
 227  * code and thus invalidate pointers into it.  (Note that it has to be in
 228  * one piece because free() must be able to free it all.)
 229  *
 230  * Beware that the optimization-preparation code in here knows about some
 231  * of the structure of the compiled regexp.
 232  */
 233 int regcomp(regex_t *preg, const char *exp, int cflags)
 234 {
 235         int scan;
 236         int longest;
 237         unsigned len;
 238         int flags;
 239
 240 #ifdef DEBUG
 241         fprintf(stderr, "Compiling: '%s'\n", exp);
 242 #endif
 243         memset(preg, 0, sizeof(*preg));
 244
 245         if (exp == NULL)
 246                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 247
 248         /* First pass: determine size, legality. */
 249         preg->cflags = cflags;
 250         preg->regparse = exp;
 251
 252         /* Allocate space. */
 253         preg->proglen = (strlen(exp) + 1) * 5;
 254         preg->program = malloc(preg->proglen * sizeof(int));
 255         if (preg->program == NULL)
 256                 FAIL(preg, REG_ERR_NOMEM);
 257
 258         /* Note that since we store a magic value as the first item in the program,
 259          * program offsets will never be 0
 260          */
 261         regc(preg, REG_MAGIC);
 262         if (reg(preg, 0, &flags) == 0) {
 263                 return preg->err;
 264         }
 265
 266         /* Small enough for pointer-storage convention? */
 267         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 268                 FAIL(preg,REG_ERR_TOO_BIG);
 269
 270         /* Dig out information for optimizations. */
 271         preg->regstart = 0;     /* Worst-case defaults. */
 272         preg->reganch = 0;
 273         preg->regmust = 0;
 274         preg->regmlen = 0;
 275         scan = 1;                       /* First BRANCH. */
 276         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 277                 scan = OPERAND(scan);
 278
 279                 /* Starting-point info. */
 280                 if (OP(preg, scan) == EXACTLY) {
 281                         preg->regstart = preg->program[OPERAND(scan)];
 282                 }
 283                 else if (OP(preg, scan) == BOL)
 284                         preg->reganch++;
 285
 286                 /*
 287                  * If there's something expensive in the r.e., find the
 288                  * longest literal string that must appear and make it the
 289                  * regmust.  Resolve ties in favor of later strings, since
 290                  * the regstart check works with the beginning of the r.e.
 291                  * and avoiding duplication strengthens checking.  Not a
 292                  * strong reason, but sufficient in the absence of others.
 293                  */
 294                 if (flags&SPSTART) {
 295                         longest = 0;
 296                         len = 0;
 297                         for (; scan != 0; scan = regnext(preg, scan)) {
 298                                 if (OP(preg, scan) == EXACTLY) {
 299                                         int plen = str_int_len(preg->program + OPERAND(scan));
 300                                         if (plen >= len) {
 301                                                 longest = OPERAND(scan);
 302                                                 len = plen;
 303                                         }
 304                                 }
 305                         }
 306                         preg->regmust = longest;
 307                         preg->regmlen = len;
 308                 }
 309         }
 310
 311 #ifdef DEBUG
 312         regdump(preg);
 313 #endif
 314
 315         return 0;
 316 }
 317
 318 /*
 319  - reg - regular expression, i.e. main body or parenthesized thing
 320  *
 321  * Caller must absorb opening parenthesis.
 322  *
 323  * Combining parenthesis handling with the base level of regular expression
 324  * is a trifle forced, but the need to tie the tails of the branches to what
 325  * follows makes it hard to avoid.
 326  */
 327 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 328 {
 329         int ret;
 330         int br;
 331         int ender;
 332         int parno = 0;
 333         int flags;
 334
 335         *flagp = HASWIDTH;      /* Tentatively. */
 336
 337         /* Make an OPEN node, if parenthesized. */
 338         if (paren) {
 339                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 340                         /* non-capturing paren */
 341                         preg->regparse += 2;
 342                         parno = -1;
 343                 }
 344                 else {
 345                         parno = ++preg->re_nsub;
 346                 }
 347                 ret = regnode(preg, OPEN+parno);
 348         } else
 349                 ret = 0;
 350
 351         /* Pick up the branches, linking them together. */
 352         br = regbranch(preg, &flags);
 353         if (br == 0)
 354                 return 0;
 355         if (ret != 0)
 356                 regtail(preg, ret, br); /* OPEN -> first. */
 357         else
 358                 ret = br;
 359         if (!(flags&HASWIDTH))
 360                 *flagp &= ~HASWIDTH;
 361         *flagp |= flags&SPSTART;
 362         while (*preg->regparse == '|') {
 363                 preg->regparse++;
 364                 br = regbranch(preg, &flags);
 365                 if (br == 0)
 366                         return 0;
 367                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 368                 if (!(flags&HASWIDTH))
 369                         *flagp &= ~HASWIDTH;
 370                 *flagp |= flags&SPSTART;
 371         }
 372
 373         /* Make a closing node, and hook it on the end. */
 374         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 375         regtail(preg, ret, ender);
 376
 377         /* Hook the tails of the branches to the closing node. */
 378         for (br = ret; br != 0; br = regnext(preg, br))
 379                 regoptail(preg, br, ender);
 380
 381         /* Check for proper termination. */
 382         if (paren && *preg->regparse++ != ')') {
 383                 preg->err = REG_ERR_UNMATCHED_PAREN;
 384                 return 0;
 385         } else if (!paren && *preg->regparse != '\0') {
 386                 if (*preg->regparse == ')') {
 387                         preg->err = REG_ERR_UNMATCHED_PAREN;
 388                         return 0;
 389                 } else {
 390                         preg->err = REG_ERR_JUNK_ON_END;
 391                         return 0;
 392                 }
 393         }
 394
 395         return(ret);
 396 }
 397
 398 /*
 399  - regbranch - one alternative of an | operator
 400  *
 401  * Implements the concatenation operator.
 402  */
 403 static int regbranch(regex_t *preg, int *flagp )
 404 {
 405         int ret;
 406         int chain;
 407         int latest;
 408         int flags;
 409
 410         *flagp = WORST;         /* Tentatively. */
 411
 412         ret = regnode(preg, BRANCH);
 413         chain = 0;
 414         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 415                *preg->regparse != '|') {
 416                 latest = regpiece(preg, &flags);
 417                 if (latest == 0)
 418                         return 0;
 419                 *flagp |= flags&HASWIDTH;
 420                 if (chain == 0) {/* First piece. */
 421                         *flagp |= flags&SPSTART;
 422                 }
 423                 else {
 424                         regtail(preg, chain, latest);
 425                 }
 426                 chain = latest;
 427         }
 428         if (chain == 0) /* Loop ran zero times. */
 429                 (void) regnode(preg, NOTHING);
 430
 431         return(ret);
 432 }
 433
 434 /*
 435  - regpiece - something followed by possible [*+?]
 436  *
 437  * Note that the branching code sequences used for ? and the general cases
 438  * of * and + are somewhat optimized:  they use the same NOTHING node as
 439  * both the endmarker for their branch list and the body of the last branch.
 440  * It might seem that this node could be dispensed with entirely, but the
 441  * endmarker role is not redundant.
 442  */
 443 static int regpiece(regex_t *preg, int *flagp)
 444 {
 445         int ret;
 446         char op;
 447         int next;
 448         int flags;
 449         int min;
 450         int max;
 451
 452         ret = regatom(preg, &flags);
 453         if (ret == 0)
 454                 return 0;
 455
 456         op = *preg->regparse;
 457         if (!ISMULT(op)) {
 458                 *flagp = flags;
 459                 return(ret);
 460         }
 461
 462         if (!(flags&HASWIDTH) && op != '?') {
 463                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 464                 return 0;
 465         }
 466
 467         /* Handle braces (counted repetition) by expansion */
 468         if (op == '{') {
 469                 char *end;
 470
 471                 min = strtoul(preg->regparse + 1, &end, 10);
 472                 if (end == preg->regparse + 1) {
 473                         preg->err = REG_ERR_BAD_COUNT;
 474                         return 0;
 475                 }
 476                 if (*end == '}') {
 477                         max = min;
 478                 }
 479                 else {
 480                         preg->regparse = end;
 481                         max = strtoul(preg->regparse + 1, &end, 10);
 482                         if (*end != '}') {
 483                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 484                                 return 0;
 485                         }
 486                 }
 487                 if (end == preg->regparse + 1) {
 488                         max = MAX_REP_COUNT;
 489                 }
 490                 else if (max < min || max >= 100) {
 491                         preg->err = REG_ERR_BAD_COUNT;
 492                         return 0;
 493                 }
 494                 if (min >= 100) {
 495                         preg->err = REG_ERR_BAD_COUNT;
 496                         return 0;
 497                 }
 498
 499                 preg->regparse = strchr(preg->regparse, '}');
 500         }
 501         else {
 502                 min = (op == '+');
 503                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 504         }
 505
 506         if (preg->regparse[1] == '?') {
 507                 preg->regparse++;
 508                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 509         }
 510         else {
 511                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 512         }
 513         preg->program[ret + 2] = max;
 514         preg->program[ret + 3] = min;
 515         preg->program[ret + 4] = 0;
 516
 517         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 518
 519         if (!(flags & SIMPLE)) {
 520                 int back = regnode(preg, BACK);
 521                 regtail(preg, back, ret);
 522                 regtail(preg, next, back);
 523         }
 524
 525         preg->regparse++;
 526         if (ISMULT(*preg->regparse)) {
 527                 preg->err = REG_ERR_NESTED_COUNT;
 528                 return 0;
 529         }
 530
 531         return ret;
 532 }
 533
 534 /**
 535  * Add all characters in the inclusive range between lower and upper.
 536  *
 537  * Handles a swapped range (upper < lower).
 538  */
 539 static void reg_addrange(regex_t *preg, int lower, int upper)
 540 {
 541         if (lower > upper) {
 542                 reg_addrange(preg, upper, lower);
 543         }
 544         /* Add a range as length, start */
 545         regc(preg, upper - lower + 1);
 546         regc(preg, lower);
 547 }
 548
 549 /**
 550  * Add a null-terminated literal string as a set of ranges.
 551  */
 552 static void reg_addrange_str(regex_t *preg, const char *str)
 553 {
 554         while (*str) {
 555                 reg_addrange(preg, *str, *str);
 556                 str++;
 557         }
 558 }
 559
 560 /**
 561  * Extracts the next unicode char from utf8.
 562  *
 563  * If 'upper' is set, converts the char to uppercase.
 564  */
 565 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 566 {
 567         int l = utf8_tounicode(s, uc);
 568         if (upper) {
 569                 *uc = utf8_upper(*uc);
 570         }
 571         return l;
 572 }
 573
 574 /**
 575  * Converts a hex digit to decimal.
 576  *
 577  * Returns -1 for an invalid hex digit.
 578  */
 579 static int hexdigitval(int c)
 580 {
 581         if (c >= '0' && c <= '9')
 582                 return c - '0';
 583         if (c >= 'a' && c <= 'f')
 584                 return c - 'a' + 10;
 585         if (c >= 'A' && c <= 'F')
 586                 return c - 'A' + 10;
 587         return -1;
 588 }
 589
 590 /**
 591  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 592  *
 593  * Returns the number of hex digits parsed.
 594  * If there are no hex digits, returns 0 and stores nothing.
 595  */
 596 static int parse_hex(const char *s, int n, int *uc)
 597 {
 598         int val = 0;
 599         int k;
 600
 601         for (k = 0; k < n; k++) {
 602                 int c = hexdigitval(*s++);
 603                 if (c == -1) {
 604                         break;
 605                 }
 606                 val = (val << 4) | c;
 607         }
 608         if (k) {
 609                 *uc = val;
 610         }
 611         return k;
 612 }
 613
 614 /**
 615  * Call for chars after a backlash to decode the escape sequence.
 616  *
 617  * Stores the result in *ch.
 618  *
 619  * Returns the number of bytes consumed.
 620  */
 621 static int reg_decode_escape(const char *s, int *ch)
 622 {
 623         int n;
 624         const char *s0 = s;
 625
 626         *ch = *s++;
 627
 628         switch (*ch) {
 629                 case 'b': *ch = '\b'; break;
 630                 case 'e': *ch = 27; break;
 631                 case 'f': *ch = '\f'; break;
 632                 case 'n': *ch = '\n'; break;
 633                 case 'r': *ch = '\r'; break;
 634                 case 't': *ch = '\t'; break;
 635                 case 'v': *ch = '\v'; break;
 636                 case 'u':
 637                         if (*s == '{') {
 638                                 /* Expect \u{NNNN} */
 639                                 n = parse_hex(s + 1, 6, ch);
 640                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 641                                         s += n + 2;
 642                                 }
 643                                 else {
 644                                         /* Invalid, so just treat as an escaped 'u' */
 645                                         *ch = 'u';
 646                                 }
 647                         }
 648                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 649                                 s += n;
 650                         }
 651                         break;
 652                 case 'U':
 653                         if ((n = parse_hex(s, 8, ch)) > 0) {
 654                                 s += n;
 655                         }
 656                         break;
 657                 case 'x':
 658                         if ((n = parse_hex(s, 2, ch)) > 0) {
 659                                 s += n;
 660                         }
 661                         break;
 662                 case '\0':
 663                         s--;
 664                         *ch = '\\';
 665                         break;
 666         }
 667         return s - s0;
 668 }
 669
 670 /*
 671  - regatom - the lowest level
 672  *
 673  * Optimization:  gobbles an entire sequence of ordinary characters so that
 674  * it can turn them into a single node, which is smaller to store and
 675  * faster to run.  Backslashed characters are exceptions, each becoming a
 676  * separate node; the code is simpler that way and it's not worth fixing.
 677  */
 678 static int regatom(regex_t *preg, int *flagp)
 679 {
 680         int ret;
 681         int flags;
 682         int nocase = (preg->cflags & REG_ICASE);
 683
 684         int ch;
 685         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 686
 687         *flagp = WORST;         /* Tentatively. */
 688
 689         preg->regparse += n;
 690         switch (ch) {
 691         /* FIXME: these chars only have meaning at beg/end of pat? */
 692         case '^':
 693                 ret = regnode(preg, BOL);
 694                 break;
 695         case '$':
 696                 ret = regnode(preg, EOL);
 697                 break;
 698         case '.':
 699                 ret = regnode(preg, ANY);
 700                 *flagp |= HASWIDTH|SIMPLE;
 701                 break;
 702         case '[': {
 703                         const char *pattern = preg->regparse;
 704
 705                         if (*pattern == '^') {  /* Complement of range. */
 706                                 ret = regnode(preg, ANYBUT);
 707                                 pattern++;
 708                         } else
 709                                 ret = regnode(preg, ANYOF);
 710
 711                         /* Special case. If the first char is ']' or '-', it is part of the set */
 712                         if (*pattern == ']' || *pattern == '-') {
 713                                 reg_addrange(preg, *pattern, *pattern);
 714                                 pattern++;
 715                         }
 716
 717                         while (*pattern && *pattern != ']') {
 718                                 /* Is this a range? a-z */
 719                                 int start;
 720                                 int end;
 721
 722                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 723                                 if (start == '\\') {
 724                                         pattern += reg_decode_escape(pattern, &start);
 725                                         if (start == 0) {
 726                                                 preg->err = REG_ERR_NULL_CHAR;
 727                                                 return 0;
 728                                         }
 729                                 }
 730                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 731                                         /* skip '-' */
 732                                         pattern += utf8_tounicode(pattern, &end);
 733                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 734                                         if (end == '\\') {
 735                                                 pattern += reg_decode_escape(pattern, &end);
 736                                                 if (end == 0) {
 737                                                         preg->err = REG_ERR_NULL_CHAR;
 738                                                         return 0;
 739                                                 }
 740                                         }
 741
 742                                         reg_addrange(preg, start, end);
 743                                         continue;
 744                                 }
 745                                 if (start == '[') {
 746                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 747                                                 if ((preg->cflags & REG_ICASE) == 0) {
 748                                                         reg_addrange(preg, 'a', 'z');
 749                                                 }
 750                                                 reg_addrange(preg, 'A', 'Z');
 751                                                 pattern += 8;
 752                                                 continue;
 753                                         }
 754                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 755                                                 if ((preg->cflags & REG_ICASE) == 0) {
 756                                                         reg_addrange(preg, 'a', 'z');
 757                                                 }
 758                                                 reg_addrange(preg, 'A', 'Z');
 759                                                 reg_addrange(preg, '0', '9');
 760                                                 pattern += 8;
 761                                                 continue;
 762                                         }
 763                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 764                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 765                                                 pattern += 8;
 766                                                 continue;
 767                                         }
 768                                 }
 769                                 /* Not a range, so just add the char */
 770                                 reg_addrange(preg, start, start);
 771                         }
 772                         regc(preg, '\0');
 773
 774                         if (*pattern) {
 775                                 pattern++;
 776                         }
 777                         preg->regparse = pattern;
 778
 779                         *flagp |= HASWIDTH|SIMPLE;
 780                 }
 781                 break;
 782         case '(':
 783                 ret = reg(preg, 1, &flags);
 784                 if (ret == 0)
 785                         return 0;
 786                 *flagp |= flags&(HASWIDTH|SPSTART);
 787                 break;
 788         case '\0':
 789         case '|':
 790         case ')':
 791                 preg->err = REG_ERR_INTERNAL;
 792                 return 0;       /* Supposed to be caught earlier. */
 793         case '?':
 794         case '+':
 795         case '*':
 796         case '{':
 797                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 798                 return 0;
 799         case '\\':
 800                 switch (*preg->regparse++) {
 801                 case '\0':
 802                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 803                         return 0;
 804                 case '<':
 805                 case 'm':
 806                         ret = regnode(preg, WORDA);
 807                         break;
 808                 case '>':
 809                 case 'M':
 810                         ret = regnode(preg, WORDZ);
 811                         break;
 812                 case 'd':
 813                         ret = regnode(preg, ANYOF);
 814                         reg_addrange(preg, '0', '9');
 815                         regc(preg, '\0');
 816                         *flagp |= HASWIDTH|SIMPLE;
 817                         break;
 818                 case 'w':
 819                         ret = regnode(preg, ANYOF);
 820                         if ((preg->cflags & REG_ICASE) == 0) {
 821                                 reg_addrange(preg, 'a', 'z');
 822                         }
 823                         reg_addrange(preg, 'A', 'Z');
 824                         reg_addrange(preg, '0', '9');
 825                         reg_addrange(preg, '_', '_');
 826                         regc(preg, '\0');
 827                         *flagp |= HASWIDTH|SIMPLE;
 828                         break;
 829                 case 's':
 830                         ret = regnode(preg, ANYOF);
 831                         reg_addrange_str(preg," \t\r\n\f\v");
 832                         regc(preg, '\0');
 833                         *flagp |= HASWIDTH|SIMPLE;
 834                         break;
 835                 /* FIXME: Someday handle \1, \2, ... */
 836                 default:
 837                         /* Handle general quoted chars in exact-match routine */
 838                         /* Back up to include the backslash */
 839                         preg->regparse--;
 840                         goto de_fault;
 841                 }
 842                 break;
 843         de_fault:
 844         default: {
 845                         /*
 846                          * Encode a string of characters to be matched exactly.
 847                          */
 848                         int added = 0;
 849
 850                         /* Back up to pick up the first char of interest */
 851                         preg->regparse -= n;
 852
 853                         ret = regnode(preg, EXACTLY);
 854
 855                         /* Note that a META operator such as ? or * consumes the
 856                          * preceding char.
 857                          * Thus we must be careful to look ahead by 2 and add the
 858                          * last char as it's own EXACTLY if necessary
 859                          */
 860
 861                         /* Until end of string or a META char is reached */
 862                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 863                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 864                                 if (ch == '\\' && preg->regparse[n]) {
 865                                         /* Non-trailing backslash.
 866                                          * Is this a special escape, or a regular escape?
 867                                          */
 868                                         if (strchr("<>mMwds", preg->regparse[n])) {
 869                                                 /* A special escape. All done with EXACTLY */
 870                                                 break;
 871                                         }
 872                                         /* Decode it. Note that we add the length for the escape
 873                                          * sequence to the length for the backlash so we can skip
 874                                          * the entire sequence, or not as required.
 875                                          */
 876                                         n += reg_decode_escape(preg->regparse + n, &ch);
 877                                         if (ch == 0) {
 878                                                 preg->err = REG_ERR_NULL_CHAR;
 879                                                 return 0;
 880                                         }
 881                                 }
 882
 883                                 /* Now we have one char 'ch' of length 'n'.
 884                                  * Check to see if the following char is a MULT
 885                                  */
 886
 887                                 if (ISMULT(preg->regparse[n])) {
 888                                         /* Yes. But do we already have some EXACTLY chars? */
 889                                         if (added) {
 890                                                 /* Yes, so return what we have and pick up the current char next time around */
 891                                                 break;
 892                                         }
 893                                         /* No, so add this single char and finish */
 894                                         regc(preg, ch);
 895                                         added++;
 896                                         preg->regparse += n;
 897                                         break;
 898                                 }
 899
 900                                 /* No, so just add this char normally */
 901                                 regc(preg, ch);
 902                                 added++;
 903                                 preg->regparse += n;
 904                         }
 905                         regc(preg, '\0');
 906
 907                         *flagp |= HASWIDTH;
 908                         if (added == 1)
 909                                 *flagp |= SIMPLE;
 910                         break;
 911                 }
 912                 break;
 913         }
 914
 915         return(ret);
 916 }
 917
 918 static void reg_grow(regex_t *preg, int n)
 919 {
 920         if (preg->p + n >= preg->proglen) {
 921                 preg->proglen = (preg->p + n) * 2;
 922                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 923         }
 924 }
 925
 926 /*
 927  - regnode - emit a node
 928  */
 929 /* Location. */
 930 static int regnode(regex_t *preg, int op)
 931 {
 932         reg_grow(preg, 2);
 933
 934         /* The OP followed by a next pointer */
 935         preg->program[preg->p++] = op;
 936         preg->program[preg->p++] = 0;
 937
 938         /* Return the start of the node */
 939         return preg->p - 2;
 940 }
 941
 942 /*
 943  - regc - emit (if appropriate) a byte of code
 944  */
 945 static void regc(regex_t *preg, int b )
 946 {
 947         reg_grow(preg, 1);
 948         preg->program[preg->p++] = b;
 949 }
 950
 951 /*
 952  - reginsert - insert an operator in front of already-emitted operand
 953  *
 954  * Means relocating the operand.
 955  * Returns the new location of the original operand.
 956  */
 957 static int reginsert(regex_t *preg, int op, int size, int opnd )
 958 {
 959         reg_grow(preg, size);
 960
 961         /* Move everything from opnd up */
 962         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
 963         /* Zero out the new space */
 964         memset(preg->program + opnd, 0, sizeof(int) * size);
 965
 966         preg->program[opnd] = op;
 967
 968         preg->p += size;
 969
 970         return opnd + size;
 971 }
 972
 973 /*
 974  - regtail - set the next-pointer at the end of a node chain
 975  */
 976 static void regtail(regex_t *preg, int p, int val)
 977 {
 978         int scan;
 979         int temp;
 980         int offset;
 981
 982         /* Find last node. */
 983         scan = p;
 984         for (;;) {
 985                 temp = regnext(preg, scan);
 986                 if (temp == 0)
 987                         break;
 988                 scan = temp;
 989         }
 990
 991         if (OP(preg, scan) == BACK)
 992                 offset = scan - val;
 993         else
 994                 offset = val - scan;
 995
 996         preg->program[scan + 1] = offset;
 997 }
 998
 999 /*
1000  - regoptail - regtail on operand of first argument; nop if operandless
1001  */
1002
1003 static void regoptail(regex_t *preg, int p, int val )
1004 {
1005         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1006         if (p != 0 && OP(preg, p) == BRANCH) {
1007                 regtail(preg, OPERAND(p), val);
1008         }
1009 }
1010
1011 /*
1012  * regexec and friends
1013  */
1014
1015 /*
1016  * Forwards.
1017  */
1018 static int regtry(regex_t *preg, const char *string );
1019 static int regmatch(regex_t *preg, int prog);
1020 static int regrepeat(regex_t *preg, int p, int max);
1021
1022 /*
1023  - regexec - match a regexp against a string
1024  */
1025 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1026 {
1027         const char *s;
1028         int scan;
1029
1030         /* Be paranoid... */
1031         if (preg == NULL || preg->program == NULL || string == NULL) {
1032                 return REG_ERR_NULL_ARGUMENT;
1033         }
1034
1035         /* Check validity of program. */
1036         if (*preg->program != REG_MAGIC) {
1037                 return REG_ERR_CORRUPTED;
1038         }
1039
1040 #ifdef DEBUG
1041         fprintf(stderr, "regexec: %s\n", string);
1042         regdump(preg);
1043 #endif
1044
1045         preg->eflags = eflags;
1046         preg->pmatch = pmatch;
1047         preg->nmatch = nmatch;
1048         preg->start = string;   /* All offsets are computed from here */
1049
1050         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1051         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1052                 int op = OP(preg, scan);
1053                 if (op == END)
1054                         break;
1055                 if (op == REPX || op == REPXMIN)
1056                         preg->program[scan + 4] = 0;
1057         }
1058
1059         /* If there is a "must appear" string, look for it. */
1060         if (preg->regmust != 0) {
1061                 s = string;
1062                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1063                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1064                                 break;
1065                         }
1066                         s++;
1067                 }
1068                 if (s == NULL)  /* Not present. */
1069                         return REG_NOMATCH;
1070         }
1071
1072         /* Mark beginning of line for ^ . */
1073         preg->regbol = string;
1074
1075         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1076         if (preg->reganch) {
1077                 if (eflags & REG_NOTBOL) {
1078                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1079                         goto nextline;
1080                 }
1081                 while (1) {
1082                         if (regtry(preg, string)) {
1083                                 return REG_NOERROR;
1084                         }
1085                         if (*string) {
1086 nextline:
1087                                 if (preg->cflags & REG_NEWLINE) {
1088                                         /* Try the next anchor? */
1089                                         string = strchr(string, '\n');
1090                                         if (string) {
1091                                                 preg->regbol = ++string;
1092                                                 continue;
1093                                         }
1094                                 }
1095                         }
1096                         return REG_NOMATCH;
1097                 }
1098         }
1099
1100         /* Messy cases:  unanchored match. */
1101         s = string;
1102         if (preg->regstart != '\0') {
1103                 /* We know what char it must start with. */
1104                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1105                         if (regtry(preg, s))
1106                                 return REG_NOERROR;
1107                         s++;
1108                 }
1109         }
1110         else
1111                 /* We don't -- general case. */
1112                 while (1) {
1113                         if (regtry(preg, s))
1114                                 return REG_NOERROR;
1115                         if (*s == '\0') {
1116                                 break;
1117                         }
1118                         else {
1119                                 int c;
1120                                 s += utf8_tounicode(s, &c);
1121                         }
1122                 }
1123
1124         /* Failure. */
1125         return REG_NOMATCH;
1126 }
1127
1128 /*
1129  - regtry - try match at specific point
1130  */
1131                         /* 0 failure, 1 success */
1132 static int regtry( regex_t *preg, const char *string )
1133 {
1134         int i;
1135
1136         preg->reginput = string;
1137
1138         for (i = 0; i < preg->nmatch; i++) {
1139                 preg->pmatch[i].rm_so = -1;
1140                 preg->pmatch[i].rm_eo = -1;
1141         }
1142         if (regmatch(preg, 1)) {
1143                 preg->pmatch[0].rm_so = string - preg->start;
1144                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1145                 return(1);
1146         } else
1147                 return(0);
1148 }
1149
1150 /**
1151  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1152  *
1153  * If 'nocase' is non-zero, does a case-insensitive match.
1154  *
1155  * Returns -1 on not found.
1156  */
1157 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1158 {
1159         const char *s = string;
1160         while (proglen && *s) {
1161                 int ch;
1162                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1163                 if (ch != *prog) {
1164                         return -1;
1165                 }
1166                 prog++;
1167                 s += n;
1168                 proglen--;
1169         }
1170         if (proglen == 0) {
1171                 return s - string;
1172         }
1173         return -1;
1174 }
1175
1176 /**
1177  * Searchs for 'c' in the range 'range'.
1178  *
1179  * Returns 1 if found, or 0 if not.
1180  */
1181 static int reg_range_find(const int *range, int c)
1182 {
1183         while (*range) {
1184                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1185                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1186                         return 1;
1187                 }
1188                 range += 2;
1189         }
1190         return 0;
1191 }
1192
1193 /**
1194  * Search for the character 'c' in the utf-8 string 'string'.
1195  *
1196  * If 'nocase' is set, the 'string' is assumed to be uppercase
1197  * and 'c' is converted to uppercase before matching.
1198  *
1199  * Returns the byte position in the string where the 'c' was found, or
1200  * NULL if not found.
1201  */
1202 static const char *str_find(const char *string, int c, int nocase)
1203 {
1204         if (nocase) {
1205                 /* The "string" should already be converted to uppercase */
1206                 c = utf8_upper(c);
1207         }
1208         while (*string) {
1209                 int ch;
1210                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1211                 if (c == ch) {
1212                         return string;
1213                 }
1214                 string += n;
1215         }
1216         return NULL;
1217 }
1218
1219 /**
1220  * Returns true if 'ch' is an end-of-line char.
1221  *
1222  * In REG_NEWLINE mode, \n is considered EOL in
1223  * addition to \0
1224  */
1225 static int reg_iseol(regex_t *preg, int ch)
1226 {
1227         if (preg->cflags & REG_NEWLINE) {
1228                 return ch == '\0' || ch == '\n';
1229         }
1230         else {
1231                 return ch == '\0';
1232         }
1233 }
1234
1235 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1236 {
1237         int nextch = '\0';
1238         const char *save;
1239         int no;
1240         int c;
1241
1242         int max = preg->program[scan + 2];
1243         int min = preg->program[scan + 3];
1244         int next = regnext(preg, scan);
1245
1246         /*
1247          * Lookahead to avoid useless match attempts
1248          * when we know what character comes next.
1249          */
1250         if (OP(preg, next) == EXACTLY) {
1251                 nextch = preg->program[OPERAND(next)];
1252         }
1253         save = preg->reginput;
1254         no = regrepeat(preg, scan + 5, max);
1255         if (no < min) {
1256                 return 0;
1257         }
1258         if (matchmin) {
1259                 /* from min up to no */
1260                 max = no;
1261                 no = min;
1262         }
1263         /* else from no down to min */
1264         while (1) {
1265                 if (matchmin) {
1266                         if (no > max) {
1267                                 break;
1268                         }
1269                 }
1270                 else {
1271                         if (no < min) {
1272                                 break;
1273                         }
1274                 }
1275                 preg->reginput = save + utf8_index(save, no);
1276                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1277                 /* If it could work, try it. */
1278                 if (reg_iseol(preg, nextch) || c == nextch) {
1279                         if (regmatch(preg, next)) {
1280                                 return(1);
1281                         }
1282                 }
1283                 if (matchmin) {
1284                         /* Couldn't or didn't, add one more */
1285                         no++;
1286                 }
1287                 else {
1288                         /* Couldn't or didn't -- back up. */
1289                         no--;
1290                 }
1291         }
1292         return(0);
1293 }
1294
1295 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1296 {
1297         int *scanpt = preg->program + scan;
1298
1299         int max = scanpt[2];
1300         int min = scanpt[3];
1301
1302         /* Have we reached min? */
1303         if (scanpt[4] < min) {
1304                 /* No, so get another one */
1305                 scanpt[4]++;
1306                 if (regmatch(preg, scan + 5)) {
1307                         return 1;
1308                 }
1309                 scanpt[4]--;
1310                 return 0;
1311         }
1312         if (scanpt[4] > max) {
1313                 return 0;
1314         }
1315
1316         if (matchmin) {
1317                 /* minimal, so try other branch first */
1318                 if (regmatch(preg, regnext(preg, scan))) {
1319                         return 1;
1320                 }
1321                 /* No, so try one more */
1322                 scanpt[4]++;
1323                 if (regmatch(preg, scan + 5)) {
1324                         return 1;
1325                 }
1326                 scanpt[4]--;
1327                 return 0;
1328         }
1329         /* maximal, so try this branch again */
1330         if (scanpt[4] < max) {
1331                 scanpt[4]++;
1332                 if (regmatch(preg, scan + 5)) {
1333                         return 1;
1334                 }
1335                 scanpt[4]--;
1336         }
1337         /* At this point we are at max with no match. Try the other branch */
1338         return regmatch(preg, regnext(preg, scan));
1339 }
1340
1341 /*
1342  - regmatch - main matching routine
1343  *
1344  * Conceptually the strategy is simple:  check to see whether the current
1345  * node matches, call self recursively to see whether the rest matches,
1346  * and then act accordingly.  In practice we make some effort to avoid
1347  * recursion, in particular by going through "ordinary" nodes (that don't
1348  * need to know whether the rest of the match failed) by a loop instead of
1349  * by recursion.
1350  */
1351 /* 0 failure, 1 success */
1352 static int regmatch(regex_t *preg, int prog)
1353 {
1354         int scan;       /* Current node. */
1355         int next;               /* Next node. */
1356         const char *save;
1357
1358         scan = prog;
1359
1360 #ifdef DEBUG
1361         if (scan != 0 && regnarrate)
1362                 fprintf(stderr, "%s(\n", regprop(scan));
1363 #endif
1364         while (scan != 0) {
1365                 int n;
1366                 int c;
1367 #ifdef DEBUG
1368                 if (regnarrate) {
1369                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1370                 }
1371 #endif
1372                 next = regnext(preg, scan);
1373                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1374
1375                 switch (OP(preg, scan)) {
1376                 case BOL:
1377                         if (preg->reginput != preg->regbol)
1378                                 return(0);
1379                         break;
1380                 case EOL:
1381                         if (!reg_iseol(preg, c)) {
1382                                 return(0);
1383                         }
1384                         break;
1385                 case WORDA:
1386                         /* Must be looking at a letter, digit, or _ */
1387                         if ((!isalnum(UCHAR(c))) && c != '_')
1388                                 return(0);
1389                         /* Prev must be BOL or nonword */
1390                         if (preg->reginput > preg->regbol &&
1391                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1392                                 return(0);
1393                         break;
1394                 case WORDZ:
1395                         /* Can't match at BOL */
1396                         if (preg->reginput > preg->regbol) {
1397                                 /* Current must be EOL or nonword */
1398                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1399                                         c = preg->reginput[-1];
1400                                         /* Previous must be word */
1401                                         if (isalnum(UCHAR(c)) || c == '_') {
1402                                                 break;
1403                                         }
1404                                 }
1405                         }
1406                         /* No */
1407                         return(0);
1408
1409                 case ANY:
1410                         if (reg_iseol(preg, c))
1411                                 return 0;
1412                         preg->reginput += n;
1413                         break;
1414                 case EXACTLY: {
1415                                 int opnd;
1416                                 int len;
1417                                 int slen;
1418
1419                                 opnd = OPERAND(scan);
1420                                 len = str_int_len(preg->program + opnd);
1421
1422                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1423                                 if (slen < 0) {
1424                                         return(0);
1425                                 }
1426                                 preg->reginput += slen;
1427                         }
1428                         break;
1429                 case ANYOF:
1430                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1431                                 return(0);
1432                         }
1433                         preg->reginput += n;
1434                         break;
1435                 case ANYBUT:
1436                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1437                                 return(0);
1438                         }
1439                         preg->reginput += n;
1440                         break;
1441                 case NOTHING:
1442                         break;
1443                 case BACK:
1444                         break;
1445                 case BRANCH:
1446                         if (OP(preg, next) != BRANCH)           /* No choice. */
1447                                 next = OPERAND(scan);   /* Avoid recursion. */
1448                         else {
1449                                 do {
1450                                         save = preg->reginput;
1451                                         if (regmatch(preg, OPERAND(scan))) {
1452                                                 return(1);
1453                                         }
1454                                         preg->reginput = save;
1455                                         scan = regnext(preg, scan);
1456                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1457                                 return(0);
1458                                 /* NOTREACHED */
1459                         }
1460                         break;
1461                 case REP:
1462                 case REPMIN:
1463                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1464
1465                 case REPX:
1466                 case REPXMIN:
1467                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1468
1469                 case END:
1470                         return 1;       /* Success! */
1471
1472                 case OPENNC:
1473                 case CLOSENC:
1474                         return regmatch(preg, next);
1475
1476                 default:
1477                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1478                                 save = preg->reginput;
1479                                 if (regmatch(preg, next)) {
1480                                         if (OP(preg, scan) < CLOSE) {
1481                                                 int no = OP(preg, scan) - OPEN;
1482                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1483                                                         preg->pmatch[no].rm_so = save - preg->start;
1484                                                 }
1485                                         }
1486                                         else {
1487                                                 int no = OP(preg, scan) - CLOSE;
1488                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1489                                                         preg->pmatch[no].rm_eo = save - preg->start;
1490                                                 }
1491                                         }
1492                                         return(1);
1493                                 }
1494                                 return(0);
1495                         }
1496                         return REG_ERR_INTERNAL;
1497                 }
1498
1499                 scan = next;
1500         }
1501
1502         /*
1503          * We get here only if there's trouble -- normally "case END" is
1504          * the terminating point.
1505          */
1506         return REG_ERR_INTERNAL;
1507 }
1508
1509 /*
1510  - regrepeat - repeatedly match something simple, report how many
1511  */
1512 static int regrepeat(regex_t *preg, int p, int max)
1513 {
1514         int count = 0;
1515         const char *scan;
1516         int opnd;
1517         int ch;
1518         int n;
1519
1520         scan = preg->reginput;
1521         opnd = OPERAND(p);
1522         switch (OP(preg, p)) {
1523         case ANY:
1524                 /* No need to handle utf8 specially here */
1525                 while (!reg_iseol(preg, *scan) && count < max) {
1526                         count++;
1527                         scan++;
1528                 }
1529                 break;
1530         case EXACTLY:
1531                 while (count < max) {
1532                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1533                         if (preg->program[opnd] != ch) {
1534                                 break;
1535                         }
1536                         count++;
1537                         scan += n;
1538                 }
1539                 break;
1540         case ANYOF:
1541                 while (count < max) {
1542                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1543                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1544                                 break;
1545                         }
1546                         count++;
1547                         scan += n;
1548                 }
1549                 break;
1550         case ANYBUT:
1551                 while (count < max) {
1552                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1553                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1554                                 break;
1555                         }
1556                         count++;
1557                         scan += n;
1558                 }
1559                 break;
1560         default:                /* Oh dear.  Called inappropriately. */
1561                 preg->err = REG_ERR_INTERNAL;
1562                 count = 0;      /* Best compromise. */
1563                 break;
1564         }
1565         preg->reginput = scan;
1566
1567         return(count);
1568 }
1569
1570 /*
1571  - regnext - dig the "next" pointer out of a node
1572  */
1573 static int regnext(regex_t *preg, int p )
1574 {
1575         int offset;
1576
1577         offset = NEXT(preg, p);
1578
1579         if (offset == 0)
1580                 return 0;
1581
1582         if (OP(preg, p) == BACK)
1583                 return(p-offset);
1584         else
1585                 return(p+offset);
1586 }
1587
1588 /*
1589  - regopsize - returns the size of opcode + operands at 'p' in words
1590  */
1591 static int regopsize(regex_t *preg, int p )
1592 {
1593         /* Almost all opcodes are 2 words, but some are more */
1594         switch (OP(preg, p)) {
1595                 case REP:
1596                 case REPMIN:
1597                 case REPX:
1598                 case REPXMIN:
1599                         return 5;
1600
1601                 case ANYOF:
1602                 case ANYBUT:
1603                 case EXACTLY: {
1604                         int s = p + 2;
1605                         while (preg->program[s++]) {
1606                         }
1607                         return s - p;
1608                 }
1609         }
1610         return 2;
1611 }
1612
1613 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1614
1615 /*
1616  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1617  */
1618 static void regdump(regex_t *preg)
1619 {
1620         int s;
1621         int op = EXACTLY;       /* Arbitrary non-END op. */
1622         int next;
1623         char buf[MAX_UTF8_LEN + 1];
1624
1625         int i;
1626         for (i = 1; i < preg->p; i++) {
1627                 printf("%02x ", (unsigned char)preg->program[i]);
1628                 if (i % 16 == 0) {
1629                         printf("\n");
1630                 }
1631         }
1632         printf("\n");
1633
1634         s = 1;
1635         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1636                 op = OP(preg, s);
1637                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1638                 next = regnext(preg, s);
1639                 if (next == 0)          /* Next ptr. */
1640                         printf("(0)");
1641                 else
1642                         printf("(%d)", next);
1643                 s += 2;
1644                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1645                         int max = preg->program[s];
1646                         int min = preg->program[s + 1];
1647                         if (max == 65535) {
1648                                 printf("{%d,*}", min);
1649                         }
1650                         else {
1651                                 printf("{%d,%d}", min, max);
1652                         }
1653                         printf(" %d", preg->program[s + 2]);
1654                         s += 3;
1655                 }
1656                 else if (op == ANYOF || op == ANYBUT) {
1657                         /* set of ranges */
1658
1659                         while (preg->program[s]) {
1660                                 int len = preg->program[s++];
1661                                 int first = preg->program[s++];
1662                                 buf[utf8_getchars(buf, first)] = 0;
1663                                 printf("%s", buf);
1664                                 if (len > 1) {
1665                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1666                                         printf("-%s", buf);
1667                                 }
1668                         }
1669                         s++;
1670                 }
1671                 else if (op == EXACTLY) {
1672                         /* Literal string, where present. */
1673
1674                         while (preg->program[s]) {
1675                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1676                                 printf("%s", buf);
1677                                 s++;
1678                         }
1679                         s++;
1680                 }
1681                 putchar('\n');
1682         }
1683
1684         if (op == END) {
1685                 /* Header fields of interest. */
1686                 if (preg->regstart) {
1687                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1688                         printf("start '%s' ", buf);
1689                 }
1690                 if (preg->reganch)
1691                         printf("anchored ");
1692                 if (preg->regmust != 0) {
1693                         int i;
1694                         printf("must have:");
1695                         for (i = 0; i < preg->regmlen; i++) {
1696                                 putchar(preg->program[preg->regmust + i]);
1697                         }
1698                         putchar('\n');
1699                 }
1700         }
1701         printf("\n");
1702 }
1703
1704 /*
1705  - regprop - printable representation of opcode
1706  */
1707 static const char *regprop( int op )
1708 {
1709         static char buf[50];
1710
1711         switch (op) {
1712         case BOL:
1713                 return "BOL";
1714         case EOL:
1715                 return "EOL";
1716         case ANY:
1717                 return "ANY";
1718         case ANYOF:
1719                 return "ANYOF";
1720         case ANYBUT:
1721                 return "ANYBUT";
1722         case BRANCH:
1723                 return "BRANCH";
1724         case EXACTLY:
1725                 return "EXACTLY";
1726         case NOTHING:
1727                 return "NOTHING";
1728         case BACK:
1729                 return "BACK";
1730         case END:
1731                 return "END";
1732         case REP:
1733                 return "REP";
1734         case REPMIN:
1735                 return "REPMIN";
1736         case REPX:
1737                 return "REPX";
1738         case REPXMIN:
1739                 return "REPXMIN";
1740         case WORDA:
1741                 return "WORDA";
1742         case WORDZ:
1743                 return "WORDZ";
1744         case OPENNC:
1745                 return "OPEN";
1746         case CLOSENC:
1747                 return "CLOSE";
1748         default:
1749                 if (op >= OPEN && op < CLOSE) {
1750                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1751                 }
1752                 else if (op >= CLOSE && op < CLOSE_END) {
1753                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1754                 }
1755                 else {
1756                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1757                 }
1758                 return(buf);
1759         }
1760 }
1761 #endif /* JIM_BOOTSTRAP */
1762
1763 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1764 {
1765         static const char *error_strings[] = {
1766                 "success",
1767                 "no match",
1768                 "bad pattern",
1769                 "null argument",
1770                 "unknown error",
1771                 "too big",
1772                 "out of memory",
1773                 "too many ()",
1774                 "parentheses () not balanced",
1775                 "braces {} not balanced",
1776                 "invalid repetition count(s)",
1777                 "extra characters",
1778                 "*+ of empty atom",
1779                 "nested count",
1780                 "internal error",
1781                 "count follows nothing",
1782                 "trailing backslash",
1783                 "corrupted program",
1784                 "contains null char",
1785         };
1786         const char *err;
1787
1788         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1789                 err = "Bad error code";
1790         }
1791         else {
1792                 err = error_strings[errcode];
1793         }
1794
1795         return snprintf(errbuf, errbuf_size, "%s", err);
1796 }
1797
1798 void regfree(regex_t *preg)
1799 {
1800         free(preg->program);
1801 }
1802
1803 #endif