jimregexp.c

   1 /*
   2  * vi:se ts=8:
   3  *
   4  * regcomp and regexec -- regsub and regerror are elsewhere
   5  *
   6  *      Copyright (c) 1986 by University of Toronto.
   7  *      Written by Henry Spencer.  Not derived from licensed software.
   8  *
   9  *      Permission is granted to anyone to use this software for any
  10  *      purpose on any computer system, and to redistribute it freely,
  11  *      subject to the following restrictions:
  12  *
  13  *      1. The author is not responsible for the consequences of use of
  14  *              this software, no matter how awful, even if they arise
  15  *              from defects in it.
  16  *
  17  *      2. The origin of this software must not be misrepresented, either
  18  *              by explicit claim or by omission.
  19  *
  20  *      3. Altered versions must be plainly marked as such, and must not
  21  *              be misrepresented as being the original software.
  22  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  23  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  24  *** to assist in implementing egrep.
  25  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  26  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  27  *** as in BSD grep and ex.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  29  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  31  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  32  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  33  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  34  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  35  *** to avoid conflicting with the system regexp.h.  Const, bless its
  36  *** soul, was removed so it can compile everywhere.  The declaration
  37  *** of strchr() was in conflict on AIX, so it was removed (as it is
  38  *** happily defined in string.h).
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  41  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  42  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  43  *
  44  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  45  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  46  *   This includes counted repetitions, UTF-8 support, character classes,
  47  *   shorthand character classes, increased number of parentheses to 100,
  48  *   backslash escape sequences. It also removes \n as an alternative to |.
  49  *
  50  * Beware that some of this code is subtly aware of the way operator
  51  * precedence is structured in regular expressions.  Serious changes in
  52  * regular-expression syntax might require a total rethink.
  53  */
  54 #include <stdio.h>
  55 #include <ctype.h>
  56 #include <stdlib.h>
  57 #include <string.h>
  58
  59 #include "jim.h"
  60 #include "jimautoconf.h"
  61 #include "jimregexp.h"
  62 #include "utf8.h"
  63
  64 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
  65
  66 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
  67 #define REG_MAX_PAREN 100
  68
  69 /*
  70  * Structure for regexp "program".  This is essentially a linear encoding
  71  * of a nondeterministic finite-state machine (aka syntax charts or
  72  * "railroad normal form" in parsing technology).  Each node is an opcode
  73  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  74  * all nodes except BRANCH implement concatenation; a "next" pointer with
  75  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  76  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  77  * opposed to a collection of them) is never concatenated with anything
  78  * because of operator precedence.)  The operand of some types of node is
  79  * a literal string; for others, it is a node leading into a sub-FSM.  In
  80  * particular, the operand of a BRANCH node is the first node of the branch.
  81  * (NB this is *not* a tree structure:  the tail of the branch connects
  82  * to the thing following the set of BRANCHes.)  The opcodes are:
  83  */
  84
  85 /* definition           number  opnd?   meaning */
  86 #define END     0       /* no   End of program. */
  87 #define BOL     1       /* no   Match "" at beginning of line. */
  88 #define EOL     2       /* no   Match "" at end of line. */
  89 #define ANY     3       /* no   Match any one character. */
  90 #define ANYOF   4       /* str  Match any character in this string. */
  91 #define ANYBUT  5       /* str  Match any character not in this string. */
  92 #define BRANCH  6       /* node Match this alternative, or the next... */
  93 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  94 #define EXACTLY 8       /* str  Match this string. */
  95 #define NOTHING 9       /* no   Match empty string. */
  96 #define REP     10      /* max,min      Match this (simple) thing [min,max] times. */
  97 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, minimal match. */
  98 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
  99 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
 100
 101 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 102 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 103
 104 #define OPENNC  1000    /* no   Non-capturing parentheses - must be OPEN-1 */
 105 #define OPEN    1001    /* no   Mark this point in input as start of #n. */
 106                         /*      OPEN+1 is number 1, etc. */
 107
 108 /* must not be any other opts between OPEN and CLOSE */
 109
 110 #define CLOSENC 2000    /* no   Non-capturing parentheses - must be CLOSE-1 */
 111 #define CLOSE   2001    /* no   Analogous to OPEN. */
 112 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 113
 114 /*
 115  * The first word of the regexp internal "program" is actually this magic
 116  * number; the start node begins in the second word.
 117  */
 118 #define REG_MAGIC       0xFADED00D
 119
 120 /*
 121  * Opcode notes:
 122  *
 123  * BRANCH       The set of branches constituting a single choice are hooked
 124  *              together with their "next" pointers, since precedence prevents
 125  *              anything being concatenated to any individual branch.  The
 126  *              "next" pointer of the last BRANCH in a choice points to the
 127  *              thing following the whole choice.  This is also where the
 128  *              final "next" pointer of each individual branch points; each
 129  *              branch starts with the operand node of a BRANCH node.
 130  *
 131  * BACK         Normal "next" pointers all implicitly point forward; BACK
 132  *              exists to make loop structures possible.
 133  *
 134  * REP,REPX     Repeated matches ('?', '*', '+' and {min,max}) are implemented
 135  *              as either simple repeats (REP) or complex repeats (REPX).
 136  *              These opcodes include a "min" and "max" count after the opcode.
 137  *              This is followed by a fourth "current count" word that is
 138  *              only used by REPX, as it implements a recursive match.
 139  *              REPMIN and REPXMIN are identical except they implement minimal repeats.
 140  *
 141  * OPEN,CLOSE   ...are numbered at compile time.
 142  */
 143
 144 /*
 145  * A node is one word of opcode followed by one word of "next" pointer.
 146  * The "next" pointer value is a positive offset from the opcode of the node
 147  * containing it.
 148  * An operand, if any, simply follows the node.  (Note that much of the
 149  * code generation knows about this implicit relationship.)
 150  */
 151 #define OP(preg, p)     (preg->program[p])
 152 #define NEXT(preg, p)   (preg->program[p + 1])
 153 #define OPERAND(p)      ((p) + 2)
 154
 155 /*
 156  * See regmagic.h for one further detail of program structure.
 157  */
 158
 159
 160 /*
 161  * Utility definitions.
 162  */
 163
 164 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 165 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 166 #define META            "^$.[()|?{+*"
 167
 168 /*
 169  * Flags to be passed up and down.
 170  */
 171 #define HASWIDTH        1       /* Known never to match null string. */
 172 #define SIMPLE          2       /* Simple enough to be STAR/PLUS operand. */
 173 #define SPSTART         4       /* Starts with * or +. */
 174 #define WORST           0       /* Worst case. */
 175
 176 #define MAX_REP_COUNT 1000000
 177
 178 /*
 179  * Forward declarations for regcomp()'s friends.
 180  */
 181 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 182 static int regpiece(regex_t *preg, int *flagp );
 183 static int regbranch(regex_t *preg, int *flagp );
 184 static int regatom(regex_t *preg, int *flagp );
 185 static int regnode(regex_t *preg, int op );
 186 static int regnext(regex_t *preg, int p );
 187 static void regc(regex_t *preg, int b );
 188 static int reginsert(regex_t *preg, int op, int size, int opnd );
 189 static void regtail(regex_t *preg, int p, int val);
 190 static void regoptail(regex_t *preg, int p, int val );
 191 static int regopsize(regex_t *preg, int p );
 192
 193 static int reg_range_find(const int *string, int c);
 194 static const char *str_find(const char *string, int c, int nocase);
 195 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 196
 197 /*#define DEBUG*/
 198 #ifdef DEBUG
 199 static int regnarrate = 0;
 200 static void regdump(regex_t *preg);
 201 static const char *regprop( int op );
 202 #endif
 203
 204
 205 /**
 206  * Returns the length of the null-terminated integer sequence.
 207  */
 208 static int str_int_len(const int *seq)
 209 {
 210         int n = 0;
 211         while (*seq++) {
 212                 n++;
 213         }
 214         return n;
 215 }
 216
 217 /*
 218  - regcomp - compile a regular expression into internal code
 219  *
 220  * We can't allocate space until we know how big the compiled form will be,
 221  * but we can't compile it (and thus know how big it is) until we've got a
 222  * place to put the code.  So we cheat:  we compile it twice, once with code
 223  * generation turned off and size counting turned on, and once "for real".
 224  * This also means that we don't allocate space until we are sure that the
 225  * thing really will compile successfully, and we never have to move the
 226  * code and thus invalidate pointers into it.  (Note that it has to be in
 227  * one piece because free() must be able to free it all.)
 228  *
 229  * Beware that the optimization-preparation code in here knows about some
 230  * of the structure of the compiled regexp.
 231  */
 232 int regcomp(regex_t *preg, const char *exp, int cflags)
 233 {
 234         int scan;
 235         int longest;
 236         unsigned len;
 237         int flags;
 238
 239 #ifdef DEBUG
 240         fprintf(stderr, "Compiling: '%s'\n", exp);
 241 #endif
 242         memset(preg, 0, sizeof(*preg));
 243
 244         if (exp == NULL)
 245                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 246
 247         /* First pass: determine size, legality. */
 248         preg->cflags = cflags;
 249         preg->regparse = exp;
 250
 251         /* Allocate space. */
 252         preg->proglen = (strlen(exp) + 1) * 5;
 253         preg->program = malloc(preg->proglen * sizeof(int));
 254         if (preg->program == NULL)
 255                 FAIL(preg, REG_ERR_NOMEM);
 256
 257         /* Note that since we store a magic value as the first item in the program,
 258          * program offsets will never be 0
 259          */
 260         regc(preg, REG_MAGIC);
 261         if (reg(preg, 0, &flags) == 0) {
 262                 return preg->err;
 263         }
 264
 265         /* Small enough for pointer-storage convention? */
 266         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 267                 FAIL(preg,REG_ERR_TOO_BIG);
 268
 269         /* Dig out information for optimizations. */
 270         preg->regstart = 0;     /* Worst-case defaults. */
 271         preg->reganch = 0;
 272         preg->regmust = 0;
 273         preg->regmlen = 0;
 274         scan = 1;                       /* First BRANCH. */
 275         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 276                 scan = OPERAND(scan);
 277
 278                 /* Starting-point info. */
 279                 if (OP(preg, scan) == EXACTLY) {
 280                         preg->regstart = preg->program[OPERAND(scan)];
 281                 }
 282                 else if (OP(preg, scan) == BOL)
 283                         preg->reganch++;
 284
 285                 /*
 286                  * If there's something expensive in the r.e., find the
 287                  * longest literal string that must appear and make it the
 288                  * regmust.  Resolve ties in favor of later strings, since
 289                  * the regstart check works with the beginning of the r.e.
 290                  * and avoiding duplication strengthens checking.  Not a
 291                  * strong reason, but sufficient in the absence of others.
 292                  */
 293                 if (flags&SPSTART) {
 294                         longest = 0;
 295                         len = 0;
 296                         for (; scan != 0; scan = regnext(preg, scan)) {
 297                                 if (OP(preg, scan) == EXACTLY) {
 298                                         int plen = str_int_len(preg->program + OPERAND(scan));
 299                                         if (plen >= len) {
 300                                                 longest = OPERAND(scan);
 301                                                 len = plen;
 302                                         }
 303                                 }
 304                         }
 305                         preg->regmust = longest;
 306                         preg->regmlen = len;
 307                 }
 308         }
 309
 310 #ifdef DEBUG
 311         regdump(preg);
 312 #endif
 313
 314         return 0;
 315 }
 316
 317 /*
 318  - reg - regular expression, i.e. main body or parenthesized thing
 319  *
 320  * Caller must absorb opening parenthesis.
 321  *
 322  * Combining parenthesis handling with the base level of regular expression
 323  * is a trifle forced, but the need to tie the tails of the branches to what
 324  * follows makes it hard to avoid.
 325  */
 326 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 327 {
 328         int ret;
 329         int br;
 330         int ender;
 331         int parno = 0;
 332         int flags;
 333
 334         *flagp = HASWIDTH;      /* Tentatively. */
 335
 336         /* Make an OPEN node, if parenthesized. */
 337         if (paren) {
 338                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 339                         /* non-capturing paren */
 340                         preg->regparse += 2;
 341                         parno = -1;
 342                 }
 343                 else {
 344                         parno = ++preg->re_nsub;
 345                 }
 346                 ret = regnode(preg, OPEN+parno);
 347         } else
 348                 ret = 0;
 349
 350         /* Pick up the branches, linking them together. */
 351         br = regbranch(preg, &flags);
 352         if (br == 0)
 353                 return 0;
 354         if (ret != 0)
 355                 regtail(preg, ret, br); /* OPEN -> first. */
 356         else
 357                 ret = br;
 358         if (!(flags&HASWIDTH))
 359                 *flagp &= ~HASWIDTH;
 360         *flagp |= flags&SPSTART;
 361         while (*preg->regparse == '|') {
 362                 preg->regparse++;
 363                 br = regbranch(preg, &flags);
 364                 if (br == 0)
 365                         return 0;
 366                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 367                 if (!(flags&HASWIDTH))
 368                         *flagp &= ~HASWIDTH;
 369                 *flagp |= flags&SPSTART;
 370         }
 371
 372         /* Make a closing node, and hook it on the end. */
 373         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 374         regtail(preg, ret, ender);
 375
 376         /* Hook the tails of the branches to the closing node. */
 377         for (br = ret; br != 0; br = regnext(preg, br))
 378                 regoptail(preg, br, ender);
 379
 380         /* Check for proper termination. */
 381         if (paren && *preg->regparse++ != ')') {
 382                 preg->err = REG_ERR_UNMATCHED_PAREN;
 383                 return 0;
 384         } else if (!paren && *preg->regparse != '\0') {
 385                 if (*preg->regparse == ')') {
 386                         preg->err = REG_ERR_UNMATCHED_PAREN;
 387                         return 0;
 388                 } else {
 389                         preg->err = REG_ERR_JUNK_ON_END;
 390                         return 0;
 391                 }
 392         }
 393
 394         return(ret);
 395 }
 396
 397 /*
 398  - regbranch - one alternative of an | operator
 399  *
 400  * Implements the concatenation operator.
 401  */
 402 static int regbranch(regex_t *preg, int *flagp )
 403 {
 404         int ret;
 405         int chain;
 406         int latest;
 407         int flags;
 408
 409         *flagp = WORST;         /* Tentatively. */
 410
 411         ret = regnode(preg, BRANCH);
 412         chain = 0;
 413         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 414                *preg->regparse != '|') {
 415                 latest = regpiece(preg, &flags);
 416                 if (latest == 0)
 417                         return 0;
 418                 *flagp |= flags&HASWIDTH;
 419                 if (chain == 0) {/* First piece. */
 420                         *flagp |= flags&SPSTART;
 421                 }
 422                 else {
 423                         regtail(preg, chain, latest);
 424                 }
 425                 chain = latest;
 426         }
 427         if (chain == 0) /* Loop ran zero times. */
 428                 (void) regnode(preg, NOTHING);
 429
 430         return(ret);
 431 }
 432
 433 /*
 434  - regpiece - something followed by possible [*+?]
 435  *
 436  * Note that the branching code sequences used for ? and the general cases
 437  * of * and + are somewhat optimized:  they use the same NOTHING node as
 438  * both the endmarker for their branch list and the body of the last branch.
 439  * It might seem that this node could be dispensed with entirely, but the
 440  * endmarker role is not redundant.
 441  */
 442 static int regpiece(regex_t *preg, int *flagp)
 443 {
 444         int ret;
 445         char op;
 446         int next;
 447         int flags;
 448         int chain = 0;
 449         int min;
 450         int max;
 451
 452         ret = regatom(preg, &flags);
 453         if (ret == 0)
 454                 return 0;
 455
 456         op = *preg->regparse;
 457         if (!ISMULT(op)) {
 458                 *flagp = flags;
 459                 return(ret);
 460         }
 461
 462         if (!(flags&HASWIDTH) && op != '?') {
 463                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 464                 return 0;
 465         }
 466
 467         /* Handle braces (counted repetition) by expansion */
 468         if (op == '{') {
 469                 char *end;
 470
 471                 min = strtoul(preg->regparse + 1, &end, 10);
 472                 if (end == preg->regparse + 1) {
 473                         preg->err = REG_ERR_BAD_COUNT;
 474                         return 0;
 475                 }
 476                 if (*end == '}') {
 477                         max = min;
 478                 }
 479                 else {
 480                         preg->regparse = end;
 481                         max = strtoul(preg->regparse + 1, &end, 10);
 482                         if (*end != '}') {
 483                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 484                                 return 0;
 485                         }
 486                 }
 487                 if (end == preg->regparse + 1) {
 488                         max = MAX_REP_COUNT;
 489                 }
 490                 else if (max < min || max >= 100) {
 491                         preg->err = REG_ERR_BAD_COUNT;
 492                         return 0;
 493                 }
 494                 if (min >= 100) {
 495                         preg->err = REG_ERR_BAD_COUNT;
 496                         return 0;
 497                 }
 498
 499                 preg->regparse = strchr(preg->regparse, '}');
 500         }
 501         else {
 502                 min = (op == '+');
 503                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 504         }
 505
 506         if (preg->regparse[1] == '?') {
 507                 preg->regparse++;
 508                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 509         }
 510         else {
 511                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 512         }
 513         preg->program[ret + 2] = max;
 514         preg->program[ret + 3] = min;
 515         preg->program[ret + 4] = 0;
 516
 517         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 518
 519         if (!(flags & SIMPLE)) {
 520                 int back = regnode(preg, BACK);
 521                 regtail(preg, back, ret);
 522                 regtail(preg, next, back);
 523         }
 524
 525         preg->regparse++;
 526         if (ISMULT(*preg->regparse)) {
 527                 preg->err = REG_ERR_NESTED_COUNT;
 528                 return 0;
 529         }
 530
 531         return chain ? chain : ret;
 532 }
 533
 534 /**
 535  * Add all characters in the inclusive range between lower and upper.
 536  *
 537  * Handles a swapped range (upper < lower).
 538  */
 539 static void reg_addrange(regex_t *preg, int lower, int upper)
 540 {
 541         if (lower > upper) {
 542                 reg_addrange(preg, upper, lower);
 543         }
 544         /* Add a range as length, start */
 545         regc(preg, upper - lower + 1);
 546         regc(preg, lower);
 547 }
 548
 549 /**
 550  * Add a null-terminated literal string as a set of ranges.
 551  */
 552 static void reg_addrange_str(regex_t *preg, const char *str)
 553 {
 554         while (*str) {
 555                 reg_addrange(preg, *str, *str);
 556                 str++;
 557         }
 558 }
 559
 560 /**
 561  * Extracts the next unicode char from utf8.
 562  *
 563  * If 'upper' is set, converts the char to uppercase.
 564  */
 565 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 566 {
 567         int l = utf8_tounicode(s, uc);
 568         if (upper) {
 569                 *uc = utf8_upper(*uc);
 570         }
 571         return l;
 572 }
 573
 574 /**
 575  * Converts a hex digit to decimal.
 576  *
 577  * Returns -1 for an invalid hex digit.
 578  */
 579 static int hexdigitval(int c)
 580 {
 581         if (c >= '0' && c <= '9')
 582                 return c - '0';
 583         if (c >= 'a' && c <= 'f')
 584                 return c - 'a' + 10;
 585         if (c >= 'A' && c <= 'F')
 586                 return c - 'A' + 10;
 587         return -1;
 588 }
 589
 590 /**
 591  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 592  *
 593  * Returns the number of hex digits parsed.
 594  * If there are no hex digits, returns 0 and stores nothing.
 595  */
 596 static int parse_hex(const char *s, int n, int *uc)
 597 {
 598         int val = 0;
 599         int k;
 600
 601         for (k = 0; k < n; k++) {
 602                 int c = hexdigitval(*s++);
 603                 if (c == -1) {
 604                         break;
 605                 }
 606                 val = (val << 4) | c;
 607         }
 608         if (k) {
 609                 *uc = val;
 610         }
 611         return k;
 612 }
 613
 614 /**
 615  * Call for chars after a backlash to decode the escape sequence.
 616  *
 617  * Stores the result in *ch.
 618  *
 619  * Returns the number of bytes consumed.
 620  */
 621 static int reg_decode_escape(const char *s, int *ch)
 622 {
 623         int n;
 624         const char *s0 = s;
 625
 626         *ch = *s++;
 627
 628         switch (*ch) {
 629                 case 'b': *ch = '\b'; break;
 630                 case 'e': *ch = 27; break;
 631                 case 'f': *ch = '\f'; break;
 632                 case 'n': *ch = '\n'; break;
 633                 case 'r': *ch = '\r'; break;
 634                 case 't': *ch = '\t'; break;
 635                 case 'v': *ch = '\v'; break;
 636                 case 'u':
 637                         if (*s == '{') {
 638                                 /* Expect \u{NNNN} */
 639                                 n = parse_hex(s + 1, 6, ch);
 640                                 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
 641                                         s += n + 2;
 642                                 }
 643                                 else {
 644                                         /* Invalid, so just treat as an escaped 'u' */
 645                                         *ch = 'u';
 646                                 }
 647                         }
 648                         else if ((n = parse_hex(s, 4, ch)) > 0) {
 649                                 s += n;
 650                         }
 651                         break;
 652                 case 'U':
 653                         if ((n = parse_hex(s, 8, ch)) > 0) {
 654                                 s += n;
 655                         }
 656                 case 'x':
 657                         if ((n = parse_hex(s, 2, ch)) > 0) {
 658                                 s += n;
 659                         }
 660                         break;
 661                 case '\0':
 662                         s--;
 663                         *ch = '\\';
 664                         break;
 665         }
 666         return s - s0;
 667 }
 668
 669 /*
 670  - regatom - the lowest level
 671  *
 672  * Optimization:  gobbles an entire sequence of ordinary characters so that
 673  * it can turn them into a single node, which is smaller to store and
 674  * faster to run.  Backslashed characters are exceptions, each becoming a
 675  * separate node; the code is simpler that way and it's not worth fixing.
 676  */
 677 static int regatom(regex_t *preg, int *flagp)
 678 {
 679         int ret;
 680         int flags;
 681         int nocase = (preg->cflags & REG_ICASE);
 682
 683         int ch;
 684         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 685
 686         *flagp = WORST;         /* Tentatively. */
 687
 688         preg->regparse += n;
 689         switch (ch) {
 690         /* FIXME: these chars only have meaning at beg/end of pat? */
 691         case '^':
 692                 ret = regnode(preg, BOL);
 693                 break;
 694         case '$':
 695                 ret = regnode(preg, EOL);
 696                 break;
 697         case '.':
 698                 ret = regnode(preg, ANY);
 699                 *flagp |= HASWIDTH|SIMPLE;
 700                 break;
 701         case '[': {
 702                         const char *pattern = preg->regparse;
 703
 704                         if (*pattern == '^') {  /* Complement of range. */
 705                                 ret = regnode(preg, ANYBUT);
 706                                 pattern++;
 707                         } else
 708                                 ret = regnode(preg, ANYOF);
 709
 710                         /* Special case. If the first char is ']' or '-', it is part of the set */
 711                         if (*pattern == ']' || *pattern == '-') {
 712                                 reg_addrange(preg, *pattern, *pattern);
 713                                 pattern++;
 714                         }
 715
 716                         while (*pattern && *pattern != ']') {
 717                                 /* Is this a range? a-z */
 718                                 int start;
 719                                 int end;
 720
 721                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 722                                 if (start == '\\') {
 723                                         pattern += reg_decode_escape(pattern, &start);
 724                                         if (start == 0) {
 725                                                 preg->err = REG_ERR_NULL_CHAR;
 726                                                 return 0;
 727                                         }
 728                                 }
 729                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 730                                         /* skip '-' */
 731                                         pattern += utf8_tounicode(pattern, &end);
 732                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 733                                         if (end == '\\') {
 734                                                 pattern += reg_decode_escape(pattern, &end);
 735                                                 if (end == 0) {
 736                                                         preg->err = REG_ERR_NULL_CHAR;
 737                                                         return 0;
 738                                                 }
 739                                         }
 740
 741                                         reg_addrange(preg, start, end);
 742                                         continue;
 743                                 }
 744                                 if (start == '[') {
 745                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 746                                                 if ((preg->cflags & REG_ICASE) == 0) {
 747                                                         reg_addrange(preg, 'a', 'z');
 748                                                 }
 749                                                 reg_addrange(preg, 'A', 'Z');
 750                                                 pattern += 8;
 751                                                 continue;
 752                                         }
 753                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 754                                                 if ((preg->cflags & REG_ICASE) == 0) {
 755                                                         reg_addrange(preg, 'a', 'z');
 756                                                 }
 757                                                 reg_addrange(preg, 'A', 'Z');
 758                                                 reg_addrange(preg, '0', '9');
 759                                                 pattern += 8;
 760                                                 continue;
 761                                         }
 762                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 763                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 764                                                 pattern += 8;
 765                                                 continue;
 766                                         }
 767                                 }
 768                                 /* Not a range, so just add the char */
 769                                 reg_addrange(preg, start, start);
 770                         }
 771                         regc(preg, '\0');
 772
 773                         if (*pattern) {
 774                                 pattern++;
 775                         }
 776                         preg->regparse = pattern;
 777
 778                         *flagp |= HASWIDTH|SIMPLE;
 779                 }
 780                 break;
 781         case '(':
 782                 ret = reg(preg, 1, &flags);
 783                 if (ret == 0)
 784                         return 0;
 785                 *flagp |= flags&(HASWIDTH|SPSTART);
 786                 break;
 787         case '\0':
 788         case '|':
 789         case ')':
 790                 preg->err = REG_ERR_INTERNAL;
 791                 return 0;       /* Supposed to be caught earlier. */
 792         case '?':
 793         case '+':
 794         case '*':
 795         case '{':
 796                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 797                 return 0;
 798         case '\\':
 799                 switch (*preg->regparse++) {
 800                 case '\0':
 801                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 802                         return 0;
 803                 case '<':
 804                 case 'm':
 805                         ret = regnode(preg, WORDA);
 806                         break;
 807                 case '>':
 808                 case 'M':
 809                         ret = regnode(preg, WORDZ);
 810                         break;
 811                 case 'd':
 812                         ret = regnode(preg, ANYOF);
 813                         reg_addrange(preg, '0', '9');
 814                         regc(preg, '\0');
 815                         *flagp |= HASWIDTH|SIMPLE;
 816                         break;
 817                 case 'w':
 818                         ret = regnode(preg, ANYOF);
 819                         if ((preg->cflags & REG_ICASE) == 0) {
 820                                 reg_addrange(preg, 'a', 'z');
 821                         }
 822                         reg_addrange(preg, 'A', 'Z');
 823                         reg_addrange(preg, '0', '9');
 824                         reg_addrange(preg, '_', '_');
 825                         regc(preg, '\0');
 826                         *flagp |= HASWIDTH|SIMPLE;
 827                         break;
 828                 case 's':
 829                         ret = regnode(preg, ANYOF);
 830                         reg_addrange_str(preg," \t\r\n\f\v");
 831                         regc(preg, '\0');
 832                         *flagp |= HASWIDTH|SIMPLE;
 833                         break;
 834                 /* FIXME: Someday handle \1, \2, ... */
 835                 default:
 836                         /* Handle general quoted chars in exact-match routine */
 837                         /* Back up to include the backslash */
 838                         preg->regparse--;
 839                         goto de_fault;
 840                 }
 841                 break;
 842         de_fault:
 843         default: {
 844                         /*
 845                          * Encode a string of characters to be matched exactly.
 846                          */
 847                         int added = 0;
 848
 849                         /* Back up to pick up the first char of interest */
 850                         preg->regparse -= n;
 851
 852                         ret = regnode(preg, EXACTLY);
 853
 854                         /* Note that a META operator such as ? or * consumes the
 855                          * preceding char.
 856                          * Thus we must be careful to look ahead by 2 and add the
 857                          * last char as it's own EXACTLY if necessary
 858                          */
 859
 860                         /* Until end of string or a META char is reached */
 861                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 862                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 863                                 if (ch == '\\' && preg->regparse[n]) {
 864                                         /* Non-trailing backslash.
 865                                          * Is this a special escape, or a regular escape?
 866                                          */
 867                                         if (strchr("<>mMwds", preg->regparse[n])) {
 868                                                 /* A special escape. All done with EXACTLY */
 869                                                 break;
 870                                         }
 871                                         /* Decode it. Note that we add the length for the escape
 872                                          * sequence to the length for the backlash so we can skip
 873                                          * the entire sequence, or not as required.
 874                                          */
 875                                         n += reg_decode_escape(preg->regparse + n, &ch);
 876                                         if (ch == 0) {
 877                                                 preg->err = REG_ERR_NULL_CHAR;
 878                                                 return 0;
 879                                         }
 880                                 }
 881
 882                                 /* Now we have one char 'ch' of length 'n'.
 883                                  * Check to see if the following char is a MULT
 884                                  */
 885
 886                                 if (ISMULT(preg->regparse[n])) {
 887                                         /* Yes. But do we already have some EXACTLY chars? */
 888                                         if (added) {
 889                                                 /* Yes, so return what we have and pick up the current char next time around */
 890                                                 break;
 891                                         }
 892                                         /* No, so add this single char and finish */
 893                                         regc(preg, ch);
 894                                         added++;
 895                                         preg->regparse += n;
 896                                         break;
 897                                 }
 898
 899                                 /* No, so just add this char normally */
 900                                 regc(preg, ch);
 901                                 added++;
 902                                 preg->regparse += n;
 903                         }
 904                         regc(preg, '\0');
 905
 906                         *flagp |= HASWIDTH;
 907                         if (added == 1)
 908                                 *flagp |= SIMPLE;
 909                         break;
 910                 }
 911                 break;
 912         }
 913
 914         return(ret);
 915 }
 916
 917 static void reg_grow(regex_t *preg, int n)
 918 {
 919         if (preg->p + n >= preg->proglen) {
 920                 preg->proglen = (preg->p + n) * 2;
 921                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 922         }
 923 }
 924
 925 /*
 926  - regnode - emit a node
 927  */
 928 /* Location. */
 929 static int regnode(regex_t *preg, int op)
 930 {
 931         reg_grow(preg, 2);
 932
 933         /* The OP followed by a next pointer */
 934         preg->program[preg->p++] = op;
 935         preg->program[preg->p++] = 0;
 936
 937         /* Return the start of the node */
 938         return preg->p - 2;
 939 }
 940
 941 /*
 942  - regc - emit (if appropriate) a byte of code
 943  */
 944 static void regc(regex_t *preg, int b )
 945 {
 946         reg_grow(preg, 1);
 947         preg->program[preg->p++] = b;
 948 }
 949
 950 /*
 951  - reginsert - insert an operator in front of already-emitted operand
 952  *
 953  * Means relocating the operand.
 954  * Returns the new location of the original operand.
 955  */
 956 static int reginsert(regex_t *preg, int op, int size, int opnd )
 957 {
 958         reg_grow(preg, size);
 959
 960         /* Move everything from opnd up */
 961         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
 962         /* Zero out the new space */
 963         memset(preg->program + opnd, 0, sizeof(int) * size);
 964
 965         preg->program[opnd] = op;
 966
 967         preg->p += size;
 968
 969         return opnd + size;
 970 }
 971
 972 /*
 973  - regtail - set the next-pointer at the end of a node chain
 974  */
 975 static void regtail(regex_t *preg, int p, int val)
 976 {
 977         int scan;
 978         int temp;
 979         int offset;
 980
 981         /* Find last node. */
 982         scan = p;
 983         for (;;) {
 984                 temp = regnext(preg, scan);
 985                 if (temp == 0)
 986                         break;
 987                 scan = temp;
 988         }
 989
 990         if (OP(preg, scan) == BACK)
 991                 offset = scan - val;
 992         else
 993                 offset = val - scan;
 994
 995         preg->program[scan + 1] = offset;
 996 }
 997
 998 /*
 999  - regoptail - regtail on operand of first argument; nop if operandless
1000  */
1001
1002 static void regoptail(regex_t *preg, int p, int val )
1003 {
1004         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1005         if (p != 0 && OP(preg, p) == BRANCH) {
1006                 regtail(preg, OPERAND(p), val);
1007         }
1008 }
1009
1010 /*
1011  * regexec and friends
1012  */
1013
1014 /*
1015  * Forwards.
1016  */
1017 static int regtry(regex_t *preg, const char *string );
1018 static int regmatch(regex_t *preg, int prog);
1019 static int regrepeat(regex_t *preg, int p, int max);
1020
1021 /*
1022  - regexec - match a regexp against a string
1023  */
1024 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1025 {
1026         const char *s;
1027         int scan;
1028
1029         /* Be paranoid... */
1030         if (preg == NULL || preg->program == NULL || string == NULL) {
1031                 return REG_ERR_NULL_ARGUMENT;
1032         }
1033
1034         /* Check validity of program. */
1035         if (*preg->program != REG_MAGIC) {
1036                 return REG_ERR_CORRUPTED;
1037         }
1038
1039 #ifdef DEBUG
1040         fprintf(stderr, "regexec: %s\n", string);
1041         regdump(preg);
1042 #endif
1043
1044         preg->eflags = eflags;
1045         preg->pmatch = pmatch;
1046         preg->nmatch = nmatch;
1047         preg->start = string;   /* All offsets are computed from here */
1048
1049         /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1050         for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1051                 int op = OP(preg, scan);
1052                 if (op == END)
1053                         break;
1054                 if (op == REPX || op == REPXMIN)
1055                         preg->program[scan + 4] = 0;
1056         }
1057
1058         /* If there is a "must appear" string, look for it. */
1059         if (preg->regmust != 0) {
1060                 s = string;
1061                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1062                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1063                                 break;
1064                         }
1065                         s++;
1066                 }
1067                 if (s == NULL)  /* Not present. */
1068                         return REG_NOMATCH;
1069         }
1070
1071         /* Mark beginning of line for ^ . */
1072         preg->regbol = string;
1073
1074         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1075         if (preg->reganch) {
1076                 if (eflags & REG_NOTBOL) {
1077                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1078                         goto nextline;
1079                 }
1080                 while (1) {
1081                         if (regtry(preg, string)) {
1082                                 return REG_NOERROR;
1083                         }
1084                         if (*string) {
1085 nextline:
1086                                 if (preg->cflags & REG_NEWLINE) {
1087                                         /* Try the next anchor? */
1088                                         string = strchr(string, '\n');
1089                                         if (string) {
1090                                                 preg->regbol = ++string;
1091                                                 continue;
1092                                         }
1093                                 }
1094                         }
1095                         return REG_NOMATCH;
1096                 }
1097         }
1098
1099         /* Messy cases:  unanchored match. */
1100         s = string;
1101         if (preg->regstart != '\0') {
1102                 /* We know what char it must start with. */
1103                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1104                         if (regtry(preg, s))
1105                                 return REG_NOERROR;
1106                         s++;
1107                 }
1108         }
1109         else
1110                 /* We don't -- general case. */
1111                 while (1) {
1112                         if (regtry(preg, s))
1113                                 return REG_NOERROR;
1114                         if (*s == '\0') {
1115                                 break;
1116                         }
1117                         else {
1118                                 int c;
1119                                 s += utf8_tounicode(s, &c);
1120                         }
1121                 }
1122
1123         /* Failure. */
1124         return REG_NOMATCH;
1125 }
1126
1127 /*
1128  - regtry - try match at specific point
1129  */
1130                         /* 0 failure, 1 success */
1131 static int regtry( regex_t *preg, const char *string )
1132 {
1133         int i;
1134
1135         preg->reginput = string;
1136
1137         for (i = 0; i < preg->nmatch; i++) {
1138                 preg->pmatch[i].rm_so = -1;
1139                 preg->pmatch[i].rm_eo = -1;
1140         }
1141         if (regmatch(preg, 1)) {
1142                 preg->pmatch[0].rm_so = string - preg->start;
1143                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1144                 return(1);
1145         } else
1146                 return(0);
1147 }
1148
1149 /**
1150  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1151  *
1152  * If 'nocase' is non-zero, does a case-insensitive match.
1153  *
1154  * Returns -1 on not found.
1155  */
1156 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1157 {
1158         const char *s = string;
1159         while (proglen && *s) {
1160                 int ch;
1161                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1162                 if (ch != *prog) {
1163                         return -1;
1164                 }
1165                 prog++;
1166                 s += n;
1167                 proglen--;
1168         }
1169         if (proglen == 0) {
1170                 return s - string;
1171         }
1172         return -1;
1173 }
1174
1175 /**
1176  * Searchs for 'c' in the range 'range'.
1177  *
1178  * Returns 1 if found, or 0 if not.
1179  */
1180 static int reg_range_find(const int *range, int c)
1181 {
1182         while (*range) {
1183                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1184                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1185                         return 1;
1186                 }
1187                 range += 2;
1188         }
1189         return 0;
1190 }
1191
1192 /**
1193  * Search for the character 'c' in the utf-8 string 'string'.
1194  *
1195  * If 'nocase' is set, the 'string' is assumed to be uppercase
1196  * and 'c' is converted to uppercase before matching.
1197  *
1198  * Returns the byte position in the string where the 'c' was found, or
1199  * NULL if not found.
1200  */
1201 static const char *str_find(const char *string, int c, int nocase)
1202 {
1203         if (nocase) {
1204                 /* The "string" should already be converted to uppercase */
1205                 c = utf8_upper(c);
1206         }
1207         while (*string) {
1208                 int ch;
1209                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1210                 if (c == ch) {
1211                         return string;
1212                 }
1213                 string += n;
1214         }
1215         return NULL;
1216 }
1217
1218 /**
1219  * Returns true if 'ch' is an end-of-line char.
1220  *
1221  * In REG_NEWLINE mode, \n is considered EOL in
1222  * addition to \0
1223  */
1224 static int reg_iseol(regex_t *preg, int ch)
1225 {
1226         if (preg->cflags & REG_NEWLINE) {
1227                 return ch == '\0' || ch == '\n';
1228         }
1229         else {
1230                 return ch == '\0';
1231         }
1232 }
1233
1234 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1235 {
1236         int nextch = '\0';
1237         const char *save;
1238         int no;
1239         int c;
1240
1241         int max = preg->program[scan + 2];
1242         int min = preg->program[scan + 3];
1243         int next = regnext(preg, scan);
1244
1245         /*
1246          * Lookahead to avoid useless match attempts
1247          * when we know what character comes next.
1248          */
1249         if (OP(preg, next) == EXACTLY) {
1250                 nextch = preg->program[OPERAND(next)];
1251         }
1252         save = preg->reginput;
1253         no = regrepeat(preg, scan + 5, max);
1254         if (no < min) {
1255                 return 0;
1256         }
1257         if (matchmin) {
1258                 /* from min up to no */
1259                 max = no;
1260                 no = min;
1261         }
1262         /* else from no down to min */
1263         while (1) {
1264                 if (matchmin) {
1265                         if (no > max) {
1266                                 break;
1267                         }
1268                 }
1269                 else {
1270                         if (no < min) {
1271                                 break;
1272                         }
1273                 }
1274                 preg->reginput = save + utf8_index(save, no);
1275                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1276                 /* If it could work, try it. */
1277                 if (reg_iseol(preg, nextch) || c == nextch) {
1278                         if (regmatch(preg, next)) {
1279                                 return(1);
1280                         }
1281                 }
1282                 if (matchmin) {
1283                         /* Couldn't or didn't, add one more */
1284                         no++;
1285                 }
1286                 else {
1287                         /* Couldn't or didn't -- back up. */
1288                         no--;
1289                 }
1290         }
1291         return(0);
1292 }
1293
1294 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1295 {
1296         int *scanpt = preg->program + scan;
1297
1298         int max = scanpt[2];
1299         int min = scanpt[3];
1300
1301         /* Have we reached min? */
1302         if (scanpt[4] < min) {
1303                 /* No, so get another one */
1304                 scanpt[4]++;
1305                 if (regmatch(preg, scan + 5)) {
1306                         return 1;
1307                 }
1308                 scanpt[4]--;
1309                 return 0;
1310         }
1311         if (scanpt[4] > max) {
1312                 return 0;
1313         }
1314
1315         if (matchmin) {
1316                 /* minimal, so try other branch first */
1317                 if (regmatch(preg, regnext(preg, scan))) {
1318                         return 1;
1319                 }
1320                 /* No, so try one more */
1321                 scanpt[4]++;
1322                 if (regmatch(preg, scan + 5)) {
1323                         return 1;
1324                 }
1325                 scanpt[4]--;
1326                 return 0;
1327         }
1328         /* maximal, so try this branch again */
1329         if (scanpt[4] < max) {
1330                 scanpt[4]++;
1331                 if (regmatch(preg, scan + 5)) {
1332                         return 1;
1333                 }
1334                 scanpt[4]--;
1335         }
1336         /* At this point we are at max with no match. Try the other branch */
1337         return regmatch(preg, regnext(preg, scan));
1338 }
1339
1340 /*
1341  - regmatch - main matching routine
1342  *
1343  * Conceptually the strategy is simple:  check to see whether the current
1344  * node matches, call self recursively to see whether the rest matches,
1345  * and then act accordingly.  In practice we make some effort to avoid
1346  * recursion, in particular by going through "ordinary" nodes (that don't
1347  * need to know whether the rest of the match failed) by a loop instead of
1348  * by recursion.
1349  */
1350 /* 0 failure, 1 success */
1351 static int regmatch(regex_t *preg, int prog)
1352 {
1353         int scan;       /* Current node. */
1354         int next;               /* Next node. */
1355         const char *save;
1356
1357         scan = prog;
1358
1359 #ifdef DEBUG
1360         if (scan != 0 && regnarrate)
1361                 fprintf(stderr, "%s(\n", regprop(scan));
1362 #endif
1363         while (scan != 0) {
1364                 int n;
1365                 int c;
1366 #ifdef DEBUG
1367                 if (regnarrate) {
1368                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1369                 }
1370 #endif
1371                 next = regnext(preg, scan);
1372                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1373
1374                 switch (OP(preg, scan)) {
1375                 case BOL:
1376                         if (preg->reginput != preg->regbol)
1377                                 return(0);
1378                         break;
1379                 case EOL:
1380                         if (!reg_iseol(preg, c)) {
1381                                 return(0);
1382                         }
1383                         break;
1384                 case WORDA:
1385                         /* Must be looking at a letter, digit, or _ */
1386                         if ((!isalnum(UCHAR(c))) && c != '_')
1387                                 return(0);
1388                         /* Prev must be BOL or nonword */
1389                         if (preg->reginput > preg->regbol &&
1390                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1391                                 return(0);
1392                         break;
1393                 case WORDZ:
1394                         /* Can't match at BOL */
1395                         if (preg->reginput > preg->regbol) {
1396                                 /* Current must be EOL or nonword */
1397                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1398                                         c = preg->reginput[-1];
1399                                         /* Previous must be word */
1400                                         if (isalnum(UCHAR(c)) || c == '_') {
1401                                                 break;
1402                                         }
1403                                 }
1404                         }
1405                         /* No */
1406                         return(0);
1407
1408                 case ANY:
1409                         if (reg_iseol(preg, c))
1410                                 return 0;
1411                         preg->reginput += n;
1412                         break;
1413                 case EXACTLY: {
1414                                 int opnd;
1415                                 int len;
1416                                 int slen;
1417
1418                                 opnd = OPERAND(scan);
1419                                 len = str_int_len(preg->program + opnd);
1420
1421                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1422                                 if (slen < 0) {
1423                                         return(0);
1424                                 }
1425                                 preg->reginput += slen;
1426                         }
1427                         break;
1428                 case ANYOF:
1429                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1430                                 return(0);
1431                         }
1432                         preg->reginput += n;
1433                         break;
1434                 case ANYBUT:
1435                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1436                                 return(0);
1437                         }
1438                         preg->reginput += n;
1439                         break;
1440                 case NOTHING:
1441                         break;
1442                 case BACK:
1443                         break;
1444                 case BRANCH:
1445                         if (OP(preg, next) != BRANCH)           /* No choice. */
1446                                 next = OPERAND(scan);   /* Avoid recursion. */
1447                         else {
1448                                 do {
1449                                         save = preg->reginput;
1450                                         if (regmatch(preg, OPERAND(scan))) {
1451                                                 return(1);
1452                                         }
1453                                         preg->reginput = save;
1454                                         scan = regnext(preg, scan);
1455                                 } while (scan != 0 && OP(preg, scan) == BRANCH);
1456                                 return(0);
1457                                 /* NOTREACHED */
1458                         }
1459                         break;
1460                 case REP:
1461                 case REPMIN:
1462                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1463
1464                 case REPX:
1465                 case REPXMIN:
1466                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1467
1468                 case END:
1469                         return 1;       /* Success! */
1470
1471                 case OPENNC:
1472                 case CLOSENC:
1473                         return regmatch(preg, next);
1474
1475                 default:
1476                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1477                                 save = preg->reginput;
1478                                 if (regmatch(preg, next)) {
1479                                         if (OP(preg, scan) < CLOSE) {
1480                                                 int no = OP(preg, scan) - OPEN;
1481                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1482                                                         preg->pmatch[no].rm_so = save - preg->start;
1483                                                 }
1484                                         }
1485                                         else {
1486                                                 int no = OP(preg, scan) - CLOSE;
1487                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1488                                                         preg->pmatch[no].rm_eo = save - preg->start;
1489                                                 }
1490                                         }
1491                                         return(1);
1492                                 }
1493                                 return(0);
1494                         }
1495                         return REG_ERR_INTERNAL;
1496                 }
1497
1498                 scan = next;
1499         }
1500
1501         /*
1502          * We get here only if there's trouble -- normally "case END" is
1503          * the terminating point.
1504          */
1505         return REG_ERR_INTERNAL;
1506 }
1507
1508 /*
1509  - regrepeat - repeatedly match something simple, report how many
1510  */
1511 static int regrepeat(regex_t *preg, int p, int max)
1512 {
1513         int count = 0;
1514         const char *scan;
1515         int opnd;
1516         int ch;
1517         int n;
1518
1519         scan = preg->reginput;
1520         opnd = OPERAND(p);
1521         switch (OP(preg, p)) {
1522         case ANY:
1523                 /* No need to handle utf8 specially here */
1524                 while (!reg_iseol(preg, *scan) && count < max) {
1525                         count++;
1526                         scan++;
1527                 }
1528                 break;
1529         case EXACTLY:
1530                 while (count < max) {
1531                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1532                         if (preg->program[opnd] != ch) {
1533                                 break;
1534                         }
1535                         count++;
1536                         scan += n;
1537                 }
1538                 break;
1539         case ANYOF:
1540                 while (count < max) {
1541                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1542                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1543                                 break;
1544                         }
1545                         count++;
1546                         scan += n;
1547                 }
1548                 break;
1549         case ANYBUT:
1550                 while (count < max) {
1551                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1552                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1553                                 break;
1554                         }
1555                         count++;
1556                         scan += n;
1557                 }
1558                 break;
1559         default:                /* Oh dear.  Called inappropriately. */
1560                 preg->err = REG_ERR_INTERNAL;
1561                 count = 0;      /* Best compromise. */
1562                 break;
1563         }
1564         preg->reginput = scan;
1565
1566         return(count);
1567 }
1568
1569 /*
1570  - regnext - dig the "next" pointer out of a node
1571  */
1572 static int regnext(regex_t *preg, int p )
1573 {
1574         int offset;
1575
1576         offset = NEXT(preg, p);
1577
1578         if (offset == 0)
1579                 return 0;
1580
1581         if (OP(preg, p) == BACK)
1582                 return(p-offset);
1583         else
1584                 return(p+offset);
1585 }
1586
1587 /*
1588  - regopsize - returns the size of opcode + operands at 'p' in words
1589  */
1590 static int regopsize(regex_t *preg, int p )
1591 {
1592         /* Almost all opcodes are 2 words, but some are more */
1593         switch (OP(preg, p)) {
1594                 case REP:
1595                 case REPMIN:
1596                 case REPX:
1597                 case REPXMIN:
1598                         return 5;
1599
1600                 case ANYOF:
1601                 case ANYBUT:
1602                 case EXACTLY: {
1603                         int s = p + 2;
1604                         while (preg->program[s++]) {
1605                         }
1606                         return s - p;
1607                 }
1608         }
1609         return 2;
1610 }
1611
1612 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1613
1614 /*
1615  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1616  */
1617 static void regdump(regex_t *preg)
1618 {
1619         int s;
1620         int op = EXACTLY;       /* Arbitrary non-END op. */
1621         int next;
1622         char buf[MAX_UTF8_LEN + 1];
1623
1624         int i;
1625         for (i = 1; i < preg->p; i++) {
1626                 printf("%02x ", (unsigned char)preg->program[i]);
1627                 if (i % 16 == 0) {
1628                         printf("\n");
1629                 }
1630         }
1631         printf("\n");
1632
1633         s = 1;
1634         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1635                 op = OP(preg, s);
1636                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1637                 next = regnext(preg, s);
1638                 if (next == 0)          /* Next ptr. */
1639                         printf("(0)");
1640                 else
1641                         printf("(%d)", next);
1642                 s += 2;
1643                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1644                         int max = preg->program[s];
1645                         int min = preg->program[s + 1];
1646                         if (max == 65535) {
1647                                 printf("{%d,*}", min);
1648                         }
1649                         else {
1650                                 printf("{%d,%d}", min, max);
1651                         }
1652                         printf(" %d", preg->program[s + 2]);
1653                         s += 3;
1654                 }
1655                 else if (op == ANYOF || op == ANYBUT) {
1656                         /* set of ranges */
1657
1658                         while (preg->program[s]) {
1659                                 int len = preg->program[s++];
1660                                 int first = preg->program[s++];
1661                                 buf[utf8_getchars(buf, first)] = 0;
1662                                 printf("%s", buf);
1663                                 if (len > 1) {
1664                                         buf[utf8_getchars(buf, first + len - 1)] = 0;
1665                                         printf("-%s", buf);
1666                                 }
1667                         }
1668                         s++;
1669                 }
1670                 else if (op == EXACTLY) {
1671                         /* Literal string, where present. */
1672
1673                         while (preg->program[s]) {
1674                                 buf[utf8_getchars(buf, preg->program[s])] = 0;
1675                                 printf("%s", buf);
1676                                 s++;
1677                         }
1678                         s++;
1679                 }
1680                 putchar('\n');
1681         }
1682
1683         if (op == END) {
1684                 /* Header fields of interest. */
1685                 if (preg->regstart) {
1686                         buf[utf8_getchars(buf, preg->regstart)] = 0;
1687                         printf("start '%s' ", buf);
1688                 }
1689                 if (preg->reganch)
1690                         printf("anchored ");
1691                 if (preg->regmust != 0) {
1692                         int i;
1693                         printf("must have:");
1694                         for (i = 0; i < preg->regmlen; i++) {
1695                                 putchar(preg->program[preg->regmust + i]);
1696                         }
1697                         putchar('\n');
1698                 }
1699         }
1700         printf("\n");
1701 }
1702
1703 /*
1704  - regprop - printable representation of opcode
1705  */
1706 static const char *regprop( int op )
1707 {
1708         static char buf[50];
1709
1710         switch (op) {
1711         case BOL:
1712                 return "BOL";
1713         case EOL:
1714                 return "EOL";
1715         case ANY:
1716                 return "ANY";
1717         case ANYOF:
1718                 return "ANYOF";
1719         case ANYBUT:
1720                 return "ANYBUT";
1721         case BRANCH:
1722                 return "BRANCH";
1723         case EXACTLY:
1724                 return "EXACTLY";
1725         case NOTHING:
1726                 return "NOTHING";
1727         case BACK:
1728                 return "BACK";
1729         case END:
1730                 return "END";
1731         case REP:
1732                 return "REP";
1733         case REPMIN:
1734                 return "REPMIN";
1735         case REPX:
1736                 return "REPX";
1737         case REPXMIN:
1738                 return "REPXMIN";
1739         case WORDA:
1740                 return "WORDA";
1741         case WORDZ:
1742                 return "WORDZ";
1743         case OPENNC:
1744                 return "OPEN";
1745         case CLOSENC:
1746                 return "CLOSE";
1747         default:
1748                 if (op >= OPEN && op < CLOSE) {
1749                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1750                 }
1751                 else if (op >= CLOSE && op < CLOSE_END) {
1752                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1753                 }
1754                 else {
1755                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1756                 }
1757                 return(buf);
1758         }
1759 }
1760 #endif /* JIM_BOOTSTRAP */
1761
1762 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1763 {
1764         static const char *error_strings[] = {
1765                 "success",
1766                 "no match",
1767                 "bad pattern",
1768                 "null argument",
1769                 "unknown error",
1770                 "too big",
1771                 "out of memory",
1772                 "too many ()",
1773                 "parentheses () not balanced",
1774                 "braces {} not balanced",
1775                 "invalid repetition count(s)",
1776                 "extra characters",
1777                 "*+ of empty atom",
1778                 "nested count",
1779                 "internal error",
1780                 "count follows nothing",
1781                 "trailing backslash",
1782                 "corrupted program",
1783                 "contains null char",
1784         };
1785         const char *err;
1786
1787         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1788                 err = "Bad error code";
1789         }
1790         else {
1791                 err = error_strings[errcode];
1792         }
1793
1794         return snprintf(errbuf, errbuf_size, "%s", err);
1795 }
1796
1797 void regfree(regex_t *preg)
1798 {
1799         free(preg->program);
1800 }
1801
1802 #endif