jimregexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *      Copyright (c) 1986 by University of Toronto.
   5  *      Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *      Permission is granted to anyone to use this software for any
   8  *      purpose on any computer system, and to redistribute it freely,
   9  *      subject to the following restrictions:
  10  *
  11  *      1. The author is not responsible for the consequences of use of
  12  *              this software, no matter how awful, even if they arise
  13  *              from defects in it.
  14  *
  15  *      2. The origin of this software must not be misrepresented, either
  16  *              by explicit claim or by omission.
  17  *
  18  *      3. Altered versions must be plainly marked as such, and must not
  19  *              be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences. It also removes \n as an alternative to |.
  47  *
  48  * Beware that some of this code is subtly aware of the way operator
  49  * precedence is structured in regular expressions.  Serious changes in
  50  * regular-expression syntax might require a total rethink.
  51  */
  52 #include <stdio.h>
  53 #include <ctype.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56
  57 #include "jim.h"
  58 #include "jimautoconf.h"
  59 #include "jimregexp.h"
  60 #include "utf8.h"
  61
  62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
  63
  64 /*
  65  * Structure for regexp "program".  This is essentially a linear encoding
  66  * of a nondeterministic finite-state machine (aka syntax charts or
  67  * "railroad normal form" in parsing technology).  Each node is an opcode
  68  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  69  * all nodes except BRANCH implement concatenation; a "next" pointer with
  70  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  71  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  72  * opposed to a collection of them) is never concatenated with anything
  73  * because of operator precedence.)  The operand of some types of node is
  74  * a literal string; for others, it is a node leading into a sub-FSM.  In
  75  * particular, the operand of a BRANCH node is the first node of the branch.
  76  * (NB this is *not* a tree structure:  the tail of the branch connects
  77  * to the thing following the set of BRANCHes.)  The opcodes are:
  78  */
  79
  80 /* This *MUST* be less than (255-20-1)/2=117 */
  81 #define REG_MAX_PAREN 100
  82
  83 /* definition   number  opnd?   meaning */
  84 #define END     0       /* no   End of program. */
  85 #define BOL     1       /* no   Match "" at beginning of line. */
  86 #define EOL     2       /* no   Match "" at end of line. */
  87 #define ANY     3       /* no   Match any one character. */
  88 #define ANYOF   4       /* str  Match any character in this string. */
  89 #define ANYBUT  5       /* str  Match any character not in this string. */
  90 #define BRANCH  6       /* node Match this alternative, or the next... */
  91 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  92 #define EXACTLY 8       /* str  Match this string. */
  93 #define NOTHING 9       /* no   Match empty string. */
  94 #define REP             10      /* max,min      Match this (simple) thing [min,max] times. */
  95 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, mininal match. */
  96 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
  97 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
  98
  99 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 100 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 101 #define OPENNC  19      /* no   Non-capturing parentheses - must be OPEN-1 */
 102 #define OPEN    20      /* no   Mark this point in input as start of #n. */
 103                         /*      OPEN+1 is number 1, etc. */
 104 #define CLOSE   (OPEN+REG_MAX_PAREN+1)  /* no   Analogous to OPEN. */
 105 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 106 #define CLOSENC (CLOSE-1)       /* no   Non-capturing parentheses - must be CLOSE-1 */
 107
 108 /*
 109  * The first byte of the regexp internal "program" is actually this magic
 110  * number; the start node begins in the second byte.
 111  */
 112 #define REG_MAGIC       0xFADED00D
 113
 114 /*
 115  * Opcode notes:
 116  *
 117  * BRANCH       The set of branches constituting a single choice are hooked
 118  *              together with their "next" pointers, since precedence prevents
 119  *              anything being concatenated to any individual branch.  The
 120  *              "next" pointer of the last BRANCH in a choice points to the
 121  *              thing following the whole choice.  This is also where the
 122  *              final "next" pointer of each individual branch points; each
 123  *              branch starts with the operand node of a BRANCH node.
 124  *
 125  * BACK         Normal "next" pointers all implicitly point forward; BACK
 126  *              exists to make loop structures possible.
 127  *
 128  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 129  *              BRANCH structures using BACK.  Simple cases (one character
 130  *              per match) are implemented with STAR and PLUS for speed
 131  *              and to minimize recursive plunges.
 132  *
 133  * OPEN,CLOSE   ...are numbered at compile time.
 134  */
 135
 136 /*
 137  * A node is one char of opcode followed by two chars of "next" pointer.
 138  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 139  * value is a positive offset from the opcode of the node containing it.
 140  * An operand, if any, simply follows the node.  (Note that much of the
 141  * code generation knows about this implicit relationship.)
 142  *
 143  * Using two bytes for the "next" pointer is vast overkill for most things,
 144  * but allows patterns to get big without disasters.
 145  */
 146 #define OP(preg, p)     (preg->program[p])
 147 #define NEXT(preg, p)   (preg->program[p + 1])
 148 #define OPERAND(p)      ((p) + 2)
 149
 150 /*
 151  * See regmagic.h for one further detail of program structure.
 152  */
 153
 154
 155 /*
 156  * Utility definitions.
 157  */
 158
 159 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 160 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 161 #define META    "^$.[()|?{+*"
 162
 163 /*
 164  * Flags to be passed up and down.
 165  */
 166 #define HASWIDTH        01      /* Known never to match null string. */
 167 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
 168 #define SPSTART         04      /* Starts with * or +. */
 169 #define WORST           0       /* Worst case. */
 170
 171 #define MAX_REP_COUNT 1000000
 172
 173 /*
 174  * Forward declarations for regcomp()'s friends.
 175  */
 176 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 177 static int regpiece(regex_t *preg, int *flagp );
 178 static int regbranch(regex_t *preg, int *flagp );
 179 static int regatom(regex_t *preg, int *flagp );
 180 static int regnode(regex_t *preg, int op );
 181 static int regnext(regex_t *preg, int p );
 182 static void regc(regex_t *preg, int b );
 183 static int reginsert(regex_t *preg, int op, int size, int opnd );
 184 static void regtail_(regex_t *preg, int p, int val, int line );
 185 static void regoptail(regex_t *preg, int p, int val );
 186 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
 187
 188 static int reg_range_find(const int *string, int c);
 189 static const char *str_find(const char *string, int c, int nocase);
 190 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 191
 192 /*#define DEBUG*/
 193 #ifdef DEBUG
 194 static int regnarrate = 0;
 195 static void regdump(regex_t *preg);
 196 static const char *regprop( int op );
 197 #endif
 198
 199
 200 /**
 201  * Returns the length of the null-terminated integer sequence.
 202  */
 203 static int str_int_len(const int *seq)
 204 {
 205         int n = 0;
 206         while (*seq++) {
 207                 n++;
 208         }
 209         return n;
 210 }
 211
 212 /*
 213  - regcomp - compile a regular expression into internal code
 214  *
 215  * We can't allocate space until we know how big the compiled form will be,
 216  * but we can't compile it (and thus know how big it is) until we've got a
 217  * place to put the code.  So we cheat:  we compile it twice, once with code
 218  * generation turned off and size counting turned on, and once "for real".
 219  * This also means that we don't allocate space until we are sure that the
 220  * thing really will compile successfully, and we never have to move the
 221  * code and thus invalidate pointers into it.  (Note that it has to be in
 222  * one piece because free() must be able to free it all.)
 223  *
 224  * Beware that the optimization-preparation code in here knows about some
 225  * of the structure of the compiled regexp.
 226  */
 227 int regcomp(regex_t *preg, const char *exp, int cflags)
 228 {
 229         int scan;
 230         int longest;
 231         unsigned len;
 232         int flags;
 233
 234 #ifdef DEBUG
 235         fprintf(stderr, "Compiling: '%s'\n", exp);
 236 #endif
 237         memset(preg, 0, sizeof(*preg));
 238
 239         if (exp == NULL)
 240                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 241
 242         /* First pass: determine size, legality. */
 243         preg->cflags = cflags;
 244         preg->regparse = exp;
 245         /* XXX: For now, start unallocated */
 246         preg->program = NULL;
 247         preg->proglen = 0;
 248
 249         /* Allocate space. */
 250         preg->proglen = (strlen(exp) + 1) * 5;
 251         preg->program = malloc(preg->proglen * sizeof(int));
 252         if (preg->program == NULL)
 253                 FAIL(preg, REG_ERR_NOMEM);
 254
 255         /* Note that since we store a magic value as the first item in the program,
 256          * program offsets will never be 0
 257          */
 258         regc(preg, REG_MAGIC);
 259         if (reg(preg, 0, &flags) == 0) {
 260                 return preg->err;
 261         }
 262
 263         /* Small enough for pointer-storage convention? */
 264         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 265                 FAIL(preg,REG_ERR_TOO_BIG);
 266
 267         /* Dig out information for optimizations. */
 268         preg->regstart = 0;     /* Worst-case defaults. */
 269         preg->reganch = 0;
 270         preg->regmust = 0;
 271         preg->regmlen = 0;
 272         scan = 1;                       /* First BRANCH. */
 273         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 274                 scan = OPERAND(scan);
 275
 276                 /* Starting-point info. */
 277                 if (OP(preg, scan) == EXACTLY) {
 278                         preg->regstart = preg->program[OPERAND(scan)];
 279                 }
 280                 else if (OP(preg, scan) == BOL)
 281                         preg->reganch++;
 282
 283                 /*
 284                  * If there's something expensive in the r.e., find the
 285                  * longest literal string that must appear and make it the
 286                  * regmust.  Resolve ties in favor of later strings, since
 287                  * the regstart check works with the beginning of the r.e.
 288                  * and avoiding duplication strengthens checking.  Not a
 289                  * strong reason, but sufficient in the absence of others.
 290                  */
 291                 if (flags&SPSTART) {
 292                         longest = 0;
 293                         len = 0;
 294                         for (; scan != 0; scan = regnext(preg, scan)) {
 295                                 if (OP(preg, scan) == EXACTLY) {
 296                                         int plen = str_int_len(preg->program + OPERAND(scan));
 297                                         if (plen >= len) {
 298                                                 longest = OPERAND(scan);
 299                                                 len = plen;
 300                                         }
 301                                 }
 302                         }
 303                         preg->regmust = longest;
 304                         preg->regmlen = len;
 305                 }
 306         }
 307
 308 #ifdef DEBUG
 309         regdump(preg);
 310 #endif
 311
 312         return 0;
 313 }
 314
 315 /*
 316  - reg - regular expression, i.e. main body or parenthesized thing
 317  *
 318  * Caller must absorb opening parenthesis.
 319  *
 320  * Combining parenthesis handling with the base level of regular expression
 321  * is a trifle forced, but the need to tie the tails of the branches to what
 322  * follows makes it hard to avoid.
 323  */
 324 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 325 {
 326         int ret;
 327         int br;
 328         int ender;
 329         int parno = 0;
 330         int flags;
 331
 332         *flagp = HASWIDTH;      /* Tentatively. */
 333
 334         /* Make an OPEN node, if parenthesized. */
 335         if (paren) {
 336                 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
 337                         /* non-capturing paren */
 338                         preg->regparse += 2;
 339                         parno = -1;
 340                 }
 341                 else {
 342                         parno = ++preg->re_nsub;
 343                 }
 344                 ret = regnode(preg, OPEN+parno);
 345         } else
 346                 ret = 0;
 347
 348         /* Pick up the branches, linking them together. */
 349         br = regbranch(preg, &flags);
 350         if (br == 0)
 351                 return 0;
 352         if (ret != 0)
 353                 regtail(preg, ret, br); /* OPEN -> first. */
 354         else
 355                 ret = br;
 356         if (!(flags&HASWIDTH))
 357                 *flagp &= ~HASWIDTH;
 358         *flagp |= flags&SPSTART;
 359         while (*preg->regparse == '|') {
 360                 preg->regparse++;
 361                 br = regbranch(preg, &flags);
 362                 if (br == 0)
 363                         return 0;
 364                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 365                 if (!(flags&HASWIDTH))
 366                         *flagp &= ~HASWIDTH;
 367                 *flagp |= flags&SPSTART;
 368         }
 369
 370         /* Make a closing node, and hook it on the end. */
 371         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 372         regtail(preg, ret, ender);
 373
 374         /* Hook the tails of the branches to the closing node. */
 375         for (br = ret; br != 0; br = regnext(preg, br))
 376                 regoptail(preg, br, ender);
 377
 378         /* Check for proper termination. */
 379         if (paren && *preg->regparse++ != ')') {
 380                 preg->err = REG_ERR_UNMATCHED_PAREN;
 381                 return 0;
 382         } else if (!paren && *preg->regparse != '\0') {
 383                 if (*preg->regparse == ')') {
 384                         preg->err = REG_ERR_UNMATCHED_PAREN;
 385                         return 0;
 386                 } else {
 387                         preg->err = REG_ERR_JUNK_ON_END;
 388                         return 0;
 389                 }
 390         }
 391
 392         return(ret);
 393 }
 394
 395 /*
 396  - regbranch - one alternative of an | operator
 397  *
 398  * Implements the concatenation operator.
 399  */
 400 static int regbranch(regex_t *preg, int *flagp )
 401 {
 402         int ret;
 403         int chain;
 404         int latest;
 405         int flags;
 406
 407         *flagp = WORST;         /* Tentatively. */
 408
 409         ret = regnode(preg, BRANCH);
 410         chain = 0;
 411         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 412                *preg->regparse != '|') {
 413                 latest = regpiece(preg, &flags);
 414                 if (latest == 0)
 415                         return 0;
 416                 *flagp |= flags&HASWIDTH;
 417                 if (chain == 0) {/* First piece. */
 418                         *flagp |= flags&SPSTART;
 419                 }
 420                 else {
 421                         regtail(preg, chain, latest);
 422                 }
 423                 chain = latest;
 424         }
 425         if (chain == 0) /* Loop ran zero times. */
 426                 (void) regnode(preg, NOTHING);
 427
 428         return(ret);
 429 }
 430
 431 /*
 432  - regpiece - something followed by possible [*+?]
 433  *
 434  * Note that the branching code sequences used for ? and the general cases
 435  * of * and + are somewhat optimized:  they use the same NOTHING node as
 436  * both the endmarker for their branch list and the body of the last branch.
 437  * It might seem that this node could be dispensed with entirely, but the
 438  * endmarker role is not redundant.
 439  */
 440 static int regpiece(regex_t *preg, int *flagp)
 441 {
 442         int ret;
 443         char op;
 444         int next;
 445         int flags;
 446         int chain = 0;
 447         int min;
 448         int max;
 449
 450         ret = regatom(preg, &flags);
 451         if (ret == 0)
 452                 return 0;
 453
 454         op = *preg->regparse;
 455         if (!ISMULT(op)) {
 456                 *flagp = flags;
 457                 return(ret);
 458         }
 459
 460         if (!(flags&HASWIDTH) && op != '?') {
 461                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 462                 return 0;
 463         }
 464
 465         /* Handle braces (counted repetition) by expansion */
 466         if (op == '{') {
 467                 char *end;
 468
 469                 min = strtoul(preg->regparse + 1, &end, 10);
 470                 if (end == preg->regparse + 1) {
 471                         preg->err = REG_ERR_BAD_COUNT;
 472                         return 0;
 473                 }
 474                 if (*end == '}') {
 475                         max = min;
 476                 }
 477                 else {
 478                         preg->regparse = end;
 479                         max = strtoul(preg->regparse + 1, &end, 10);
 480                         if (*end != '}') {
 481                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 482                                 return 0;
 483                         }
 484                 }
 485                 if (end == preg->regparse + 1) {
 486                         max = MAX_REP_COUNT;
 487                 }
 488                 else if (max < min || max >= 100) {
 489                         preg->err = REG_ERR_BAD_COUNT;
 490                         return 0;
 491                 }
 492                 if (min >= 100) {
 493                         preg->err = REG_ERR_BAD_COUNT;
 494                         return 0;
 495                 }
 496
 497                 preg->regparse = strchr(preg->regparse, '}');
 498         }
 499         else {
 500                 min = (op == '+');
 501                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 502         }
 503
 504         if (preg->regparse[1] == '?') {
 505                 preg->regparse++;
 506                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 507         }
 508         else {
 509                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 510         }
 511         preg->program[ret + 2] = max;
 512         preg->program[ret + 3] = min;
 513         preg->program[ret + 4] = 0;
 514
 515         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 516
 517         if (!(flags & SIMPLE)) {
 518                 int back = regnode(preg, BACK);
 519                 regtail(preg, back, ret);
 520                 regtail(preg, next, back);
 521         }
 522
 523         preg->regparse++;
 524         if (ISMULT(*preg->regparse)) {
 525                 preg->err = REG_ERR_NESTED_COUNT;
 526                 return 0;
 527         }
 528
 529         return chain ? chain : ret;
 530 }
 531
 532 /**
 533  * Add all characters in the inclusive range between lower and upper.
 534  *
 535  * Handles a swapped range (upper < lower).
 536  */
 537 static void reg_addrange(regex_t *preg, int lower, int upper)
 538 {
 539         if (lower > upper) {
 540                 reg_addrange(preg, upper, lower);
 541         }
 542         /* Add a range as length, start */
 543         regc(preg, upper - lower + 1);
 544         regc(preg, lower);
 545 }
 546
 547 /**
 548  * Add a null-terminated literal string as a set of ranges.
 549  */
 550 static void reg_addrange_str(regex_t *preg, const char *str)
 551 {
 552         while (*str) {
 553                 reg_addrange(preg, *str, *str);
 554                 str++;
 555         }
 556 }
 557
 558 /**
 559  * Extracts the next unicode char from utf8.
 560  *
 561  * If 'upper' is set, converts the char to uppercase.
 562  */
 563 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 564 {
 565         int l = utf8_tounicode(s, uc);
 566         if (upper) {
 567                 *uc = utf8_upper(*uc);
 568         }
 569         return l;
 570 }
 571
 572 /**
 573  * Converts a hex digit to decimal.
 574  *
 575  * Returns -1 for an invalid hex digit.
 576  */
 577 static int hexdigitval(int c)
 578 {
 579         if (c >= '0' && c <= '9')
 580                 return c - '0';
 581         if (c >= 'a' && c <= 'f')
 582                 return c - 'a' + 10;
 583         if (c >= 'A' && c <= 'F')
 584                 return c - 'A' + 10;
 585         return -1;
 586 }
 587
 588 /**
 589  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 590  *
 591  * Returns the number of hex digits parsed.
 592  * If there are no hex digits, returns 0 and stores nothing.
 593  */
 594 static int parse_hex(const char *s, int n, int *uc)
 595 {
 596         int val = 0;
 597         int k;
 598
 599         for (k = 0; k < n; k++) {
 600                 int c = hexdigitval(*s++);
 601                 if (c == -1) {
 602                         break;
 603                 }
 604                 val = (val << 4) | c;
 605         }
 606         if (k) {
 607                 *uc = val;
 608         }
 609         return k;
 610 }
 611
 612 /**
 613  * Call for chars after a backlash to decode the escape sequence.
 614  *
 615  * Stores the result in *ch.
 616  *
 617  * Returns the number of bytes consumed.
 618  */
 619 static int reg_decode_escape(const char *s, int *ch)
 620 {
 621         int n;
 622         const char *s0 = s;
 623
 624         *ch = *s++;
 625
 626         switch (*ch) {
 627                 case 'b': *ch = '\b'; break;
 628                 case 'e': *ch = 27; break;
 629                 case 'f': *ch = '\f'; break;
 630                 case 'n': *ch = '\n'; break;
 631                 case 'r': *ch = '\r'; break;
 632                 case 't': *ch = '\t'; break;
 633                 case 'v': *ch = '\v'; break;
 634                 case 'u':
 635                         if ((n = parse_hex(s, 4, ch)) > 0) {
 636                                 s += n;
 637                         }
 638                         break;
 639                 case 'x':
 640                         if ((n = parse_hex(s, 2, ch)) > 0) {
 641                                 s += n;
 642                         }
 643                         break;
 644                 case '\0':
 645                         s--;
 646                         *ch = '\\';
 647                         break;
 648         }
 649         return s - s0;
 650 }
 651
 652 /*
 653  - regatom - the lowest level
 654  *
 655  * Optimization:  gobbles an entire sequence of ordinary characters so that
 656  * it can turn them into a single node, which is smaller to store and
 657  * faster to run.  Backslashed characters are exceptions, each becoming a
 658  * separate node; the code is simpler that way and it's not worth fixing.
 659  */
 660 static int regatom(regex_t *preg, int *flagp)
 661 {
 662         int ret;
 663         int flags;
 664         int nocase = (preg->cflags & REG_ICASE);
 665
 666         int ch;
 667         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 668
 669         *flagp = WORST;         /* Tentatively. */
 670
 671         preg->regparse += n;
 672         switch (ch) {
 673         /* FIXME: these chars only have meaning at beg/end of pat? */
 674         case '^':
 675                 ret = regnode(preg, BOL);
 676                 break;
 677         case '$':
 678                 ret = regnode(preg, EOL);
 679                 break;
 680         case '.':
 681                 ret = regnode(preg, ANY);
 682                 *flagp |= HASWIDTH|SIMPLE;
 683                 break;
 684         case '[': {
 685                         const char *pattern = preg->regparse;
 686
 687                         if (*pattern == '^') {  /* Complement of range. */
 688                                 ret = regnode(preg, ANYBUT);
 689                                 pattern++;
 690                         } else
 691                                 ret = regnode(preg, ANYOF);
 692
 693                         /* Special case. If the first char is ']' or '-', it is part of the set */
 694                         if (*pattern == ']' || *pattern == '-') {
 695                                 reg_addrange(preg, *pattern, *pattern);
 696                                 pattern++;
 697                         }
 698
 699                         while (*pattern && *pattern != ']') {
 700                                 /* Is this a range? a-z */
 701                                 int start;
 702                                 int end;
 703
 704                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 705                                 if (start == '\\') {
 706                                         pattern += reg_decode_escape(pattern, &start);
 707                                         if (start == 0) {
 708                                                 preg->err = REG_ERR_NULL_CHAR;
 709                                                 return 0;
 710                                         }
 711                                 }
 712                                 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
 713                                         /* skip '-' */
 714                                         pattern += utf8_tounicode(pattern, &end);
 715                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 716                                         if (end == '\\') {
 717                                                 pattern += reg_decode_escape(pattern, &end);
 718                                                 if (end == 0) {
 719                                                         preg->err = REG_ERR_NULL_CHAR;
 720                                                         return 0;
 721                                                 }
 722                                         }
 723
 724                                         reg_addrange(preg, start, end);
 725                                         continue;
 726                                 }
 727                                 if (start == '[') {
 728                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 729                                                 if ((preg->cflags & REG_ICASE) == 0) {
 730                                                         reg_addrange(preg, 'a', 'z');
 731                                                 }
 732                                                 reg_addrange(preg, 'A', 'Z');
 733                                                 pattern += 8;
 734                                                 continue;
 735                                         }
 736                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 737                                                 if ((preg->cflags & REG_ICASE) == 0) {
 738                                                         reg_addrange(preg, 'a', 'z');
 739                                                 }
 740                                                 reg_addrange(preg, 'A', 'Z');
 741                                                 reg_addrange(preg, '0', '9');
 742                                                 pattern += 8;
 743                                                 continue;
 744                                         }
 745                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 746                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 747                                                 pattern += 8;
 748                                                 continue;
 749                                         }
 750                                 }
 751                                 /* Not a range, so just add the char */
 752                                 reg_addrange(preg, start, start);
 753                         }
 754                         regc(preg, '\0');
 755
 756                         if (*pattern) {
 757                                 pattern++;
 758                         }
 759                         preg->regparse = pattern;
 760
 761                         *flagp |= HASWIDTH|SIMPLE;
 762                 }
 763                 break;
 764         case '(':
 765                 ret = reg(preg, 1, &flags);
 766                 if (ret == 0)
 767                         return 0;
 768                 *flagp |= flags&(HASWIDTH|SPSTART);
 769                 break;
 770         case '\0':
 771         case '|':
 772         case ')':
 773                 preg->err = REG_ERR_INTERNAL;
 774                 return 0;       /* Supposed to be caught earlier. */
 775         case '?':
 776         case '+':
 777         case '*':
 778         case '{':
 779                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 780                 return 0;
 781         case '\\':
 782                 switch (*preg->regparse++) {
 783                 case '\0':
 784                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 785                         return 0;
 786                 case '<':
 787                 case 'm':
 788                         ret = regnode(preg, WORDA);
 789                         break;
 790                 case '>':
 791                 case 'M':
 792                         ret = regnode(preg, WORDZ);
 793                         break;
 794                 case 'd':
 795                         ret = regnode(preg, ANYOF);
 796                         reg_addrange(preg, '0', '9');
 797                         regc(preg, '\0');
 798                         *flagp |= HASWIDTH|SIMPLE;
 799                         break;
 800                 case 'w':
 801                         ret = regnode(preg, ANYOF);
 802                         if ((preg->cflags & REG_ICASE) == 0) {
 803                                 reg_addrange(preg, 'a', 'z');
 804                         }
 805                         reg_addrange(preg, 'A', 'Z');
 806                         reg_addrange(preg, '0', '9');
 807                         reg_addrange(preg, '_', '_');
 808                         regc(preg, '\0');
 809                         *flagp |= HASWIDTH|SIMPLE;
 810                         break;
 811                 case 's':
 812                         ret = regnode(preg, ANYOF);
 813                         reg_addrange_str(preg," \t\r\n\f\v");
 814                         regc(preg, '\0');
 815                         *flagp |= HASWIDTH|SIMPLE;
 816                         break;
 817                 /* FIXME: Someday handle \1, \2, ... */
 818                 default:
 819                         /* Handle general quoted chars in exact-match routine */
 820                         /* Back up to include the backslash */
 821                         preg->regparse--;
 822                         goto de_fault;
 823                 }
 824                 break;
 825         de_fault:
 826         default: {
 827                         /*
 828                          * Encode a string of characters to be matched exactly.
 829                          */
 830                         int added = 0;
 831
 832                         /* Back up to pick up the first char of interest */
 833                         preg->regparse -= n;
 834
 835                         ret = regnode(preg, EXACTLY);
 836
 837                         /* Note that a META operator such as ? or * consumes the
 838                          * preceding char.
 839                          * Thus we must be careful to look ahead by 2 and add the
 840                          * last char as it's own EXACTLY if necessary
 841                          */
 842
 843                         /* Until end of string or a META char is reached */
 844                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 845                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 846                                 if (ch == '\\' && preg->regparse[n]) {
 847                                         /* Non-trailing backslash.
 848                                          * Is this a special escape, or a regular escape?
 849                                          */
 850                                         if (strchr("<>mMwds", preg->regparse[n])) {
 851                                                 /* A special escape. All done with EXACTLY */
 852                                                 break;
 853                                         }
 854                                         /* Decode it. Note that we add the length for the escape
 855                                          * sequence to the length for the backlash so we can skip
 856                                          * the entire sequence, or not as required.
 857                                          */
 858                                         n += reg_decode_escape(preg->regparse + n, &ch);
 859                                         if (ch == 0) {
 860                                                 preg->err = REG_ERR_NULL_CHAR;
 861                                                 return 0;
 862                                         }
 863                                 }
 864
 865                                 /* Now we have one char 'ch' of length 'n'.
 866                                  * Check to see if the following char is a MULT
 867                                  */
 868
 869                                 if (ISMULT(preg->regparse[n])) {
 870                                         /* Yes. But do we already have some EXACTLY chars? */
 871                                         if (added) {
 872                                                 /* Yes, so return what we have and pick up the current char next time around */
 873                                                 break;
 874                                         }
 875                                         /* No, so add this single char and finish */
 876                                         regc(preg, ch);
 877                                         added++;
 878                                         preg->regparse += n;
 879                                         break;
 880                                 }
 881
 882                                 /* No, so just add this char normally */
 883                                 regc(preg, ch);
 884                                 added++;
 885                                 preg->regparse += n;
 886                         }
 887                         regc(preg, '\0');
 888
 889                         *flagp |= HASWIDTH;
 890                         if (added == 1)
 891                                 *flagp |= SIMPLE;
 892                         break;
 893                 }
 894                 break;
 895         }
 896
 897         return(ret);
 898 }
 899
 900 static void reg_grow(regex_t *preg, int n)
 901 {
 902         if (preg->p + n >= preg->proglen) {
 903                 preg->proglen = (preg->p + n) * 2;
 904                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 905         }
 906 }
 907
 908 /*
 909  - regnode - emit a node
 910  */
 911 /* Location. */
 912 static int regnode(regex_t *preg, int op)
 913 {
 914         reg_grow(preg, 2);
 915
 916         preg->program[preg->p++] = op;
 917         preg->program[preg->p++] = 0;
 918
 919         /* Return the start of the node */
 920         return preg->p - 2;
 921 }
 922
 923 /*
 924  - regc - emit (if appropriate) a byte of code
 925  */
 926 static void regc(regex_t *preg, int b )
 927 {
 928         reg_grow(preg, 1);
 929         preg->program[preg->p++] = b;
 930 }
 931
 932 /*
 933  - reginsert - insert an operator in front of already-emitted operand
 934  *
 935  * Means relocating the operand.
 936  * Returns the new location of the original operand.
 937  */
 938 static int reginsert(regex_t *preg, int op, int size, int opnd )
 939 {
 940         reg_grow(preg, size);
 941
 942         /* Move everything from opnd up */
 943         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
 944         /* Zero out the new space */
 945         memset(preg->program + opnd, 0, sizeof(int) * size);
 946
 947         preg->program[opnd] = op;
 948
 949         preg->p += size;
 950
 951         return opnd + size;
 952 }
 953
 954 /*
 955  - regtail - set the next-pointer at the end of a node chain
 956  */
 957 static void regtail_(regex_t *preg, int p, int val, int line )
 958 {
 959         int scan;
 960         int temp;
 961         int offset;
 962
 963         /* Find last node. */
 964         scan = p;
 965         for (;;) {
 966                 temp = regnext(preg, scan);
 967                 if (temp == 0)
 968                         break;
 969                 scan = temp;
 970         }
 971
 972         if (OP(preg, scan) == BACK)
 973                 offset = scan - val;
 974         else
 975                 offset = val - scan;
 976
 977         preg->program[scan + 1] = offset;
 978 }
 979
 980 /*
 981  - regoptail - regtail on operand of first argument; nop if operandless
 982  */
 983
 984 static void regoptail(regex_t *preg, int p, int val )
 985 {
 986         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
 987         if (p != 0 && OP(preg, p) == BRANCH) {
 988                 regtail(preg, OPERAND(p), val);
 989         }
 990 }
 991
 992 /*
 993  * regexec and friends
 994  */
 995
 996 /*
 997  * Forwards.
 998  */
 999 static int regtry(regex_t *preg, const char *string );
1000 static int regmatch(regex_t *preg, int prog);
1001 static int regrepeat(regex_t *preg, int p, int max);
1002
1003 /*
1004  - regexec - match a regexp against a string
1005  */
1006 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1007 {
1008         const char *s;
1009         int scan;
1010
1011         /* Be paranoid... */
1012         if (preg == NULL || preg->program == NULL || string == NULL) {
1013                 return REG_ERR_NULL_ARGUMENT;
1014         }
1015
1016         /* Check validity of program. */
1017         if (*preg->program != REG_MAGIC) {
1018                 return REG_ERR_CORRUPTED;
1019         }
1020
1021 #ifdef DEBUG
1022         fprintf(stderr, "regexec: %s\n", string);
1023         regdump(preg);
1024 #endif
1025
1026         preg->eflags = eflags;
1027         preg->pmatch = pmatch;
1028         preg->nmatch = nmatch;
1029         preg->start = string;   /* All offsets are computed from here */
1030
1031         /* Must clear out the embedded repeat counts */
1032         for (scan = OPERAND(1); scan != 0; ) {
1033                 switch (OP(preg, scan)) {
1034                 case REP:
1035                 case REPMIN:
1036                 case REPX:
1037                 case REPXMIN:
1038                         preg->program[scan + 4] = 0;
1039                         scan += 5;
1040                         break;
1041
1042                 case ANYOF:
1043                 case ANYBUT:
1044                 case EXACTLY:
1045                         scan += 2;
1046                         while (preg->program[scan++]) {
1047                         }
1048                         break;
1049
1050                 case END:
1051                         scan = 0;
1052                         break;
1053
1054                 default:
1055                         scan += 2;
1056                         break;
1057                 }
1058         }
1059
1060         /* If there is a "must appear" string, look for it. */
1061         if (preg->regmust != 0) {
1062                 s = string;
1063                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1064                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1065                                 break;
1066                         }
1067                         s++;
1068                 }
1069                 if (s == NULL)  /* Not present. */
1070                         return REG_NOMATCH;
1071         }
1072
1073         /* Mark beginning of line for ^ . */
1074         preg->regbol = string;
1075
1076         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1077         if (preg->reganch) {
1078                 if (eflags & REG_NOTBOL) {
1079                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1080                         goto nextline;
1081                 }
1082                 while (1) {
1083                         int ret = regtry(preg, string);
1084                         if (ret) {
1085                                 return REG_NOERROR;
1086                         }
1087                         if (*string) {
1088 nextline:
1089                                 if (preg->cflags & REG_NEWLINE) {
1090                                         /* Try the next anchor? */
1091                                         string = strchr(string, '\n');
1092                                         if (string) {
1093                                                 preg->regbol = ++string;
1094                                                 continue;
1095                                         }
1096                                 }
1097                         }
1098                         return REG_NOMATCH;
1099                 }
1100         }
1101
1102         /* Messy cases:  unanchored match. */
1103         s = string;
1104         if (preg->regstart != '\0') {
1105                 /* We know what char it must start with. */
1106                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1107                         if (regtry(preg, s))
1108                                 return REG_NOERROR;
1109                         s++;
1110                 }
1111         }
1112         else
1113                 /* We don't -- general case. */
1114                 while (1) {
1115                         if (regtry(preg, s))
1116                                 return REG_NOERROR;
1117                         if (*s == '\0') {
1118                                 break;
1119                         }
1120                         s += utf8_charlen(*s);
1121                 }
1122
1123         /* Failure. */
1124         return REG_NOMATCH;
1125 }
1126
1127 /*
1128  - regtry - try match at specific point
1129  */
1130                         /* 0 failure, 1 success */
1131 static int regtry( regex_t *preg, const char *string )
1132 {
1133         int i;
1134
1135         preg->reginput = string;
1136
1137         for (i = 0; i < preg->nmatch; i++) {
1138                 preg->pmatch[i].rm_so = -1;
1139                 preg->pmatch[i].rm_eo = -1;
1140         }
1141         if (regmatch(preg, 1)) {
1142                 preg->pmatch[0].rm_so = string - preg->start;
1143                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1144                 return(1);
1145         } else
1146                 return(0);
1147 }
1148
1149 /**
1150  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1151  *
1152  * If 'nocase' is non-zero, does a case-insensitive match.
1153  *
1154  * Returns -1 on not found.
1155  */
1156 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1157 {
1158         const char *s = string;
1159         while (proglen && *s) {
1160                 int ch;
1161                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1162                 if (ch != *prog) {
1163                         return -1;
1164                 }
1165                 prog++;
1166                 s += n;
1167                 proglen--;
1168         }
1169         if (proglen == 0) {
1170                 return s - string;
1171         }
1172         return -1;
1173 }
1174
1175 /**
1176  * Searchs for 'c' in the range 'range'.
1177  *
1178  * Returns 1 if found, or 0 if not.
1179  */
1180 static int reg_range_find(const int *range, int c)
1181 {
1182         while (*range) {
1183                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1184                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1185                         return 1;
1186                 }
1187                 range += 2;
1188         }
1189         return 0;
1190 }
1191
1192 /**
1193  * Search for the character 'c' in the utf-8 string 'string'.
1194  *
1195  * If 'nocase' is set, the 'string' is assumed to be uppercase
1196  * and 'c' is converted to uppercase before matching.
1197  *
1198  * Returns the byte position in the string where the 'c' was found, or
1199  * NULL if not found.
1200  */
1201 static const char *str_find(const char *string, int c, int nocase)
1202 {
1203         if (nocase) {
1204                 /* The "string" should already be converted to uppercase */
1205                 c = utf8_upper(c);
1206         }
1207         while (*string) {
1208                 int ch;
1209                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1210                 if (c == ch) {
1211                         return string;
1212                 }
1213                 string += n;
1214         }
1215         return NULL;
1216 }
1217
1218 /**
1219  * Returns true if 'ch' is an end-of-line char.
1220  *
1221  * In REG_NEWLINE mode, \n is considered EOL in
1222  * addition to \0
1223  */
1224 static int reg_iseol(regex_t *preg, int ch)
1225 {
1226         if (preg->cflags & REG_NEWLINE) {
1227                 return ch == '\0' || ch == '\n';
1228         }
1229         else {
1230                 return ch == '\0';
1231         }
1232 }
1233
1234 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1235 {
1236         int nextch = '\0';
1237         const char *save;
1238         int no;
1239         int c;
1240
1241         int max = preg->program[scan + 2];
1242         int min = preg->program[scan + 3];
1243         int next = regnext(preg, scan);
1244
1245         /*
1246          * Lookahead to avoid useless match attempts
1247          * when we know what character comes next.
1248          */
1249         if (OP(preg, next) == EXACTLY) {
1250                 nextch = preg->program[OPERAND(next)];
1251         }
1252         save = preg->reginput;
1253         no = regrepeat(preg, scan + 5, max);
1254         if (no < min) {
1255                 return 0;
1256         }
1257         if (matchmin) {
1258                 /* from min up to no */
1259                 max = no;
1260                 no = min;
1261         }
1262         /* else from no down to min */
1263         while (1) {
1264                 if (matchmin) {
1265                         if (no > max) {
1266                                 break;
1267                         }
1268                 }
1269                 else {
1270                         if (no < min) {
1271                                 break;
1272                         }
1273                 }
1274                 preg->reginput = save + utf8_index(save, no);
1275                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1276                 /* If it could work, try it. */
1277                 if (reg_iseol(preg, nextch) || c == nextch) {
1278                         if (regmatch(preg, next)) {
1279                                 return(1);
1280                         }
1281                 }
1282                 if (matchmin) {
1283                         /* Couldn't or didn't, add one more */
1284                         no++;
1285                 }
1286                 else {
1287                         /* Couldn't or didn't -- back up. */
1288                         no--;
1289                 }
1290         }
1291         return(0);
1292 }
1293
1294 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1295 {
1296         int *scanpt = preg->program + scan;
1297
1298         int max = scanpt[2];
1299         int min = scanpt[3];
1300
1301         /* Have we reached min? */
1302         if (scanpt[4] < min) {
1303                 /* No, so get another one */
1304                 scanpt[4]++;
1305                 if (regmatch(preg, scan + 5)) {
1306                         return 1;
1307                 }
1308                 scanpt[4]--;
1309                 return 0;
1310         }
1311         if (scanpt[4] > max) {
1312                 return 0;
1313         }
1314
1315         if (matchmin) {
1316                 /* minimal, so try other branch first */
1317                 if (regmatch(preg, regnext(preg, scan))) {
1318                         return 1;
1319                 }
1320                 /* No, so try one more */
1321                 scanpt[4]++;
1322                 if (regmatch(preg, scan + 5)) {
1323                         return 1;
1324                 }
1325                 scanpt[4]--;
1326                 return 0;
1327         }
1328         /* maximal, so try this branch again */
1329         if (scanpt[4] < max) {
1330                 scanpt[4]++;
1331                 if (regmatch(preg, scan + 5)) {
1332                         return 1;
1333                 }
1334                 scanpt[4]--;
1335         }
1336         /* At this point we are at max with no match. Try the other branch */
1337         return regmatch(preg, regnext(preg, scan));
1338 }
1339
1340 /*
1341  - regmatch - main matching routine
1342  *
1343  * Conceptually the strategy is simple:  check to see whether the current
1344  * node matches, call self recursively to see whether the rest matches,
1345  * and then act accordingly.  In practice we make some effort to avoid
1346  * recursion, in particular by going through "ordinary" nodes (that don't
1347  * need to know whether the rest of the match failed) by a loop instead of
1348  * by recursion.
1349  */
1350 /* 0 failure, 1 success */
1351 static int regmatch(regex_t *preg, int prog)
1352 {
1353         int scan;       /* Current node. */
1354         int next;               /* Next node. */
1355
1356         scan = prog;
1357
1358 #ifdef DEBUG
1359         if (scan != 0 && regnarrate)
1360                 fprintf(stderr, "%s(\n", regprop(scan));
1361 #endif
1362         while (scan != 0) {
1363                 int n;
1364                 int c;
1365 #ifdef DEBUG
1366                 if (regnarrate) {
1367                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1368                 }
1369 #endif
1370                 next = regnext(preg, scan);
1371                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1372
1373                 switch (OP(preg, scan)) {
1374                 case BOL:
1375                         if (preg->reginput != preg->regbol)
1376                                 return(0);
1377                         break;
1378                 case EOL:
1379                         if (!reg_iseol(preg, c)) {
1380                                 return(0);
1381                         }
1382                         break;
1383                 case WORDA:
1384                         /* Must be looking at a letter, digit, or _ */
1385                         if ((!isalnum(UCHAR(c))) && c != '_')
1386                                 return(0);
1387                         /* Prev must be BOL or nonword */
1388                         if (preg->reginput > preg->regbol &&
1389                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1390                                 return(0);
1391                         break;
1392                 case WORDZ:
1393                         /* Can't match at BOL */
1394                         if (preg->reginput > preg->regbol) {
1395                                 /* Current must be EOL or nonword */
1396                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1397                                         c = preg->reginput[-1];
1398                                         /* Previous must be word */
1399                                         if (isalnum(UCHAR(c)) || c == '_') {
1400                                                 break;
1401                                         }
1402                                 }
1403                         }
1404                         /* No */
1405                         return(0);
1406
1407                 case ANY:
1408                         if (reg_iseol(preg, c))
1409                                 return 0;
1410                         preg->reginput += n;
1411                         break;
1412                 case EXACTLY: {
1413                                 int opnd;
1414                                 int len;
1415                                 int slen;
1416
1417                                 opnd = OPERAND(scan);
1418                                 len = str_int_len(preg->program + opnd);
1419
1420                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1421                                 if (slen < 0) {
1422                                         return(0);
1423                                 }
1424                                 preg->reginput += slen;
1425                         }
1426                         break;
1427                 case ANYOF:
1428                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1429                                 return(0);
1430                         }
1431                         preg->reginput += n;
1432                         break;
1433                 case ANYBUT:
1434                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1435                                 return(0);
1436                         }
1437                         preg->reginput += n;
1438                         break;
1439                 case NOTHING:
1440                         break;
1441                 case BACK:
1442                         break;
1443                 case BRANCH: {
1444                                 const char *save;
1445
1446                                 if (OP(preg, next) != BRANCH)           /* No choice. */
1447                                         next = OPERAND(scan);   /* Avoid recursion. */
1448                                 else {
1449                                         do {
1450                                                 save = preg->reginput;
1451                                                 if (regmatch(preg, OPERAND(scan))) {
1452                                                         return(1);
1453                                                 }
1454                                                 preg->reginput = save;
1455                                                 scan = regnext(preg, scan);
1456                                         } while (scan != 0 && OP(preg, scan) == BRANCH);
1457                                         return(0);
1458                                         /* NOTREACHED */
1459                                 }
1460                         }
1461                         break;
1462                 case REP:
1463                 case REPMIN:
1464                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1465
1466                 case REPX:
1467                 case REPXMIN:
1468                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1469
1470                 case END:
1471                         return(1);      /* Success! */
1472                         break;
1473
1474                 case OPENNC:
1475                 case CLOSENC:
1476                         if (regmatch(preg, next)) {
1477                                 return 1;
1478                         }
1479                         return 0;
1480
1481                 default:
1482                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1483                                 const char *save;
1484
1485                                 save = preg->reginput;
1486
1487                                 if (regmatch(preg, next)) {
1488                                         int no;
1489                                         /*
1490                                          * Don't set startp if some later
1491                                          * invocation of the same parentheses
1492                                          * already has.
1493                                          */
1494                                         if (OP(preg, scan) < CLOSE) {
1495                                                 no = OP(preg, scan) - OPEN;
1496                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1497                                                         preg->pmatch[no].rm_so = save - preg->start;
1498                                                 }
1499                                         }
1500                                         else {
1501                                                 no = OP(preg, scan) - CLOSE;
1502                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1503                                                         preg->pmatch[no].rm_eo = save - preg->start;
1504                                                 }
1505                                         }
1506                                         return(1);
1507                                 } else
1508                                         return(0);
1509                         }
1510                         return REG_ERR_INTERNAL;
1511                 }
1512
1513                 scan = next;
1514         }
1515
1516         /*
1517          * We get here only if there's trouble -- normally "case END" is
1518          * the terminating point.
1519          */
1520         return REG_ERR_INTERNAL;
1521 }
1522
1523 /*
1524  - regrepeat - repeatedly match something simple, report how many
1525  */
1526 static int regrepeat(regex_t *preg, int p, int max)
1527 {
1528         int count = 0;
1529         const char *scan;
1530         int opnd;
1531         int ch;
1532         int n;
1533
1534         scan = preg->reginput;
1535         opnd = OPERAND(p);
1536         switch (OP(preg, p)) {
1537         case ANY:
1538                 /* No need to handle utf8 specially here */
1539                 while (!reg_iseol(preg, *scan) && count < max) {
1540                         count++;
1541                         scan++;
1542                 }
1543                 break;
1544         case EXACTLY:
1545                 while (count < max) {
1546                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1547                         if (preg->program[opnd] != ch) {
1548                                 break;
1549                         }
1550                         count++;
1551                         scan += n;
1552                 }
1553                 break;
1554         case ANYOF:
1555                 while (count < max) {
1556                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1557                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1558                                 break;
1559                         }
1560                         count++;
1561                         scan += n;
1562                 }
1563                 break;
1564         case ANYBUT:
1565                 while (count < max) {
1566                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1567                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1568                                 break;
1569                         }
1570                         count++;
1571                         scan += n;
1572                 }
1573                 break;
1574         default:                /* Oh dear.  Called inappropriately. */
1575                 preg->err = REG_ERR_INTERNAL;
1576                 count = 0;      /* Best compromise. */
1577                 break;
1578         }
1579         preg->reginput = scan;
1580
1581         return(count);
1582 }
1583
1584 /*
1585  - regnext - dig the "next" pointer out of a node
1586  */
1587 static int regnext(regex_t *preg, int p )
1588 {
1589         int offset;
1590
1591         offset = NEXT(preg, p);
1592
1593         if (offset == 0)
1594                 return 0;
1595
1596         if (OP(preg, p) == BACK)
1597                 return(p-offset);
1598         else
1599                 return(p+offset);
1600 }
1601
1602 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1603
1604 /*
1605  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1606  */
1607 static void regdump(regex_t *preg)
1608 {
1609         int s;
1610         int op = EXACTLY;       /* Arbitrary non-END op. */
1611         int next;
1612         char buf[4];
1613
1614         int i;
1615         for (i = 1; i < preg->p; i++) {
1616                 printf("%02x ", (unsigned char)preg->program[i]);
1617                 if (i % 16 == 0) {
1618                         printf("\n");
1619                 }
1620         }
1621         printf("\n");
1622
1623         s = 1;
1624         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1625                 op = OP(preg, s);
1626                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1627                 next = regnext(preg, s);
1628                 if (next == 0)          /* Next ptr. */
1629                         printf("(0)");
1630                 else
1631                         printf("(%d)", next);
1632                 s += 2;
1633                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1634                         int max = preg->program[s];
1635                         int min = preg->program[s + 1];
1636                         if (max == 65535) {
1637                                 printf("{%d,*}", min);
1638                         }
1639                         else {
1640                                 printf("{%d,%d}", min, max);
1641                         }
1642                         printf(" %d", preg->program[s + 2]);
1643                         s += 3;
1644                 }
1645                 else if (op == ANYOF || op == ANYBUT) {
1646                         /* set of ranges */
1647
1648                         while (preg->program[s]) {
1649                                 int len = preg->program[s++];
1650                                 int first = preg->program[s++];
1651                                 buf[utf8_fromunicode(buf, first)] = 0;
1652                                 printf("%s", buf);
1653                                 if (len > 1) {
1654                                         buf[utf8_fromunicode(buf, first + len - 1)] = 0;
1655                                         printf("-%s", buf);
1656                                 }
1657                         }
1658                         s++;
1659                 }
1660                 else if (op == EXACTLY) {
1661                         /* Literal string, where present. */
1662
1663                         while (preg->program[s]) {
1664                                 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1665                                 printf("%s", buf);
1666                                 s++;
1667                         }
1668                         s++;
1669                 }
1670                 putchar('\n');
1671         }
1672
1673         if (op == END) {
1674                 /* Header fields of interest. */
1675                 if (preg->regstart) {
1676                         buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1677                         printf("start '%s' ", buf);
1678                 }
1679                 if (preg->reganch)
1680                         printf("anchored ");
1681                 if (preg->regmust != 0) {
1682                         int i;
1683                         printf("must have:");
1684                         for (i = 0; i < preg->regmlen; i++) {
1685                                 putchar(preg->program[preg->regmust + i]);
1686                         }
1687                         putchar('\n');
1688                 }
1689         }
1690         printf("\n");
1691 }
1692
1693 /*
1694  - regprop - printable representation of opcode
1695  */
1696 static const char *regprop( int op )
1697 {
1698         static char buf[50];
1699
1700         switch (op) {
1701         case BOL:
1702                 return "BOL";
1703         case EOL:
1704                 return "EOL";
1705         case ANY:
1706                 return "ANY";
1707         case ANYOF:
1708                 return "ANYOF";
1709         case ANYBUT:
1710                 return "ANYBUT";
1711         case BRANCH:
1712                 return "BRANCH";
1713         case EXACTLY:
1714                 return "EXACTLY";
1715         case NOTHING:
1716                 return "NOTHING";
1717         case BACK:
1718                 return "BACK";
1719         case END:
1720                 return "END";
1721         case REP:
1722                 return "REP";
1723         case REPMIN:
1724                 return "REPMIN";
1725         case REPX:
1726                 return "REPX";
1727         case REPXMIN:
1728                 return "REPXMIN";
1729         case WORDA:
1730                 return "WORDA";
1731         case WORDZ:
1732                 return "WORDZ";
1733         case OPENNC:
1734                 return "OPEN";
1735         case CLOSENC:
1736                 return "CLOSE";
1737         default:
1738                 if (op >= OPEN && op < CLOSE) {
1739                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1740                 }
1741                 else if (op >= CLOSE && op < CLOSE_END) {
1742                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1743                 }
1744                 else {
1745                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1746                 }
1747                 return(buf);
1748         }
1749 }
1750 #endif /* JIM_BOOTSTRAP */
1751
1752 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1753 {
1754         static const char *error_strings[] = {
1755                 "success",
1756                 "no match",
1757                 "bad pattern",
1758                 "null argument",
1759                 "unknown error",
1760                 "too big",
1761                 "out of memory",
1762                 "too many ()",
1763                 "parentheses () not balanced",
1764                 "braces {} not balanced",
1765                 "invalid repetition count(s)",
1766                 "extra characters",
1767                 "*+ of empty atom",
1768                 "nested count",
1769                 "internal error",
1770                 "count follows nothing",
1771                 "trailing backslash",
1772                 "corrupted program",
1773                 "contains null char",
1774         };
1775         const char *err;
1776
1777         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1778                 err = "Bad error code";
1779         }
1780         else {
1781                 err = error_strings[errcode];
1782         }
1783
1784         return snprintf(errbuf, errbuf_size, "%s", err);
1785 }
1786
1787 void regfree(regex_t *preg)
1788 {
1789         free(preg->program);
1790 }
1791
1792 #endif