jimregexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *      Copyright (c) 1986 by University of Toronto.
   5  *      Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *      Permission is granted to anyone to use this software for any
   8  *      purpose on any computer system, and to redistribute it freely,
   9  *      subject to the following restrictions:
  10  *
  11  *      1. The author is not responsible for the consequences of use of
  12  *              this software, no matter how awful, even if they arise
  13  *              from defects in it.
  14  *
  15  *      2. The origin of this software must not be misrepresented, either
  16  *              by explicit claim or by omission.
  17  *
  18  *      3. Altered versions must be plainly marked as such, and must not
  19  *              be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences. It also removes \n as an alternative to |.
  47  *
  48  * Beware that some of this code is subtly aware of the way operator
  49  * precedence is structured in regular expressions.  Serious changes in
  50  * regular-expression syntax might require a total rethink.
  51  */
  52 #include <stdio.h>
  53 #include <ctype.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56
  57 #include "jim.h"
  58 #include "jimautoconf.h"
  59 #include "jimregexp.h"
  60 #include "utf8.h"
  61
  62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
  63
  64 /*
  65  * Structure for regexp "program".  This is essentially a linear encoding
  66  * of a nondeterministic finite-state machine (aka syntax charts or
  67  * "railroad normal form" in parsing technology).  Each node is an opcode
  68  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  69  * all nodes except BRANCH implement concatenation; a "next" pointer with
  70  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  71  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  72  * opposed to a collection of them) is never concatenated with anything
  73  * because of operator precedence.)  The operand of some types of node is
  74  * a literal string; for others, it is a node leading into a sub-FSM.  In
  75  * particular, the operand of a BRANCH node is the first node of the branch.
  76  * (NB this is *not* a tree structure:  the tail of the branch connects
  77  * to the thing following the set of BRANCHes.)  The opcodes are:
  78  */
  79
  80 /* This *MUST* be less than (255-20)/2=117 */
  81 #define REG_MAX_PAREN 100
  82
  83 /* definition   number  opnd?   meaning */
  84 #define END     0       /* no   End of program. */
  85 #define BOL     1       /* no   Match "" at beginning of line. */
  86 #define EOL     2       /* no   Match "" at end of line. */
  87 #define ANY     3       /* no   Match any one character. */
  88 #define ANYOF   4       /* str  Match any character in this string. */
  89 #define ANYBUT  5       /* str  Match any character not in this string. */
  90 #define BRANCH  6       /* node Match this alternative, or the next... */
  91 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  92 #define EXACTLY 8       /* str  Match this string. */
  93 #define NOTHING 9       /* no   Match empty string. */
  94 #define REP             10      /* max,min      Match this (simple) thing [min,max] times. */
  95 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, mininal match. */
  96 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
  97 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
  98
  99 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 100 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 101 #define OPEN    20      /* no   Mark this point in input as start of #n. */
 102                         /*      OPEN+1 is number 1, etc. */
 103 #define CLOSE   (OPEN+REG_MAX_PAREN)    /* no   Analogous to OPEN. */
 104 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 105
 106 /*
 107  * The first byte of the regexp internal "program" is actually this magic
 108  * number; the start node begins in the second byte.
 109  */
 110 #define REG_MAGIC       0xFADED00D
 111
 112 /*
 113  * Opcode notes:
 114  *
 115  * BRANCH       The set of branches constituting a single choice are hooked
 116  *              together with their "next" pointers, since precedence prevents
 117  *              anything being concatenated to any individual branch.  The
 118  *              "next" pointer of the last BRANCH in a choice points to the
 119  *              thing following the whole choice.  This is also where the
 120  *              final "next" pointer of each individual branch points; each
 121  *              branch starts with the operand node of a BRANCH node.
 122  *
 123  * BACK         Normal "next" pointers all implicitly point forward; BACK
 124  *              exists to make loop structures possible.
 125  *
 126  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 127  *              BRANCH structures using BACK.  Simple cases (one character
 128  *              per match) are implemented with STAR and PLUS for speed
 129  *              and to minimize recursive plunges.
 130  *
 131  * OPEN,CLOSE   ...are numbered at compile time.
 132  */
 133
 134 /*
 135  * A node is one char of opcode followed by two chars of "next" pointer.
 136  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 137  * value is a positive offset from the opcode of the node containing it.
 138  * An operand, if any, simply follows the node.  (Note that much of the
 139  * code generation knows about this implicit relationship.)
 140  *
 141  * Using two bytes for the "next" pointer is vast overkill for most things,
 142  * but allows patterns to get big without disasters.
 143  */
 144 #define OP(p)   ((p)[0])
 145 #define NEXT(p) ((p)[1])
 146 #define OPERAND(p)      ((p) + 2)
 147
 148 /*
 149  * See regmagic.h for one further detail of program structure.
 150  */
 151
 152
 153 /*
 154  * Utility definitions.
 155  */
 156
 157 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 158 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 159 #define META    "^$.[()|?{+*"
 160
 161 /*
 162  * Flags to be passed up and down.
 163  */
 164 #define HASWIDTH        01      /* Known never to match null string. */
 165 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
 166 #define SPSTART         04      /* Starts with * or +. */
 167 #define WORST           0       /* Worst case. */
 168
 169 #define MAX_REP_COUNT 1000000
 170
 171 /*
 172  * Forward declarations for regcomp()'s friends.
 173  */
 174 static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 175 static int *regpiece(regex_t *preg, int *flagp );
 176 static int *regbranch(regex_t *preg, int *flagp );
 177 static int *regatom(regex_t *preg, int *flagp );
 178 static int *regnode(regex_t *preg, int op );
 179 static const int *regnext(regex_t *preg, const int *p );
 180 static void regc(regex_t *preg, int b );
 181 static int *reginsert(regex_t *preg, int op, int size, int *opnd );
 182 static void regtail(regex_t *preg, int *p, const int *val );
 183 static void regoptail(regex_t *preg, int *p, const int *val );
 184
 185 static int reg_range_find(const int *string, int c);
 186 static const char *str_find(const char *string, int c, int nocase);
 187 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 188
 189 /*#define DEBUG*/
 190 #ifdef DEBUG
 191 int regnarrate = 0;
 192 static void regdump(regex_t *preg);
 193 static const char *regprop( const int *op );
 194 #endif
 195
 196
 197 static int regdummy;
 198
 199 /**
 200  * Returns the length of the null-terminated integer sequence.
 201  */
 202 static int str_int_len(const int *seq)
 203 {
 204         int n = 0;
 205         while (*seq++) {
 206                 n++;
 207         }
 208         return n;
 209 }
 210
 211 /*
 212  - regcomp - compile a regular expression into internal code
 213  *
 214  * We can't allocate space until we know how big the compiled form will be,
 215  * but we can't compile it (and thus know how big it is) until we've got a
 216  * place to put the code.  So we cheat:  we compile it twice, once with code
 217  * generation turned off and size counting turned on, and once "for real".
 218  * This also means that we don't allocate space until we are sure that the
 219  * thing really will compile successfully, and we never have to move the
 220  * code and thus invalidate pointers into it.  (Note that it has to be in
 221  * one piece because free() must be able to free it all.)
 222  *
 223  * Beware that the optimization-preparation code in here knows about some
 224  * of the structure of the compiled regexp.
 225  */
 226 int regcomp(regex_t *preg, const char *exp, int cflags)
 227 {
 228         const int *scan;
 229         const int *longest;
 230         unsigned len;
 231         int flags;
 232
 233 #ifdef DEBUG
 234         fprintf(stderr, "Compiling: '%s'\n", exp);
 235 #endif
 236         memset(preg, 0, sizeof(*preg));
 237
 238         if (exp == NULL)
 239                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 240
 241         /* First pass: determine size, legality. */
 242         preg->cflags = cflags;
 243         preg->regparse = exp;
 244         preg->re_nsub = 0;
 245         preg->regsize = 0L;
 246         preg->regcode = &regdummy;
 247         regc(preg, REG_MAGIC);
 248         if (reg(preg, 0, &flags) == NULL)
 249                 return preg->err;
 250
 251         /* Small enough for pointer-storage convention? */
 252         if (preg->regsize >= 32767L || preg->re_nsub >= REG_MAX_PAREN)          /* Probably could be 65535L. */
 253                 FAIL(preg,REG_ERR_TOO_BIG);
 254
 255         /* Allocate space. */
 256         preg->program = malloc(preg->regsize * sizeof(*preg->program));
 257         if (preg->program == NULL)
 258                 FAIL(preg, REG_ERR_NOMEM);
 259
 260         /* Second pass: emit code. */
 261         preg->regparse = exp;
 262         preg->re_nsub = 0;
 263         preg->regsize = 0L;
 264         preg->regcode = preg->program;
 265         regc(preg, REG_MAGIC);
 266         if (reg(preg, 0, &flags) == NULL)
 267                 return preg->err;
 268
 269         /* Dig out information for optimizations. */
 270         preg->regstart = 0;     /* Worst-case defaults. */
 271         preg->reganch = 0;
 272         preg->regmust = NULL;
 273         preg->regmlen = 0;
 274         scan = preg->program+1;                 /* First BRANCH. */
 275         if (OP(regnext(preg, scan)) == END) {           /* Only one top-level choice. */
 276                 scan = OPERAND(scan);
 277
 278                 /* Starting-point info. */
 279                 if (OP(scan) == EXACTLY) {
 280                         preg->regstart = *OPERAND(scan);
 281                 }
 282                 else if (OP(scan) == BOL)
 283                         preg->reganch++;
 284
 285                 /*
 286                  * If there's something expensive in the r.e., find the
 287                  * longest literal string that must appear and make it the
 288                  * regmust.  Resolve ties in favor of later strings, since
 289                  * the regstart check works with the beginning of the r.e.
 290                  * and avoiding duplication strengthens checking.  Not a
 291                  * strong reason, but sufficient in the absence of others.
 292                  */
 293                 if (flags&SPSTART) {
 294                         longest = NULL;
 295                         len = 0;
 296                         for (; scan != NULL; scan = regnext(preg, scan)) {
 297                                 if (OP(scan) == EXACTLY) {
 298                                         int plen = str_int_len(OPERAND(scan));
 299                                         if (plen >= len) {
 300                                                 longest = OPERAND(scan);
 301                                                 len = plen;
 302                                         }
 303                                 }
 304                         }
 305                         preg->regmust = longest;
 306                         preg->regmlen = len;
 307                 }
 308         }
 309
 310 #ifdef DEBUG
 311         regdump(preg);
 312 #endif
 313
 314         return 0;
 315 }
 316
 317 /*
 318  - reg - regular expression, i.e. main body or parenthesized thing
 319  *
 320  * Caller must absorb opening parenthesis.
 321  *
 322  * Combining parenthesis handling with the base level of regular expression
 323  * is a trifle forced, but the need to tie the tails of the branches to what
 324  * follows makes it hard to avoid.
 325  */
 326 static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 327 {
 328         int *ret;
 329         int *br;
 330         const int *ender;
 331         int parno = 0;
 332         int flags;
 333
 334         *flagp = HASWIDTH;      /* Tentatively. */
 335
 336         /* Make an OPEN node, if parenthesized. */
 337         if (paren) {
 338                 parno = ++preg->re_nsub;
 339                 ret = regnode(preg, OPEN+parno);
 340         } else
 341                 ret = NULL;
 342
 343         /* Pick up the branches, linking them together. */
 344         br = regbranch(preg, &flags);
 345         if (br == NULL)
 346                 return(NULL);
 347         if (ret != NULL)
 348                 regtail(preg, ret, br); /* OPEN -> first. */
 349         else
 350                 ret = br;
 351         if (!(flags&HASWIDTH))
 352                 *flagp &= ~HASWIDTH;
 353         *flagp |= flags&SPSTART;
 354         while (*preg->regparse == '|') {
 355                 preg->regparse++;
 356                 br = regbranch(preg, &flags);
 357                 if (br == NULL)
 358                         return(NULL);
 359                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 360                 if (!(flags&HASWIDTH))
 361                         *flagp &= ~HASWIDTH;
 362                 *flagp |= flags&SPSTART;
 363         }
 364
 365         /* Make a closing node, and hook it on the end. */
 366         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 367         regtail(preg, ret, ender);
 368
 369         /* Hook the tails of the branches to the closing node. */
 370         for (br = ret; br != NULL; br = (int *)regnext(preg, br))
 371                 regoptail(preg, br, ender);
 372
 373         /* Check for proper termination. */
 374         if (paren && *preg->regparse++ != ')') {
 375                 preg->err = REG_ERR_UNMATCHED_PAREN;
 376                 return NULL;
 377         } else if (!paren && *preg->regparse != '\0') {
 378                 if (*preg->regparse == ')') {
 379                         preg->err = REG_ERR_UNMATCHED_PAREN;
 380                         return NULL;
 381                 } else {
 382                         preg->err = REG_ERR_JUNK_ON_END;
 383                         return NULL;
 384                 }
 385         }
 386
 387         return(ret);
 388 }
 389
 390 /*
 391  - regbranch - one alternative of an | operator
 392  *
 393  * Implements the concatenation operator.
 394  */
 395 static int *regbranch(regex_t *preg, int *flagp )
 396 {
 397         int *ret;
 398         int *chain;
 399         int *latest;
 400         int flags;
 401
 402         *flagp = WORST;         /* Tentatively. */
 403
 404         ret = regnode(preg, BRANCH);
 405         chain = NULL;
 406         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 407                *preg->regparse != '|') {
 408                 latest = regpiece(preg, &flags);
 409                 if (latest == NULL)
 410                         return(NULL);
 411                 *flagp |= flags&HASWIDTH;
 412                 if (chain == NULL) {/* First piece. */
 413                         *flagp |= flags&SPSTART;
 414                 }
 415                 else {
 416                         regtail(preg, chain, latest);
 417                 }
 418                 chain = latest;
 419         }
 420         if (chain == NULL)      /* Loop ran zero times. */
 421                 (void) regnode(preg, NOTHING);
 422
 423         return(ret);
 424 }
 425
 426 /*
 427  - regpiece - something followed by possible [*+?]
 428  *
 429  * Note that the branching code sequences used for ? and the general cases
 430  * of * and + are somewhat optimized:  they use the same NOTHING node as
 431  * both the endmarker for their branch list and the body of the last branch.
 432  * It might seem that this node could be dispensed with entirely, but the
 433  * endmarker role is not redundant.
 434  */
 435 static int *regpiece(regex_t *preg, int *flagp)
 436 {
 437         int *ret;
 438         char op;
 439         int *next;
 440         int flags;
 441         int size = preg->regsize;
 442         int *chain = NULL;
 443         int min;
 444         int max;
 445
 446         ret = regatom(preg, &flags);
 447         if (ret == NULL)
 448                 return(NULL);
 449
 450         size = preg->regsize - size;
 451
 452         op = *preg->regparse;
 453         if (!ISMULT(op)) {
 454                 *flagp = flags;
 455                 return(ret);
 456         }
 457
 458         if (!(flags&HASWIDTH) && op != '?') {
 459                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 460                 return NULL;
 461         }
 462
 463         /* Handle braces (counted repetition) by expansion */
 464         if (op == '{') {
 465                 char *end;
 466
 467                 min = strtoul(preg->regparse + 1, &end, 10);
 468                 if (end == preg->regparse + 1) {
 469                         preg->err = REG_ERR_BAD_COUNT;
 470                         return NULL;
 471                 }
 472                 if (*end == '}') {
 473                         max = min;
 474                 }
 475                 else {
 476                         preg->regparse = end;
 477                         max = strtoul(preg->regparse + 1, &end, 10);
 478                         if (*end != '}') {
 479                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 480                                 return NULL;
 481                         }
 482                 }
 483                 if (end == preg->regparse + 1) {
 484                         max = MAX_REP_COUNT;
 485                 }
 486                 else if (max < min || max >= 100) {
 487                         preg->err = REG_ERR_BAD_COUNT;
 488                         return NULL;
 489                 }
 490                 if (min >= 100) {
 491                         preg->err = REG_ERR_BAD_COUNT;
 492                         return NULL;
 493                 }
 494
 495                 preg->regparse = strchr(preg->regparse, '}');
 496         }
 497         else {
 498                 min = (op == '+');
 499                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 500         }
 501
 502         if (preg->regparse[1] == '?') {
 503                 preg->regparse++;
 504                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 505         }
 506         else {
 507                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 508         }
 509         if (preg->regcode != &regdummy) {
 510                 ret[2] = max;
 511                 ret[3] = min;
 512                 ret[4] = 0;
 513         }
 514
 515         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 516
 517         if (!(flags & SIMPLE)) {
 518                 int *back = regnode(preg, BACK);
 519                 regtail(preg, back, ret);
 520                 regtail(preg, next, back);
 521         }
 522
 523         preg->regparse++;
 524         if (ISMULT(*preg->regparse)) {
 525                 preg->err = REG_ERR_NESTED_COUNT;
 526                 return NULL;
 527         }
 528
 529         return chain ? chain : ret;
 530 }
 531
 532 /**
 533  * Add all characters in the inclusive range between lower and upper.
 534  *
 535  * Handles a swapped range (upper < lower).
 536  */
 537 static void reg_addrange(regex_t *preg, int lower, int upper)
 538 {
 539         if (lower > upper) {
 540                 reg_addrange(preg, upper, lower);
 541         }
 542         /* Add a range as length, start */
 543         regc(preg, upper - lower + 1);
 544         regc(preg, lower);
 545 }
 546
 547 /**
 548  * Add a null-terminated literal string as a set of ranges.
 549  */
 550 static void reg_addrange_str(regex_t *preg, const char *str)
 551 {
 552         while (*str) {
 553                 reg_addrange(preg, *str, *str);
 554                 str++;
 555         }
 556 }
 557
 558 /**
 559  * Extracts the next unicode char from utf8.
 560  *
 561  * If 'upper' is set, converts the char to uppercase.
 562  */
 563 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 564 {
 565         int l = utf8_tounicode(s, uc);
 566         if (upper) {
 567                 *uc = utf8_upper(*uc);
 568         }
 569         return l;
 570 }
 571
 572 /**
 573  * Converts a hex digit to decimal.
 574  *
 575  * Returns -1 for an invalid hex digit.
 576  */
 577 static int hexdigitval(int c)
 578 {
 579         if (c >= '0' && c <= '9')
 580                 return c - '0';
 581         if (c >= 'a' && c <= 'f')
 582                 return c - 'a' + 10;
 583         if (c >= 'A' && c <= 'F')
 584                 return c - 'A' + 10;
 585         return -1;
 586 }
 587
 588 /**
 589  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 590  *
 591  * Returns the number of hex digits parsed.
 592  * If there are no hex digits, returns 0 and stores nothing.
 593  */
 594 static int parse_hex(const char *s, int n, int *uc)
 595 {
 596         int val = 0;
 597         int k;
 598
 599         for (k = 0; k < n; k++) {
 600                 int c = hexdigitval(*s++);
 601                 if (c == -1) {
 602                         break;
 603                 }
 604                 val = (val << 4) | c;
 605         }
 606         if (k) {
 607                 *uc = val;
 608         }
 609         return k;
 610 }
 611
 612 /**
 613  * Call for chars after a backlash to decode the escape sequence.
 614  *
 615  * Stores the result in *ch.
 616  *
 617  * Returns the number of bytes consumed.
 618  */
 619 static int reg_decode_escape(const char *s, int *ch)
 620 {
 621         int n;
 622         const char *s0 = s;
 623
 624         *ch = *s++;
 625
 626         switch (*ch) {
 627                 case 'b': *ch = '\b'; break;
 628                 case 'e': *ch = 27; break;
 629                 case 'f': *ch = '\f'; break;
 630                 case 'n': *ch = '\n'; break;
 631                 case 'r': *ch = '\r'; break;
 632                 case 't': *ch = '\t'; break;
 633                 case 'v': *ch = '\v'; break;
 634                 case 'u':
 635                         if ((n = parse_hex(s, 4, ch)) > 0) {
 636                                 s += n;
 637                         }
 638                         break;
 639                 case 'x':
 640                         if ((n = parse_hex(s, 2, ch)) > 0) {
 641                                 s += n;
 642                         }
 643                         break;
 644                 case '\0':
 645                         s--;
 646                         *ch = '\\';
 647                         break;
 648         }
 649         return s - s0;
 650 }
 651
 652 /*
 653  - regatom - the lowest level
 654  *
 655  * Optimization:  gobbles an entire sequence of ordinary characters so that
 656  * it can turn them into a single node, which is smaller to store and
 657  * faster to run.  Backslashed characters are exceptions, each becoming a
 658  * separate node; the code is simpler that way and it's not worth fixing.
 659  */
 660 static int *regatom(regex_t *preg, int *flagp)
 661 {
 662         int *ret;
 663         int flags;
 664         int nocase = (preg->cflags & REG_ICASE);
 665
 666         int ch;
 667         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 668
 669         *flagp = WORST;         /* Tentatively. */
 670
 671         preg->regparse += n;
 672         switch (ch) {
 673         /* FIXME: these chars only have meaning at beg/end of pat? */
 674         case '^':
 675                 ret = regnode(preg, BOL);
 676                 break;
 677         case '$':
 678                 ret = regnode(preg, EOL);
 679                 break;
 680         case '.':
 681                 ret = regnode(preg, ANY);
 682                 *flagp |= HASWIDTH|SIMPLE;
 683                 break;
 684         case '[': {
 685                         const char *pattern = preg->regparse;
 686
 687                         if (*pattern == '^') {  /* Complement of range. */
 688                                 ret = regnode(preg, ANYBUT);
 689                                 pattern++;
 690                         } else
 691                                 ret = regnode(preg, ANYOF);
 692
 693                         /* Special case. If the first char is ']' or '-', it is part of the set */
 694                         if (*pattern == ']' || *pattern == '-') {
 695                                 reg_addrange(preg, *pattern, *pattern);
 696                                 pattern++;
 697                         }
 698
 699                         while (*pattern && *pattern != ']') {
 700                                 /* Is this a range? a-z */
 701                                 int start;
 702                                 int end;
 703
 704                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 705                                 if (start == '\\') {
 706                                         pattern += reg_decode_escape(pattern, &start);
 707                                         if (start == 0) {
 708                                                 preg->err = REG_ERR_NULL_CHAR;
 709                                                 return NULL;
 710                                         }
 711                                 }
 712                                 if (pattern[0] == '-' && pattern[1]) {
 713                                         /* skip '-' */
 714                                         pattern += utf8_tounicode(pattern, &end);
 715                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 716                                         if (end == '\\') {
 717                                                 pattern += reg_decode_escape(pattern, &end);
 718                                                 if (end == 0) {
 719                                                         preg->err = REG_ERR_NULL_CHAR;
 720                                                         return NULL;
 721                                                 }
 722                                         }
 723
 724                                         reg_addrange(preg, start, end);
 725                                         continue;
 726                                 }
 727                                 if (start == '[') {
 728                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 729                                                 if ((preg->cflags & REG_ICASE) == 0) {
 730                                                         reg_addrange(preg, 'a', 'z');
 731                                                 }
 732                                                 reg_addrange(preg, 'A', 'Z');
 733                                                 pattern += 8;
 734                                                 continue;
 735                                         }
 736                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 737                                                 if ((preg->cflags & REG_ICASE) == 0) {
 738                                                         reg_addrange(preg, 'a', 'z');
 739                                                 }
 740                                                 reg_addrange(preg, 'A', 'Z');
 741                                                 reg_addrange(preg, '0', '9');
 742                                                 pattern += 8;
 743                                                 continue;
 744                                         }
 745                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 746                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 747                                                 pattern += 8;
 748                                                 continue;
 749                                         }
 750                                 }
 751                                 /* Not a range, so just add the char */
 752                                 reg_addrange(preg, start, start);
 753                         }
 754                         regc(preg, '\0');
 755
 756                         if (*pattern) {
 757                                 pattern++;
 758                         }
 759                         preg->regparse = pattern;
 760
 761                         *flagp |= HASWIDTH|SIMPLE;
 762                 }
 763                 break;
 764         case '(':
 765                 ret = reg(preg, 1, &flags);
 766                 if (ret == NULL)
 767                         return(NULL);
 768                 *flagp |= flags&(HASWIDTH|SPSTART);
 769                 break;
 770         case '\0':
 771         case '|':
 772         case ')':
 773                 preg->err = REG_ERR_INTERNAL;
 774                 return NULL;    /* Supposed to be caught earlier. */
 775         case '?':
 776         case '+':
 777         case '*':
 778         case '{':
 779                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 780                 return NULL;
 781         case '\\':
 782                 switch (*preg->regparse++) {
 783                 case '\0':
 784                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 785                         return NULL;
 786                         break;
 787                 case '<':
 788                 case 'm':
 789                         ret = regnode(preg, WORDA);
 790                         break;
 791                 case '>':
 792                 case 'M':
 793                         ret = regnode(preg, WORDZ);
 794                         break;
 795                 case 'd':
 796                         ret = regnode(preg, ANYOF);
 797                         reg_addrange(preg, '0', '9');
 798                         regc(preg, '\0');
 799                         *flagp |= HASWIDTH|SIMPLE;
 800                         break;
 801                 case 'w':
 802                         ret = regnode(preg, ANYOF);
 803                         if ((preg->cflags & REG_ICASE) == 0) {
 804                                 reg_addrange(preg, 'a', 'z');
 805                         }
 806                         reg_addrange(preg, 'A', 'Z');
 807                         reg_addrange(preg, '0', '9');
 808                         reg_addrange(preg, '_', '_');
 809                         regc(preg, '\0');
 810                         *flagp |= HASWIDTH|SIMPLE;
 811                         break;
 812                 case 's':
 813                         ret = regnode(preg, ANYOF);
 814                         reg_addrange_str(preg," \t\r\n\f\v");
 815                         regc(preg, '\0');
 816                         *flagp |= HASWIDTH|SIMPLE;
 817                         break;
 818                 /* FIXME: Someday handle \1, \2, ... */
 819                 default:
 820                         /* Handle general quoted chars in exact-match routine */
 821                         /* Back up to include the backslash */
 822                         preg->regparse--;
 823                         goto de_fault;
 824                 }
 825                 break;
 826         de_fault:
 827         default: {
 828                         /*
 829                          * Encode a string of characters to be matched exactly.
 830                          */
 831                         int added = 0;
 832
 833                         /* Back up to pick up the first char of interest */
 834                         preg->regparse -= n;
 835
 836                         ret = regnode(preg, EXACTLY);
 837
 838                         /* Note that a META operator such as ? or * consumes the
 839                          * preceding char.
 840                          * Thus we must be careful to look ahead by 2 and add the
 841                          * last char as it's own EXACTLY if necessary
 842                          */
 843
 844                         /* Until end of string or a META char is reached */
 845                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 846                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 847                                 if (ch == '\\' && preg->regparse[n]) {
 848                                         /* Non-trailing backslash.
 849                                          * Is this a special escape, or a regular escape?
 850                                          */
 851                                         if (strchr("<>mMwds", preg->regparse[n])) {
 852                                                 /* A special escape. All done with EXACTLY */
 853                                                 break;
 854                                         }
 855                                         /* Decode it. Note that we add the length for the escape
 856                                          * sequence to the length for the backlash so we can skip
 857                                          * the entire sequence, or not as required.
 858                                          */
 859                                         n += reg_decode_escape(preg->regparse + n, &ch);
 860                                         if (ch == 0) {
 861                                                 preg->err = REG_ERR_NULL_CHAR;
 862                                                 return NULL;
 863                                         }
 864                                 }
 865
 866                                 /* Now we have one char 'ch' of length 'n'.
 867                                  * Check to see if the following char is a MULT
 868                                  */
 869
 870                                 if (ISMULT(preg->regparse[n])) {
 871                                         /* Yes. But do we already have some EXACTLY chars? */
 872                                         if (added) {
 873                                                 /* Yes, so return what we have and pick up the current char next time around */
 874                                                 break;
 875                                         }
 876                                         /* No, so add this single char and finish */
 877                                         regc(preg, ch);
 878                                         added++;
 879                                         preg->regparse += n;
 880                                         break;
 881                                 }
 882
 883                                 /* No, so just add this char normally */
 884                                 regc(preg, ch);
 885                                 added++;
 886                                 preg->regparse += n;
 887                         }
 888                         regc(preg, '\0');
 889
 890                         *flagp |= HASWIDTH;
 891                         if (added == 1)
 892                                 *flagp |= SIMPLE;
 893                         break;
 894                 }
 895                 break;
 896         }
 897
 898         return(ret);
 899 }
 900
 901 /*
 902  - regnode - emit a node
 903  */
 904 /* Location. */
 905 static int *regnode(regex_t *preg, int op)
 906 {
 907         int *ret;
 908         int *ptr;
 909
 910         preg->regsize += 2;
 911         ret = preg->regcode;
 912         if (ret == &regdummy) {
 913                 return(ret);
 914         }
 915
 916         ptr = ret;
 917         *ptr++ = op;
 918         *ptr++ = 0;             /* Null "next" pointer. */
 919         preg->regcode = ptr;
 920
 921         return(ret);
 922 }
 923
 924 /*
 925  - regc - emit (if appropriate) a byte of code
 926  */
 927 static void regc(regex_t *preg, int b )
 928 {
 929         preg->regsize++;
 930         if (preg->regcode != &regdummy)
 931                 *preg->regcode++ = b;
 932 }
 933
 934 /*
 935  - reginsert - insert an operator in front of already-emitted operand
 936  *
 937  * Means relocating the operand.
 938  * Returns the new location of the original operand.
 939  */
 940 static int *reginsert(regex_t *preg, int op, int size, int *opnd )
 941 {
 942         int *src;
 943         int *dst;
 944         int *place;
 945
 946         preg->regsize += size;
 947
 948         if (preg->regcode == &regdummy) {
 949                 return opnd;
 950         }
 951
 952         src = preg->regcode;
 953         preg->regcode += size;
 954         dst = preg->regcode;
 955         while (src > opnd)
 956                 *--dst = *--src;
 957
 958         place = opnd;           /* Op node, where operand used to be. */
 959         *place++ = op;
 960         while (--size) {
 961                 *place++ = 0;
 962         }
 963
 964         return place;
 965 }
 966
 967 /*
 968  - regtail - set the next-pointer at the end of a node chain
 969  */
 970 static void regtail(regex_t *preg, int *p, const int *val )
 971 {
 972         int *scan;
 973         int *temp;
 974         int offset;
 975
 976         if (p == &regdummy)
 977                 return;
 978
 979         /* Find last node. */
 980         scan = p;
 981         for (;;) {
 982                 temp = (int *)regnext(preg, scan);
 983                 if (temp == NULL)
 984                         break;
 985                 scan = temp;
 986         }
 987
 988         if (OP(scan) == BACK)
 989                 offset = scan - val;
 990         else
 991                 offset = val - scan;
 992
 993         scan[1] = offset;
 994 }
 995
 996 /*
 997  - regoptail - regtail on operand of first argument; nop if operandless
 998  */
 999
1000 static void regoptail(regex_t *preg, int *p, const int *val )
1001 {
1002         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1003         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1004                 return;
1005         regtail(preg, OPERAND(p), val);
1006 }
1007
1008 /*
1009  * regexec and friends
1010  */
1011
1012 /*
1013  * Forwards.
1014  */
1015 static int regtry(regex_t *preg, const char *string );
1016 static int regmatch(regex_t *preg, const int *prog);
1017 static int regrepeat(regex_t *preg, const int *p, int max);
1018
1019 /*
1020  - regexec - match a regexp against a string
1021  */
1022 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1023 {
1024         const char *s;
1025         const int *scan;
1026
1027         /* Be paranoid... */
1028         if (preg == NULL || preg->program == NULL || string == NULL) {
1029                 return REG_ERR_NULL_ARGUMENT;
1030         }
1031
1032         /* Check validity of program. */
1033         if (*preg->program != REG_MAGIC) {
1034                 return REG_ERR_CORRUPTED;
1035         }
1036
1037 #ifdef DEBUG
1038         fprintf(stderr, "regexec: %s\n", string);
1039         regdump(preg);
1040 #endif
1041
1042         preg->eflags = eflags;
1043         preg->pmatch = pmatch;
1044         preg->nmatch = nmatch;
1045         preg->start = string;   /* All offsets are computed from here */
1046
1047         /* Must clear out the embedded repeat counts */
1048         for (scan = OPERAND(preg->program + 1); scan != NULL; scan = regnext(preg, scan)) {
1049                 switch (OP(scan)) {
1050                 case REP:
1051                 case REPMIN:
1052                 case REPX:
1053                 case REPXMIN:
1054                         *(int *)(scan + 4) = 0;
1055                         break;
1056                 }
1057         }
1058
1059         /* If there is a "must appear" string, look for it. */
1060         if (preg->regmust != NULL) {
1061                 s = string;
1062                 while ((s = str_find(s, preg->regmust[0], preg->cflags & REG_ICASE)) != NULL) {
1063                         if (prefix_cmp(preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1064                                 break;
1065                         }
1066                         s++;
1067                 }
1068                 if (s == NULL)  /* Not present. */
1069                         return REG_NOMATCH;
1070         }
1071
1072         /* Mark beginning of line for ^ . */
1073         preg->regbol = string;
1074
1075         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1076         if (preg->reganch) {
1077                 if (eflags & REG_NOTBOL) {
1078                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1079                         goto nextline;
1080                 }
1081                 while (1) {
1082                         int ret = regtry(preg, string);
1083                         if (ret) {
1084                                 return REG_NOERROR;
1085                         }
1086                         if (*string) {
1087 nextline:
1088                                 if (preg->cflags & REG_NEWLINE) {
1089                                         /* Try the next anchor? */
1090                                         string = strchr(string, '\n');
1091                                         if (string) {
1092                                                 preg->regbol = ++string;
1093                                                 continue;
1094                                         }
1095                                 }
1096                         }
1097                         return REG_NOMATCH;
1098                 }
1099         }
1100
1101         /* Messy cases:  unanchored match. */
1102         s = string;
1103         if (preg->regstart != '\0') {
1104                 /* We know what char it must start with. */
1105                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1106                         if (regtry(preg, s))
1107                                 return REG_NOERROR;
1108                         s++;
1109                 }
1110         }
1111         else
1112                 /* We don't -- general case. */
1113                 while (1) {
1114                         if (regtry(preg, s))
1115                                 return REG_NOERROR;
1116                         if (*s == '\0') {
1117                                 break;
1118                         }
1119                         s += utf8_charlen(*s);
1120                 }
1121
1122         /* Failure. */
1123         return REG_NOMATCH;
1124 }
1125
1126 /*
1127  - regtry - try match at specific point
1128  */
1129                         /* 0 failure, 1 success */
1130 static int regtry( regex_t *preg, const char *string )
1131 {
1132         int i;
1133
1134         preg->reginput = string;
1135
1136         for (i = 0; i < preg->nmatch; i++) {
1137                 preg->pmatch[i].rm_so = -1;
1138                 preg->pmatch[i].rm_eo = -1;
1139         }
1140         if (regmatch(preg, preg->program + 1)) {
1141                 preg->pmatch[0].rm_so = string - preg->start;
1142                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1143                 return(1);
1144         } else
1145                 return(0);
1146 }
1147
1148 /**
1149  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1150  *
1151  * If 'nocase' is non-zero, does a case-insensitive match.
1152  *
1153  * Returns -1 on not found.
1154  */
1155 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1156 {
1157         const char *s = string;
1158         while (proglen && *s) {
1159                 int ch;
1160                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1161                 if (ch != *prog) {
1162                         return -1;
1163                 }
1164                 prog++;
1165                 s += n;
1166                 proglen--;
1167         }
1168         if (proglen == 0) {
1169                 return s - string;
1170         }
1171         return -1;
1172 }
1173
1174 /**
1175  * Searchs for 'c' in the range 'range'.
1176  *
1177  * Returns 1 if found, or 0 if not.
1178  */
1179 static int reg_range_find(const int *range, int c)
1180 {
1181         while (*range) {
1182                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1183                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1184                         return 1;
1185                 }
1186                 range += 2;
1187         }
1188         return 0;
1189 }
1190
1191 /**
1192  * Search for the character 'c' in the utf-8 string 'string'.
1193  *
1194  * If 'nocase' is set, the 'string' is assumed to be uppercase
1195  * and 'c' is converted to uppercase before matching.
1196  *
1197  * Returns the byte position in the string where the 'c' was found, or
1198  * NULL if not found.
1199  */
1200 static const char *str_find(const char *string, int c, int nocase)
1201 {
1202         if (nocase) {
1203                 /* The "string" should already be converted to uppercase */
1204                 c = utf8_upper(c);
1205         }
1206         while (*string) {
1207                 int ch;
1208                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1209                 if (c == ch) {
1210                         return string;
1211                 }
1212                 string += n;
1213         }
1214         return NULL;
1215 }
1216
1217 /**
1218  * Returns true if 'ch' is an end-of-line char.
1219  *
1220  * In REG_NEWLINE mode, \n is considered EOL in
1221  * addition to \0
1222  */
1223 static int reg_iseol(regex_t *preg, int ch)
1224 {
1225         if (preg->cflags & REG_NEWLINE) {
1226                 return ch == '\0' || ch == '\n';
1227         }
1228         else {
1229                 return ch == '\0';
1230         }
1231 }
1232
1233 static int regmatchsimplerepeat(regex_t *preg, const int *scan, int matchmin)
1234 {
1235         int nextch = '\0';
1236         const char *save;
1237         int no;
1238         int c;
1239
1240         int max = scan[2];
1241         int min = scan[3];
1242         const int *next = regnext(preg, scan);
1243
1244         /*
1245          * Lookahead to avoid useless match attempts
1246          * when we know what character comes next.
1247          */
1248         if (OP(next) == EXACTLY) {
1249                 nextch = *OPERAND(next);
1250         }
1251         save = preg->reginput;
1252         no = regrepeat(preg, scan + 5, max);
1253         if (no < min) {
1254                 return 0;
1255         }
1256         if (matchmin) {
1257                 /* from min up to no */
1258                 max = no;
1259                 no = min;
1260         }
1261         /* else from no down to min */
1262         while (1) {
1263                 if (matchmin) {
1264                         if (no > max) {
1265                                 break;
1266                         }
1267                 }
1268                 else {
1269                         if (no < min) {
1270                                 break;
1271                         }
1272                 }
1273                 preg->reginput = save + utf8_index(save, no);
1274                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1275                 /* If it could work, try it. */
1276                 if (reg_iseol(preg, nextch) || c == nextch) {
1277                         if (regmatch(preg, next)) {
1278                                 return(1);
1279                         }
1280                 }
1281                 if (matchmin) {
1282                         /* Couldn't or didn't, add one more */
1283                         no++;
1284                 }
1285                 else {
1286                         /* Couldn't or didn't -- back up. */
1287                         no--;
1288                 }
1289         }
1290         return(0);
1291 }
1292
1293 static int regmatchrepeat(regex_t *preg, int *scan, int matchmin)
1294 {
1295         const char *save;
1296
1297         int max = scan[2];
1298         int min = scan[3];
1299
1300         save = preg->reginput;
1301
1302         /* Have we reached min? */
1303         if (scan[4] < min) {
1304                 /* No, so get another one */
1305                 scan[4]++;
1306                 if (regmatch(preg, scan + 5)) {
1307                         return 1;
1308                 }
1309                 scan[4]--;
1310                 return 0;
1311         }
1312         if (scan[4] > max) {
1313                 return 0;
1314         }
1315
1316         if (matchmin) {
1317                 /* minimal, so try other branch first */
1318                 if (regmatch(preg, regnext(preg, scan))) {
1319                         return 1;
1320                 }
1321                 /* No, so try one more */
1322                 scan[4]++;
1323                 if (regmatch(preg, scan + 5)) {
1324                         return 1;
1325                 }
1326                 scan[4]--;
1327                 return 0;
1328         }
1329         /* maximal, so try this branch again */
1330         save = preg->reginput;
1331         if (scan[4] < max) {
1332                 scan[4]++;
1333                 if (regmatch(preg, scan + 5)) {
1334                         return 1;
1335                 }
1336                 scan[4]--;
1337         }
1338         /* At this point we are at max with no match. Back up by one and try the other branch */
1339         preg->reginput = save;
1340         int ret = regmatch(preg, regnext(preg, scan));
1341         return ret;
1342 }
1343
1344 /*
1345  - regmatch - main matching routine
1346  *
1347  * Conceptually the strategy is simple:  check to see whether the current
1348  * node matches, call self recursively to see whether the rest matches,
1349  * and then act accordingly.  In practice we make some effort to avoid
1350  * recursion, in particular by going through "ordinary" nodes (that don't
1351  * need to know whether the rest of the match failed) by a loop instead of
1352  * by recursion.
1353  */
1354 /* 0 failure, 1 success */
1355 static int regmatch(regex_t *preg, const int *prog)
1356 {
1357         const int *scan;        /* Current node. */
1358         const int *next;                /* Next node. */
1359
1360         scan = prog;
1361
1362 #ifdef DEBUG
1363         if (scan != NULL && regnarrate)
1364                 fprintf(stderr, "%s(\n", regprop(scan));
1365 #endif
1366         while (scan != NULL) {
1367                 int n;
1368                 int c;
1369 #ifdef DEBUG
1370                 if (regnarrate) {
1371                         //fprintf(stderr, "%s...\n", regprop(scan));
1372                         int op = OP(scan);
1373                         fprintf(stderr, "%2d{%02x}%s...\n", (int)(scan-preg->program), op, regprop(scan));      /* Where, what. */
1374                 }
1375 #endif
1376                 next = regnext(preg, scan);
1377                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1378
1379                 switch (OP(scan)) {
1380                 case BOL:
1381                         if (preg->reginput != preg->regbol)
1382                                 return(0);
1383                         break;
1384                 case EOL:
1385                         if (!reg_iseol(preg, c)) {
1386                                 return(0);
1387                         }
1388                         break;
1389                 case WORDA:
1390                         /* Must be looking at a letter, digit, or _ */
1391                         if ((!isalnum(UCHAR(c))) && c != '_')
1392                                 return(0);
1393                         /* Prev must be BOL or nonword */
1394                         if (preg->reginput > preg->regbol &&
1395                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1396                                 return(0);
1397                         break;
1398                 case WORDZ:
1399                         /* Can't match at BOL */
1400                         if (preg->reginput > preg->regbol) {
1401                                 /* Current must be EOL or nonword */
1402                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1403                                         c = preg->reginput[-1];
1404                                         /* Previous must be word */
1405                                         if (isalnum(UCHAR(c)) || c == '_') {
1406                                                 break;
1407                                         }
1408                                 }
1409                         }
1410                         /* No */
1411                         return(0);
1412
1413                 case ANY:
1414                         if (reg_iseol(preg, c))
1415                                 return 0;
1416                         preg->reginput += n;
1417                         break;
1418                 case EXACTLY: {
1419                                 const int *opnd;
1420                                 int len;
1421                                 int slen;
1422
1423                                 opnd = OPERAND(scan);
1424                                 len = str_int_len(opnd);
1425
1426                                 slen = prefix_cmp(opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1427                                 if (slen < 0) {
1428                                         return(0);
1429                                 }
1430                                 preg->reginput += slen;
1431                         }
1432                         break;
1433                 case ANYOF:
1434                         if (reg_iseol(preg, c) || reg_range_find(OPERAND(scan), c) == 0) {
1435                                 return(0);
1436                         }
1437                         preg->reginput += n;
1438                         break;
1439                 case ANYBUT:
1440                         if (reg_iseol(preg, c) || reg_range_find(OPERAND(scan), c) != 0) {
1441                                 return(0);
1442                         }
1443                         preg->reginput += n;
1444                         break;
1445                 case NOTHING:
1446                         break;
1447                 case BACK:
1448                         break;
1449                 case BRANCH: {
1450                                 const char *save;
1451
1452                                 if (OP(next) != BRANCH)         /* No choice. */
1453                                         next = OPERAND(scan);   /* Avoid recursion. */
1454                                 else {
1455                                         do {
1456                                                 save = preg->reginput;
1457                                                 if (regmatch(preg, OPERAND(scan))) {
1458                                                         return(1);
1459                                                 }
1460                                                 preg->reginput = save;
1461                                                 scan = regnext(preg, scan);
1462                                         } while (scan != NULL && OP(scan) == BRANCH);
1463                                         return(0);
1464                                         /* NOTREACHED */
1465                                 }
1466                         }
1467                         break;
1468                 case REP:
1469                 case REPMIN:
1470                         return regmatchsimplerepeat(preg, scan, OP(scan) == REPMIN);
1471
1472                 case REPX:
1473                 case REPXMIN:
1474                         return regmatchrepeat(preg, (int *)scan, OP(scan) == REPXMIN);
1475
1476                 case END:
1477                         return(1);      /* Success! */
1478                         break;
1479                 default:
1480                         if (OP(scan) >= OPEN+1 && OP(scan) < CLOSE_END) {
1481                                 const char *save;
1482
1483                                 save = preg->reginput;
1484
1485                                 if (regmatch(preg, next)) {
1486                                         int no;
1487                                         /*
1488                                          * Don't set startp if some later
1489                                          * invocation of the same parentheses
1490                                          * already has.
1491                                          */
1492                                         if (OP(scan) < CLOSE) {
1493                                                 no = OP(scan) - OPEN;
1494                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1495                                                         preg->pmatch[no].rm_so = save - preg->start;
1496                                                 }
1497                                         }
1498                                         else {
1499                                                 no = OP(scan) - CLOSE;
1500                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1501                                                         preg->pmatch[no].rm_eo = save - preg->start;
1502                                                 }
1503                                         }
1504                                         return(1);
1505                                 } else
1506                                         return(0);
1507                         }
1508                         return REG_ERR_INTERNAL;
1509                 }
1510
1511                 scan = next;
1512         }
1513
1514         /*
1515          * We get here only if there's trouble -- normally "case END" is
1516          * the terminating point.
1517          */
1518         return REG_ERR_INTERNAL;
1519 }
1520
1521 /*
1522  - regrepeat - repeatedly match something simple, report how many
1523  */
1524 static int regrepeat(regex_t *preg, const int *p, int max)
1525 {
1526         int count = 0;
1527         const char *scan;
1528         const int *opnd;
1529         int ch;
1530         int n;
1531
1532         scan = preg->reginput;
1533         opnd = OPERAND(p);
1534         switch (OP(p)) {
1535         case ANY:
1536                 /* No need to handle utf8 specially here */
1537                 while (!reg_iseol(preg, *scan) && count < max) {
1538                         count++;
1539                         scan++;
1540                 }
1541                 break;
1542         case EXACTLY:
1543                 while (count < max) {
1544                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1545                         if (*opnd != ch) {
1546                                 break;
1547                         }
1548                         count++;
1549                         scan += n;
1550                 }
1551                 break;
1552         case ANYOF:
1553                 while (count < max) {
1554                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1555                         if (reg_iseol(preg, ch) || reg_range_find(opnd, ch) == 0) {
1556                                 break;
1557                         }
1558                         count++;
1559                         scan += n;
1560                 }
1561                 break;
1562         case ANYBUT:
1563                 while (count < max) {
1564                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1565                         if (reg_iseol(preg, ch) || reg_range_find(opnd, ch) != 0) {
1566                                 break;
1567                         }
1568                         count++;
1569                         scan += n;
1570                 }
1571                 break;
1572         default:                /* Oh dear.  Called inappropriately. */
1573                 preg->err = REG_ERR_INTERNAL;
1574                 count = 0;      /* Best compromise. */
1575                 break;
1576         }
1577         preg->reginput = scan;
1578
1579         return(count);
1580 }
1581
1582 /*
1583  - regnext - dig the "next" pointer out of a node
1584  */
1585 static const int *regnext(regex_t *preg, const int *p )
1586 {
1587         int offset;
1588
1589         if (p == &regdummy)
1590                 return(NULL);
1591
1592         offset = NEXT(p);
1593
1594         if (offset == 0)
1595                 return(NULL);
1596
1597         if (OP(p) == BACK)
1598                 return(p-offset);
1599         else
1600                 return(p+offset);
1601 }
1602
1603 #ifdef DEBUG
1604
1605 /*
1606  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1607  */
1608 static void regdump(regex_t *preg)
1609 {
1610         const int *s;
1611         char op = EXACTLY;      /* Arbitrary non-END op. */
1612         const int *next;
1613         char buf[4];
1614
1615         if (preg->regcode == &regdummy)
1616                 return;
1617
1618         s = preg->program + 1;
1619         while (op != END && s < preg->regcode) {        /* While that wasn't END last time... */
1620                 op = OP(s);
1621                 printf("%2d{%02x}%s", (int)(s-preg->program), op, regprop(s));  /* Where, what. */
1622                 next = regnext(preg, s);
1623                 if (next == NULL)               /* Next ptr. */
1624                         printf("(0)");
1625                 else
1626                         printf("(%d)", (int)((s-preg->program)+(next-s)));
1627                 s += 2;
1628                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1629                         int max = s[0];
1630                         int min = s[1];
1631                         if (max == 65535) {
1632                                 printf("{%d,*}", min);
1633                         }
1634                         else {
1635                                 printf("{%d,%d}", min, max);
1636                         }
1637                         printf(" %d", s[2]);
1638                         s += 3;
1639                 }
1640                 else if (op == ANYOF || op == ANYBUT) {
1641                         /* set of ranges */
1642
1643                         while (*s) {
1644                                 int len = *s++;
1645                                 int first = *s++;
1646                                 buf[utf8_fromunicode(buf, first)] = 0;
1647                                 printf("%s", buf);
1648                                 if (len > 1) {
1649                                         buf[utf8_fromunicode(buf, first + len - 1)] = 0;
1650                                         printf("-%s", buf);
1651                                 }
1652                         }
1653                         s++;
1654                 }
1655                 else if (op == EXACTLY) {
1656                         /* Literal string, where present. */
1657
1658                         while (*s) {
1659                                 buf[utf8_fromunicode(buf, *s)] = 0;
1660                                 printf("%s", buf);
1661                                 s++;
1662                         }
1663                         s++;
1664                 }
1665                 putchar('\n');
1666         }
1667
1668         if (op == END) {
1669                 /* Header fields of interest. */
1670                 if (preg->regstart) {
1671                         buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1672                         printf("start '%s' ", buf);
1673                 }
1674                 if (preg->reganch)
1675                         printf("anchored ");
1676                 if (preg->regmust != NULL) {
1677                         int i;
1678                         printf("must have:");
1679                         for (i = 0; i < preg->regmlen; i++) {
1680                                 putchar(preg->regmust[i]);
1681                         }
1682                         putchar('\n');
1683                 }
1684         }
1685         printf("\n");
1686 }
1687
1688 /*
1689  - regprop - printable representation of opcode
1690  */
1691 static const char *regprop( const int *op )
1692 {
1693         char *p;
1694         static char buf[50];
1695
1696         (void) strcpy(buf, ":");
1697
1698         switch (OP(op)) {
1699         case BOL:
1700                 p = "BOL";
1701                 break;
1702         case EOL:
1703                 p = "EOL";
1704                 break;
1705         case ANY:
1706                 p = "ANY";
1707                 break;
1708         case ANYOF:
1709                 p = "ANYOF";
1710                 break;
1711         case ANYBUT:
1712                 p = "ANYBUT";
1713                 break;
1714         case BRANCH:
1715                 p = "BRANCH";
1716                 break;
1717         case EXACTLY:
1718                 p = "EXACTLY";
1719                 break;
1720         case NOTHING:
1721                 p = "NOTHING";
1722                 break;
1723         case BACK:
1724                 p = "BACK";
1725                 break;
1726         case END:
1727                 p = "END";
1728                 break;
1729         case REP:
1730                 p = "REP";
1731                 break;
1732         case REPMIN:
1733                 p = "REPMIN";
1734                 break;
1735         case REPX:
1736                 p = "REPX";
1737                 break;
1738         case REPXMIN:
1739                 p = "REPXMIN";
1740                 break;
1741         case WORDA:
1742                 p = "WORDA";
1743                 break;
1744         case WORDZ:
1745                 p = "WORDZ";
1746                 break;
1747         default:
1748                 if (OP(op) >= OPEN && OP(op) < CLOSE) {
1749                         sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1750                 }
1751                 else if (OP(op) >= CLOSE && OP(op) < CLOSE_END) {
1752                         sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1753                 }
1754                 else {
1755                         abort();
1756                 }
1757                 p = NULL;
1758                 break;
1759         }
1760         if (p != NULL)
1761                 (void) strcat(buf, p);
1762         return(buf);
1763 }
1764 #endif
1765
1766 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1767 {
1768         static const char *error_strings[] = {
1769                 "success",
1770                 "no match",
1771                 "bad pattern",
1772                 "null argument",
1773                 "unknown error",
1774                 "too big",
1775                 "out of memory",
1776                 "too many ()",
1777                 "parentheses () not balanced",
1778                 "braces {} not balanced",
1779                 "invalid repetition count(s)",
1780                 "extra characters",
1781                 "*+ of empty atom",
1782                 "nested count",
1783                 "internal error",
1784                 "count follows nothing",
1785                 "trailing backslash",
1786                 "corrupted program",
1787                 "contains null char",
1788         };
1789         const char *err;
1790
1791         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1792                 err = "Bad error code";
1793         }
1794         else {
1795                 err = error_strings[errcode];
1796         }
1797
1798         return snprintf(errbuf, errbuf_size, "%s", err);
1799 }
1800
1801 void regfree(regex_t *preg)
1802 {
1803         free(preg->program);
1804 }
1805
1806 #endif