jimregexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *      Copyright (c) 1986 by University of Toronto.
   5  *      Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *      Permission is granted to anyone to use this software for any
   8  *      purpose on any computer system, and to redistribute it freely,
   9  *      subject to the following restrictions:
  10  *
  11  *      1. The author is not responsible for the consequences of use of
  12  *              this software, no matter how awful, even if they arise
  13  *              from defects in it.
  14  *
  15  *      2. The origin of this software must not be misrepresented, either
  16  *              by explicit claim or by omission.
  17  *
  18  *      3. Altered versions must be plainly marked as such, and must not
  19  *              be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences. It also removes \n as an alternative to |.
  47  *
  48  * Beware that some of this code is subtly aware of the way operator
  49  * precedence is structured in regular expressions.  Serious changes in
  50  * regular-expression syntax might require a total rethink.
  51  */
  52 #include <stdio.h>
  53 #include <ctype.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56
  57 #include "jim.h"
  58 #include "jimautoconf.h"
  59 #include "jimregexp.h"
  60 #include "utf8.h"
  61
  62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
  63
  64 /*
  65  * Structure for regexp "program".  This is essentially a linear encoding
  66  * of a nondeterministic finite-state machine (aka syntax charts or
  67  * "railroad normal form" in parsing technology).  Each node is an opcode
  68  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  69  * all nodes except BRANCH implement concatenation; a "next" pointer with
  70  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  71  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  72  * opposed to a collection of them) is never concatenated with anything
  73  * because of operator precedence.)  The operand of some types of node is
  74  * a literal string; for others, it is a node leading into a sub-FSM.  In
  75  * particular, the operand of a BRANCH node is the first node of the branch.
  76  * (NB this is *not* a tree structure:  the tail of the branch connects
  77  * to the thing following the set of BRANCHes.)  The opcodes are:
  78  */
  79
  80 /* This *MUST* be less than (255-20)/2=117 */
  81 #define REG_MAX_PAREN 100
  82
  83 /* definition   number  opnd?   meaning */
  84 #define END     0       /* no   End of program. */
  85 #define BOL     1       /* no   Match "" at beginning of line. */
  86 #define EOL     2       /* no   Match "" at end of line. */
  87 #define ANY     3       /* no   Match any one character. */
  88 #define ANYOF   4       /* str  Match any character in this string. */
  89 #define ANYBUT  5       /* str  Match any character not in this string. */
  90 #define BRANCH  6       /* node Match this alternative, or the next... */
  91 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  92 #define EXACTLY 8       /* str  Match this string. */
  93 #define NOTHING 9       /* no   Match empty string. */
  94 #define REP             10      /* max,min      Match this (simple) thing [min,max] times. */
  95 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, mininal match. */
  96 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
  97 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
  98
  99 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 100 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 101 #define OPEN    20      /* no   Mark this point in input as start of #n. */
 102                         /*      OPEN+1 is number 1, etc. */
 103 #define CLOSE   (OPEN+REG_MAX_PAREN)    /* no   Analogous to OPEN. */
 104 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 105
 106 /*
 107  * The first byte of the regexp internal "program" is actually this magic
 108  * number; the start node begins in the second byte.
 109  */
 110 #define REG_MAGIC       0xFADED00D
 111
 112 /*
 113  * Opcode notes:
 114  *
 115  * BRANCH       The set of branches constituting a single choice are hooked
 116  *              together with their "next" pointers, since precedence prevents
 117  *              anything being concatenated to any individual branch.  The
 118  *              "next" pointer of the last BRANCH in a choice points to the
 119  *              thing following the whole choice.  This is also where the
 120  *              final "next" pointer of each individual branch points; each
 121  *              branch starts with the operand node of a BRANCH node.
 122  *
 123  * BACK         Normal "next" pointers all implicitly point forward; BACK
 124  *              exists to make loop structures possible.
 125  *
 126  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 127  *              BRANCH structures using BACK.  Simple cases (one character
 128  *              per match) are implemented with STAR and PLUS for speed
 129  *              and to minimize recursive plunges.
 130  *
 131  * OPEN,CLOSE   ...are numbered at compile time.
 132  */
 133
 134 /*
 135  * A node is one char of opcode followed by two chars of "next" pointer.
 136  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 137  * value is a positive offset from the opcode of the node containing it.
 138  * An operand, if any, simply follows the node.  (Note that much of the
 139  * code generation knows about this implicit relationship.)
 140  *
 141  * Using two bytes for the "next" pointer is vast overkill for most things,
 142  * but allows patterns to get big without disasters.
 143  */
 144 #define OP(p)   ((p)[0])
 145 #define NEXT(p) ((p)[1])
 146 #define OPERAND(p)      ((p) + 2)
 147
 148 /*
 149  * See regmagic.h for one further detail of program structure.
 150  */
 151
 152
 153 /*
 154  * Utility definitions.
 155  */
 156
 157 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 158 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 159 #define META    "^$.[()|?{+*"
 160
 161 /*
 162  * Flags to be passed up and down.
 163  */
 164 #define HASWIDTH        01      /* Known never to match null string. */
 165 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
 166 #define SPSTART         04      /* Starts with * or +. */
 167 #define WORST           0       /* Worst case. */
 168
 169 #define MAX_REP_COUNT 1000000
 170
 171 /*
 172  * Forward declarations for regcomp()'s friends.
 173  */
 174 static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 175 static int *regpiece(regex_t *preg, int *flagp );
 176 static int *regbranch(regex_t *preg, int *flagp );
 177 static int *regatom(regex_t *preg, int *flagp );
 178 static int *regnode(regex_t *preg, int op );
 179 static const int *regnext(regex_t *preg, const int *p );
 180 static void regc(regex_t *preg, int b );
 181 static int *reginsert(regex_t *preg, int op, int size, int *opnd );
 182 static void regtail(regex_t *preg, int *p, const int *val );
 183 static void regoptail(regex_t *preg, int *p, const int *val );
 184
 185 static int reg_range_find(const int *string, int c);
 186 static const char *str_find(const char *string, int c, int nocase);
 187 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 188
 189 /*#define DEBUG*/
 190 #ifdef DEBUG
 191 int regnarrate = 0;
 192 static void regdump(regex_t *preg);
 193 static const char *regprop( const int *op );
 194 #endif
 195
 196
 197 static int regdummy;
 198
 199 /**
 200  * Returns the length of the null-terminated integer sequence.
 201  */
 202 static int str_int_len(const int *seq)
 203 {
 204         int n = 0;
 205         while (*seq++) {
 206                 n++;
 207         }
 208         return n;
 209 }
 210
 211 /*
 212  - regcomp - compile a regular expression into internal code
 213  *
 214  * We can't allocate space until we know how big the compiled form will be,
 215  * but we can't compile it (and thus know how big it is) until we've got a
 216  * place to put the code.  So we cheat:  we compile it twice, once with code
 217  * generation turned off and size counting turned on, and once "for real".
 218  * This also means that we don't allocate space until we are sure that the
 219  * thing really will compile successfully, and we never have to move the
 220  * code and thus invalidate pointers into it.  (Note that it has to be in
 221  * one piece because free() must be able to free it all.)
 222  *
 223  * Beware that the optimization-preparation code in here knows about some
 224  * of the structure of the compiled regexp.
 225  */
 226 int regcomp(regex_t *preg, const char *exp, int cflags)
 227 {
 228         const int *scan;
 229         const int *longest;
 230         unsigned len;
 231         int flags;
 232
 233 #ifdef DEBUG
 234         fprintf(stderr, "Compiling: '%s'\n", exp);
 235 #endif
 236         memset(preg, 0, sizeof(*preg));
 237
 238         if (exp == NULL)
 239                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 240
 241         /* First pass: determine size, legality. */
 242         preg->cflags = cflags;
 243         preg->regparse = exp;
 244         preg->re_nsub = 0;
 245         preg->regsize = 0L;
 246         preg->regcode = &regdummy;
 247         regc(preg, REG_MAGIC);
 248         if (reg(preg, 0, &flags) == NULL)
 249                 return preg->err;
 250
 251         /* Small enough for pointer-storage convention? */
 252         if (preg->regsize >= 32767L || preg->re_nsub >= REG_MAX_PAREN)          /* Probably could be 65535L. */
 253                 FAIL(preg,REG_ERR_TOO_BIG);
 254
 255         /* Allocate space. */
 256         preg->program = malloc(preg->regsize * sizeof(*preg->program));
 257         if (preg->program == NULL)
 258                 FAIL(preg, REG_ERR_NOMEM);
 259
 260         /* Second pass: emit code. */
 261         preg->regparse = exp;
 262         preg->re_nsub = 0;
 263         preg->regsize = 0L;
 264         preg->regcode = preg->program;
 265         regc(preg, REG_MAGIC);
 266         if (reg(preg, 0, &flags) == NULL)
 267                 return preg->err;
 268
 269         /* Dig out information for optimizations. */
 270         preg->regstart = 0;     /* Worst-case defaults. */
 271         preg->reganch = 0;
 272         preg->regmust = NULL;
 273         preg->regmlen = 0;
 274         scan = preg->program+1;                 /* First BRANCH. */
 275         if (OP(regnext(preg, scan)) == END) {           /* Only one top-level choice. */
 276                 scan = OPERAND(scan);
 277
 278                 /* Starting-point info. */
 279                 if (OP(scan) == EXACTLY) {
 280                         preg->regstart = *OPERAND(scan);
 281                 }
 282                 else if (OP(scan) == BOL)
 283                         preg->reganch++;
 284
 285                 /*
 286                  * If there's something expensive in the r.e., find the
 287                  * longest literal string that must appear and make it the
 288                  * regmust.  Resolve ties in favor of later strings, since
 289                  * the regstart check works with the beginning of the r.e.
 290                  * and avoiding duplication strengthens checking.  Not a
 291                  * strong reason, but sufficient in the absence of others.
 292                  */
 293                 if (flags&SPSTART) {
 294                         longest = NULL;
 295                         len = 0;
 296                         for (; scan != NULL; scan = regnext(preg, scan)) {
 297                                 if (OP(scan) == EXACTLY) {
 298                                         int plen = str_int_len(OPERAND(scan));
 299                                         if (plen >= len) {
 300                                                 longest = OPERAND(scan);
 301                                                 len = plen;
 302                                         }
 303                                 }
 304                         }
 305                         preg->regmust = longest;
 306                         preg->regmlen = len;
 307                 }
 308         }
 309
 310 #ifdef DEBUG
 311         regdump(preg);
 312 #endif
 313
 314         return 0;
 315 }
 316
 317 /*
 318  - reg - regular expression, i.e. main body or parenthesized thing
 319  *
 320  * Caller must absorb opening parenthesis.
 321  *
 322  * Combining parenthesis handling with the base level of regular expression
 323  * is a trifle forced, but the need to tie the tails of the branches to what
 324  * follows makes it hard to avoid.
 325  */
 326 static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 327 {
 328         int *ret;
 329         int *br;
 330         const int *ender;
 331         int parno = 0;
 332         int flags;
 333
 334         *flagp = HASWIDTH;      /* Tentatively. */
 335
 336         /* Make an OPEN node, if parenthesized. */
 337         if (paren) {
 338                 parno = ++preg->re_nsub;
 339                 ret = regnode(preg, OPEN+parno);
 340         } else
 341                 ret = NULL;
 342
 343         /* Pick up the branches, linking them together. */
 344         br = regbranch(preg, &flags);
 345         if (br == NULL)
 346                 return(NULL);
 347         if (ret != NULL)
 348                 regtail(preg, ret, br); /* OPEN -> first. */
 349         else
 350                 ret = br;
 351         if (!(flags&HASWIDTH))
 352                 *flagp &= ~HASWIDTH;
 353         *flagp |= flags&SPSTART;
 354         while (*preg->regparse == '|') {
 355                 preg->regparse++;
 356                 br = regbranch(preg, &flags);
 357                 if (br == NULL)
 358                         return(NULL);
 359                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 360                 if (!(flags&HASWIDTH))
 361                         *flagp &= ~HASWIDTH;
 362                 *flagp |= flags&SPSTART;
 363         }
 364
 365         /* Make a closing node, and hook it on the end. */
 366         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 367         regtail(preg, ret, ender);
 368
 369         /* Hook the tails of the branches to the closing node. */
 370         for (br = ret; br != NULL; br = (int *)regnext(preg, br))
 371                 regoptail(preg, br, ender);
 372
 373         /* Check for proper termination. */
 374         if (paren && *preg->regparse++ != ')') {
 375                 preg->err = REG_ERR_UNMATCHED_PAREN;
 376                 return NULL;
 377         } else if (!paren && *preg->regparse != '\0') {
 378                 if (*preg->regparse == ')') {
 379                         preg->err = REG_ERR_UNMATCHED_PAREN;
 380                         return NULL;
 381                 } else {
 382                         preg->err = REG_ERR_JUNK_ON_END;
 383                         return NULL;
 384                 }
 385         }
 386
 387         return(ret);
 388 }
 389
 390 /*
 391  - regbranch - one alternative of an | operator
 392  *
 393  * Implements the concatenation operator.
 394  */
 395 static int *regbranch(regex_t *preg, int *flagp )
 396 {
 397         int *ret;
 398         int *chain;
 399         int *latest;
 400         int flags;
 401
 402         *flagp = WORST;         /* Tentatively. */
 403
 404         ret = regnode(preg, BRANCH);
 405         chain = NULL;
 406         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 407                *preg->regparse != '|') {
 408                 latest = regpiece(preg, &flags);
 409                 if (latest == NULL)
 410                         return(NULL);
 411                 *flagp |= flags&HASWIDTH;
 412                 if (chain == NULL) {/* First piece. */
 413                         *flagp |= flags&SPSTART;
 414                 }
 415                 else {
 416                         regtail(preg, chain, latest);
 417                 }
 418                 chain = latest;
 419         }
 420         if (chain == NULL)      /* Loop ran zero times. */
 421                 (void) regnode(preg, NOTHING);
 422
 423         return(ret);
 424 }
 425
 426 /*
 427  - regpiece - something followed by possible [*+?]
 428  *
 429  * Note that the branching code sequences used for ? and the general cases
 430  * of * and + are somewhat optimized:  they use the same NOTHING node as
 431  * both the endmarker for their branch list and the body of the last branch.
 432  * It might seem that this node could be dispensed with entirely, but the
 433  * endmarker role is not redundant.
 434  */
 435 static int *regpiece(regex_t *preg, int *flagp)
 436 {
 437         int *ret;
 438         char op;
 439         int *next;
 440         int flags;
 441         int size = preg->regsize;
 442         int *chain = NULL;
 443         int min;
 444         int max;
 445
 446         ret = regatom(preg, &flags);
 447         if (ret == NULL)
 448                 return(NULL);
 449
 450         size = preg->regsize - size;
 451
 452         op = *preg->regparse;
 453         if (!ISMULT(op)) {
 454                 *flagp = flags;
 455                 return(ret);
 456         }
 457
 458         if (!(flags&HASWIDTH) && op != '?') {
 459                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 460                 return NULL;
 461         }
 462
 463         /* Handle braces (counted repetition) by expansion */
 464         if (op == '{') {
 465                 char *end;
 466
 467                 min = strtoul(preg->regparse + 1, &end, 10);
 468                 if (end == preg->regparse + 1) {
 469                         preg->err = REG_ERR_BAD_COUNT;
 470                         return NULL;
 471                 }
 472                 if (*end == '}') {
 473                         max = min;
 474                 }
 475                 else {
 476                         preg->regparse = end;
 477                         max = strtoul(preg->regparse + 1, &end, 10);
 478                         if (*end != '}') {
 479                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 480                                 return NULL;
 481                         }
 482                 }
 483                 if (end == preg->regparse + 1) {
 484                         max = MAX_REP_COUNT;
 485                 }
 486                 else if (max < min || max >= 100) {
 487                         preg->err = REG_ERR_BAD_COUNT;
 488                         return NULL;
 489                 }
 490                 if (min >= 100) {
 491                         preg->err = REG_ERR_BAD_COUNT;
 492                         return NULL;
 493                 }
 494
 495                 preg->regparse = strchr(preg->regparse, '}');
 496         }
 497         else {
 498                 min = (op == '+');
 499                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 500         }
 501
 502         if (preg->regparse[1] == '?') {
 503                 preg->regparse++;
 504                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 505         }
 506         else {
 507                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 508         }
 509         ret[2] = max;
 510         ret[3] = min;
 511         ret[4] = 0;
 512
 513         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 514
 515         if (!(flags & SIMPLE)) {
 516                 int *back = regnode(preg, BACK);
 517                 regtail(preg, back, ret);
 518                 regtail(preg, next, back);
 519         }
 520
 521         preg->regparse++;
 522         if (ISMULT(*preg->regparse)) {
 523                 preg->err = REG_ERR_NESTED_COUNT;
 524                 return NULL;
 525         }
 526
 527         return chain ? chain : ret;
 528 }
 529
 530 /**
 531  * Add all characters in the inclusive range between lower and upper.
 532  *
 533  * Handles a swapped range (upper < lower).
 534  */
 535 static void reg_addrange(regex_t *preg, int lower, int upper)
 536 {
 537         if (lower > upper) {
 538                 reg_addrange(preg, upper, lower);
 539         }
 540         /* Add a range as length, start */
 541         regc(preg, upper - lower + 1);
 542         regc(preg, lower);
 543 }
 544
 545 /**
 546  * Add a null-terminated literal string as a set of ranges.
 547  */
 548 static void reg_addrange_str(regex_t *preg, const char *str)
 549 {
 550         while (*str) {
 551                 reg_addrange(preg, *str, *str);
 552                 str++;
 553         }
 554 }
 555
 556 /**
 557  * Extracts the next unicode char from utf8.
 558  *
 559  * If 'upper' is set, converts the char to uppercase.
 560  */
 561 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 562 {
 563         int l = utf8_tounicode(s, uc);
 564         if (upper) {
 565                 *uc = utf8_upper(*uc);
 566         }
 567         return l;
 568 }
 569
 570 /**
 571  * Converts a hex digit to decimal.
 572  *
 573  * Returns -1 for an invalid hex digit.
 574  */
 575 static int hexdigitval(int c)
 576 {
 577         if (c >= '0' && c <= '9')
 578                 return c - '0';
 579         if (c >= 'a' && c <= 'f')
 580                 return c - 'a' + 10;
 581         if (c >= 'A' && c <= 'F')
 582                 return c - 'A' + 10;
 583         return -1;
 584 }
 585
 586 /**
 587  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 588  *
 589  * Returns the number of hex digits parsed.
 590  * If there are no hex digits, returns 0 and stores nothing.
 591  */
 592 static int parse_hex(const char *s, int n, int *uc)
 593 {
 594         int val = 0;
 595         int k;
 596
 597         for (k = 0; k < n; k++) {
 598                 int c = hexdigitval(*s++);
 599                 if (c == -1) {
 600                         break;
 601                 }
 602                 val = (val << 4) | c;
 603         }
 604         if (k) {
 605                 *uc = val;
 606         }
 607         return k;
 608 }
 609
 610 /**
 611  * Call for chars after a backlash to decode the escape sequence.
 612  *
 613  * Stores the result in *ch.
 614  *
 615  * Returns the number of bytes consumed.
 616  */
 617 static int reg_decode_escape(const char *s, int *ch)
 618 {
 619         int n;
 620         const char *s0 = s;
 621
 622         *ch = *s++;
 623
 624         switch (*ch) {
 625                 case 'b': *ch = '\b'; break;
 626                 case 'e': *ch = 27; break;
 627                 case 'f': *ch = '\f'; break;
 628                 case 'n': *ch = '\n'; break;
 629                 case 'r': *ch = '\r'; break;
 630                 case 't': *ch = '\t'; break;
 631                 case 'v': *ch = '\v'; break;
 632                 case 'u':
 633                         if ((n = parse_hex(s, 4, ch)) > 0) {
 634                                 s += n;
 635                         }
 636                         break;
 637                 case 'x':
 638                         if ((n = parse_hex(s, 2, ch)) > 0) {
 639                                 s += n;
 640                         }
 641                         break;
 642                 case '\0':
 643                         s--;
 644                         *ch = '\\';
 645                         break;
 646         }
 647         return s - s0;
 648 }
 649
 650 /*
 651  - regatom - the lowest level
 652  *
 653  * Optimization:  gobbles an entire sequence of ordinary characters so that
 654  * it can turn them into a single node, which is smaller to store and
 655  * faster to run.  Backslashed characters are exceptions, each becoming a
 656  * separate node; the code is simpler that way and it's not worth fixing.
 657  */
 658 static int *regatom(regex_t *preg, int *flagp)
 659 {
 660         int *ret;
 661         int flags;
 662         int nocase = (preg->cflags & REG_ICASE);
 663
 664         int ch;
 665         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 666
 667         *flagp = WORST;         /* Tentatively. */
 668
 669         preg->regparse += n;
 670         switch (ch) {
 671         /* FIXME: these chars only have meaning at beg/end of pat? */
 672         case '^':
 673                 ret = regnode(preg, BOL);
 674                 break;
 675         case '$':
 676                 ret = regnode(preg, EOL);
 677                 break;
 678         case '.':
 679                 ret = regnode(preg, ANY);
 680                 *flagp |= HASWIDTH|SIMPLE;
 681                 break;
 682         case '[': {
 683                         const char *pattern = preg->regparse;
 684
 685                         if (*pattern == '^') {  /* Complement of range. */
 686                                 ret = regnode(preg, ANYBUT);
 687                                 pattern++;
 688                         } else
 689                                 ret = regnode(preg, ANYOF);
 690
 691                         /* Special case. If the first char is ']' or '-', it is part of the set */
 692                         if (*pattern == ']' || *pattern == '-') {
 693                                 reg_addrange(preg, *pattern, *pattern);
 694                                 pattern++;
 695                         }
 696
 697                         while (*pattern && *pattern != ']') {
 698                                 /* Is this a range? a-z */
 699                                 int start;
 700                                 int end;
 701
 702                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 703                                 if (start == '\\') {
 704                                         pattern += reg_decode_escape(pattern, &start);
 705                                         if (start == 0) {
 706                                                 preg->err = REG_ERR_NULL_CHAR;
 707                                                 return NULL;
 708                                         }
 709                                 }
 710                                 if (pattern[0] == '-' && pattern[1]) {
 711                                         /* skip '-' */
 712                                         pattern += utf8_tounicode(pattern, &end);
 713                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 714                                         if (end == '\\') {
 715                                                 pattern += reg_decode_escape(pattern, &end);
 716                                                 if (end == 0) {
 717                                                         preg->err = REG_ERR_NULL_CHAR;
 718                                                         return NULL;
 719                                                 }
 720                                         }
 721
 722                                         reg_addrange(preg, start, end);
 723                                         continue;
 724                                 }
 725                                 if (start == '[') {
 726                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 727                                                 if ((preg->cflags & REG_ICASE) == 0) {
 728                                                         reg_addrange(preg, 'a', 'z');
 729                                                 }
 730                                                 reg_addrange(preg, 'A', 'Z');
 731                                                 pattern += 8;
 732                                                 continue;
 733                                         }
 734                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 735                                                 if ((preg->cflags & REG_ICASE) == 0) {
 736                                                         reg_addrange(preg, 'a', 'z');
 737                                                 }
 738                                                 reg_addrange(preg, 'A', 'Z');
 739                                                 reg_addrange(preg, '0', '9');
 740                                                 pattern += 8;
 741                                                 continue;
 742                                         }
 743                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 744                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 745                                                 pattern += 8;
 746                                                 continue;
 747                                         }
 748                                 }
 749                                 /* Not a range, so just add the char */
 750                                 reg_addrange(preg, start, start);
 751                         }
 752                         regc(preg, '\0');
 753
 754                         if (*pattern) {
 755                                 pattern++;
 756                         }
 757                         preg->regparse = pattern;
 758
 759                         *flagp |= HASWIDTH|SIMPLE;
 760                 }
 761                 break;
 762         case '(':
 763                 ret = reg(preg, 1, &flags);
 764                 if (ret == NULL)
 765                         return(NULL);
 766                 *flagp |= flags&(HASWIDTH|SPSTART);
 767                 break;
 768         case '\0':
 769         case '|':
 770         case ')':
 771                 preg->err = REG_ERR_INTERNAL;
 772                 return NULL;    /* Supposed to be caught earlier. */
 773         case '?':
 774         case '+':
 775         case '*':
 776         case '{':
 777                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 778                 return NULL;
 779         case '\\':
 780                 switch (*preg->regparse++) {
 781                 case '\0':
 782                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 783                         return NULL;
 784                         break;
 785                 case '<':
 786                 case 'm':
 787                         ret = regnode(preg, WORDA);
 788                         break;
 789                 case '>':
 790                 case 'M':
 791                         ret = regnode(preg, WORDZ);
 792                         break;
 793                 case 'd':
 794                         ret = regnode(preg, ANYOF);
 795                         reg_addrange(preg, '0', '9');
 796                         regc(preg, '\0');
 797                         *flagp |= HASWIDTH|SIMPLE;
 798                         break;
 799                 case 'w':
 800                         ret = regnode(preg, ANYOF);
 801                         if ((preg->cflags & REG_ICASE) == 0) {
 802                                 reg_addrange(preg, 'a', 'z');
 803                         }
 804                         reg_addrange(preg, 'A', 'Z');
 805                         reg_addrange(preg, '0', '9');
 806                         reg_addrange(preg, '_', '_');
 807                         regc(preg, '\0');
 808                         *flagp |= HASWIDTH|SIMPLE;
 809                         break;
 810                 case 's':
 811                         ret = regnode(preg, ANYOF);
 812                         reg_addrange_str(preg," \t\r\n\f\v");
 813                         regc(preg, '\0');
 814                         *flagp |= HASWIDTH|SIMPLE;
 815                         break;
 816                 /* FIXME: Someday handle \1, \2, ... */
 817                 default:
 818                         /* Handle general quoted chars in exact-match routine */
 819                         /* Back up to include the backslash */
 820                         preg->regparse--;
 821                         goto de_fault;
 822                 }
 823                 break;
 824         de_fault:
 825         default: {
 826                         /*
 827                          * Encode a string of characters to be matched exactly.
 828                          */
 829                         int added = 0;
 830
 831                         /* Back up to pick up the first char of interest */
 832                         preg->regparse -= n;
 833
 834                         ret = regnode(preg, EXACTLY);
 835
 836                         /* Note that a META operator such as ? or * consumes the
 837                          * preceding char.
 838                          * Thus we must be careful to look ahead by 2 and add the
 839                          * last char as it's own EXACTLY if necessary
 840                          */
 841
 842                         /* Until end of string or a META char is reached */
 843                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 844                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 845                                 if (ch == '\\' && preg->regparse[n]) {
 846                                         /* Non-trailing backslash.
 847                                          * Is this a special escape, or a regular escape?
 848                                          */
 849                                         if (strchr("<>mMwds", preg->regparse[n])) {
 850                                                 /* A special escape. All done with EXACTLY */
 851                                                 break;
 852                                         }
 853                                         /* Decode it. Note that we add the length for the escape
 854                                          * sequence to the length for the backlash so we can skip
 855                                          * the entire sequence, or not as required.
 856                                          */
 857                                         n += reg_decode_escape(preg->regparse + n, &ch);
 858                                         if (ch == 0) {
 859                                                 preg->err = REG_ERR_NULL_CHAR;
 860                                                 return NULL;
 861                                         }
 862                                 }
 863
 864                                 /* Now we have one char 'ch' of length 'n'.
 865                                  * Check to see if the following char is a MULT
 866                                  */
 867
 868                                 if (ISMULT(preg->regparse[n])) {
 869                                         /* Yes. But do we already have some EXACTLY chars? */
 870                                         if (added) {
 871                                                 /* Yes, so return what we have and pick up the current char next time around */
 872                                                 break;
 873                                         }
 874                                         /* No, so add this single char and finish */
 875                                         regc(preg, ch);
 876                                         added++;
 877                                         preg->regparse += n;
 878                                         break;
 879                                 }
 880
 881                                 /* No, so just add this char normally */
 882                                 regc(preg, ch);
 883                                 added++;
 884                                 preg->regparse += n;
 885                         }
 886                         regc(preg, '\0');
 887
 888                         *flagp |= HASWIDTH;
 889                         if (added == 1)
 890                                 *flagp |= SIMPLE;
 891                         break;
 892                 }
 893                 break;
 894         }
 895
 896         return(ret);
 897 }
 898
 899 /*
 900  - regnode - emit a node
 901  */
 902 /* Location. */
 903 static int *regnode(regex_t *preg, int op)
 904 {
 905         int *ret;
 906         int *ptr;
 907
 908         preg->regsize += 2;
 909         ret = preg->regcode;
 910         if (ret == &regdummy) {
 911                 return(ret);
 912         }
 913
 914         ptr = ret;
 915         *ptr++ = op;
 916         *ptr++ = 0;             /* Null "next" pointer. */
 917         preg->regcode = ptr;
 918
 919         return(ret);
 920 }
 921
 922 /*
 923  - regc - emit (if appropriate) a byte of code
 924  */
 925 static void regc(regex_t *preg, int b )
 926 {
 927         preg->regsize++;
 928         if (preg->regcode != &regdummy)
 929                 *preg->regcode++ = b;
 930 }
 931
 932 /*
 933  - reginsert - insert an operator in front of already-emitted operand
 934  *
 935  * Means relocating the operand.
 936  * Returns the new location of the original operand.
 937  */
 938 static int *reginsert(regex_t *preg, int op, int size, int *opnd )
 939 {
 940         int *src;
 941         int *dst;
 942         int *place;
 943
 944         preg->regsize += size;
 945
 946         if (preg->regcode == &regdummy) {
 947                 return opnd;
 948         }
 949
 950         src = preg->regcode;
 951         preg->regcode += size;
 952         dst = preg->regcode;
 953         while (src > opnd)
 954                 *--dst = *--src;
 955
 956         place = opnd;           /* Op node, where operand used to be. */
 957         *place++ = op;
 958         while (--size) {
 959                 *place++ = 0;
 960         }
 961
 962         return place;
 963 }
 964
 965 /*
 966  - regtail - set the next-pointer at the end of a node chain
 967  */
 968 static void regtail(regex_t *preg, int *p, const int *val )
 969 {
 970         int *scan;
 971         int *temp;
 972         int offset;
 973
 974         if (p == &regdummy)
 975                 return;
 976
 977         /* Find last node. */
 978         scan = p;
 979         for (;;) {
 980                 temp = (int *)regnext(preg, scan);
 981                 if (temp == NULL)
 982                         break;
 983                 scan = temp;
 984         }
 985
 986         if (OP(scan) == BACK)
 987                 offset = scan - val;
 988         else
 989                 offset = val - scan;
 990
 991         scan[1] = offset;
 992 }
 993
 994 /*
 995  - regoptail - regtail on operand of first argument; nop if operandless
 996  */
 997
 998 static void regoptail(regex_t *preg, int *p, const int *val )
 999 {
1000         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1001         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1002                 return;
1003         regtail(preg, OPERAND(p), val);
1004 }
1005
1006 /*
1007  * regexec and friends
1008  */
1009
1010 /*
1011  * Forwards.
1012  */
1013 static int regtry(regex_t *preg, const char *string );
1014 static int regmatch(regex_t *preg, const int *prog);
1015 static int regrepeat(regex_t *preg, const int *p, int max);
1016
1017 /*
1018  - regexec - match a regexp against a string
1019  */
1020 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1021 {
1022         const char *s;
1023         const int *scan;
1024
1025         /* Be paranoid... */
1026         if (preg == NULL || preg->program == NULL || string == NULL) {
1027                 return REG_ERR_NULL_ARGUMENT;
1028         }
1029
1030         /* Check validity of program. */
1031         if (*preg->program != REG_MAGIC) {
1032                 return REG_ERR_CORRUPTED;
1033         }
1034
1035 #ifdef DEBUG
1036         fprintf(stderr, "regexec: %s\n", string);
1037         regdump(preg);
1038 #endif
1039
1040         preg->eflags = eflags;
1041         preg->pmatch = pmatch;
1042         preg->nmatch = nmatch;
1043         preg->start = string;   /* All offsets are computed from here */
1044
1045         /* Must clear out the embedded repeat counts */
1046         for (scan = OPERAND(preg->program + 1); scan != NULL; scan = regnext(preg, scan)) {
1047                 switch (OP(scan)) {
1048                 case REP:
1049                 case REPMIN:
1050                 case REPX:
1051                 case REPXMIN:
1052                         *(int *)(scan + 4) = 0;
1053                         break;
1054                 }
1055         }
1056
1057         /* If there is a "must appear" string, look for it. */
1058         if (preg->regmust != NULL) {
1059                 s = string;
1060                 while ((s = str_find(s, preg->regmust[0], preg->cflags & REG_ICASE)) != NULL) {
1061                         if (prefix_cmp(preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1062                                 break;
1063                         }
1064                         s++;
1065                 }
1066                 if (s == NULL)  /* Not present. */
1067                         return REG_NOMATCH;
1068         }
1069
1070         /* Mark beginning of line for ^ . */
1071         preg->regbol = string;
1072
1073         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1074         if (preg->reganch) {
1075                 if (eflags & REG_NOTBOL) {
1076                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1077                         goto nextline;
1078                 }
1079                 while (1) {
1080                         int ret = regtry(preg, string);
1081                         if (ret) {
1082                                 return REG_NOERROR;
1083                         }
1084                         if (*string) {
1085 nextline:
1086                                 if (preg->cflags & REG_NEWLINE) {
1087                                         /* Try the next anchor? */
1088                                         string = strchr(string, '\n');
1089                                         if (string) {
1090                                                 preg->regbol = ++string;
1091                                                 continue;
1092                                         }
1093                                 }
1094                         }
1095                         return REG_NOMATCH;
1096                 }
1097         }
1098
1099         /* Messy cases:  unanchored match. */
1100         s = string;
1101         if (preg->regstart != '\0') {
1102                 /* We know what char it must start with. */
1103                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1104                         if (regtry(preg, s))
1105                                 return REG_NOERROR;
1106                         s++;
1107                 }
1108         }
1109         else
1110                 /* We don't -- general case. */
1111                 while (1) {
1112                         if (regtry(preg, s))
1113                                 return REG_NOERROR;
1114                         if (*s == '\0') {
1115                                 break;
1116                         }
1117                         s += utf8_charlen(*s);
1118                 }
1119
1120         /* Failure. */
1121         return REG_NOMATCH;
1122 }
1123
1124 /*
1125  - regtry - try match at specific point
1126  */
1127                         /* 0 failure, 1 success */
1128 static int regtry( regex_t *preg, const char *string )
1129 {
1130         int i;
1131
1132         preg->reginput = string;
1133
1134         for (i = 0; i < preg->nmatch; i++) {
1135                 preg->pmatch[i].rm_so = -1;
1136                 preg->pmatch[i].rm_eo = -1;
1137         }
1138         if (regmatch(preg, preg->program + 1)) {
1139                 preg->pmatch[0].rm_so = string - preg->start;
1140                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1141                 return(1);
1142         } else
1143                 return(0);
1144 }
1145
1146 /**
1147  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1148  *
1149  * If 'nocase' is non-zero, does a case-insensitive match.
1150  *
1151  * Returns -1 on not found.
1152  */
1153 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1154 {
1155         const char *s = string;
1156         while (proglen && *s) {
1157                 int ch;
1158                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1159                 if (ch != *prog) {
1160                         return -1;
1161                 }
1162                 prog++;
1163                 s += n;
1164                 proglen--;
1165         }
1166         if (proglen == 0) {
1167                 return s - string;
1168         }
1169         return -1;
1170 }
1171
1172 /**
1173  * Searchs for 'c' in the range 'range'.
1174  *
1175  * Returns 1 if found, or 0 if not.
1176  */
1177 static int reg_range_find(const int *range, int c)
1178 {
1179         while (*range) {
1180                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1181                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1182                         return 1;
1183                 }
1184                 range += 2;
1185         }
1186         return 0;
1187 }
1188
1189 /**
1190  * Search for the character 'c' in the utf-8 string 'string'.
1191  *
1192  * If 'nocase' is set, the 'string' is assumed to be uppercase
1193  * and 'c' is converted to uppercase before matching.
1194  *
1195  * Returns the byte position in the string where the 'c' was found, or
1196  * NULL if not found.
1197  */
1198 static const char *str_find(const char *string, int c, int nocase)
1199 {
1200         if (nocase) {
1201                 /* The "string" should already be converted to uppercase */
1202                 c = utf8_upper(c);
1203         }
1204         while (*string) {
1205                 int ch;
1206                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1207                 if (c == ch) {
1208                         return string;
1209                 }
1210                 string += n;
1211         }
1212         return NULL;
1213 }
1214
1215 /**
1216  * Returns true if 'ch' is an end-of-line char.
1217  *
1218  * In REG_NEWLINE mode, \n is considered EOL in
1219  * addition to \0
1220  */
1221 static int reg_iseol(regex_t *preg, int ch)
1222 {
1223         if (preg->cflags & REG_NEWLINE) {
1224                 return ch == '\0' || ch == '\n';
1225         }
1226         else {
1227                 return ch == '\0';
1228         }
1229 }
1230
1231 static int regmatchsimplerepeat(regex_t *preg, const int *scan, int matchmin)
1232 {
1233         int nextch = '\0';
1234         const char *save;
1235         int no;
1236         int c;
1237
1238         int max = scan[2];
1239         int min = scan[3];
1240         const int *next = regnext(preg, scan);
1241
1242         /*
1243          * Lookahead to avoid useless match attempts
1244          * when we know what character comes next.
1245          */
1246         if (OP(next) == EXACTLY) {
1247                 nextch = *OPERAND(next);
1248         }
1249         save = preg->reginput;
1250         no = regrepeat(preg, scan + 5, max);
1251         if (no < min) {
1252                 return 0;
1253         }
1254         if (matchmin) {
1255                 /* from min up to no */
1256                 max = no;
1257                 no = min;
1258         }
1259         /* else from no down to min */
1260         while (1) {
1261                 if (matchmin) {
1262                         if (no > max) {
1263                                 break;
1264                         }
1265                 }
1266                 else {
1267                         if (no < min) {
1268                                 break;
1269                         }
1270                 }
1271                 preg->reginput = save + utf8_index(save, no);
1272                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1273                 /* If it could work, try it. */
1274                 if (reg_iseol(preg, nextch) || c == nextch) {
1275                         if (regmatch(preg, next)) {
1276                                 return(1);
1277                         }
1278                 }
1279                 if (matchmin) {
1280                         /* Couldn't or didn't, add one more */
1281                         no++;
1282                 }
1283                 else {
1284                         /* Couldn't or didn't -- back up. */
1285                         no--;
1286                 }
1287         }
1288         return(0);
1289 }
1290
1291 static int regmatchrepeat(regex_t *preg, int *scan, int matchmin)
1292 {
1293         const char *save;
1294
1295         int max = scan[2];
1296         int min = scan[3];
1297
1298         save = preg->reginput;
1299
1300         /* Have we reached min? */
1301         if (scan[4] < min) {
1302                 /* No, so get another one */
1303                 scan[4]++;
1304                 if (regmatch(preg, scan + 5)) {
1305                         return 1;
1306                 }
1307                 scan[4]--;
1308                 return 0;
1309         }
1310         if (scan[4] > max) {
1311                 return 0;
1312         }
1313
1314         if (matchmin) {
1315                 /* minimal, so try other branch first */
1316                 if (regmatch(preg, regnext(preg, scan))) {
1317                         return 1;
1318                 }
1319                 /* No, so try one more */
1320                 scan[4]++;
1321                 if (regmatch(preg, scan + 5)) {
1322                         return 1;
1323                 }
1324                 scan[4]--;
1325                 return 0;
1326         }
1327         /* maximal, so try this branch again */
1328         save = preg->reginput;
1329         if (scan[4] < max) {
1330                 scan[4]++;
1331                 if (regmatch(preg, scan + 5)) {
1332                         return 1;
1333                 }
1334                 scan[4]--;
1335         }
1336         /* At this point we are at max with no match. Back up by one and try the other branch */
1337         preg->reginput = save;
1338         int ret = regmatch(preg, regnext(preg, scan));
1339         return ret;
1340 }
1341
1342 /*
1343  - regmatch - main matching routine
1344  *
1345  * Conceptually the strategy is simple:  check to see whether the current
1346  * node matches, call self recursively to see whether the rest matches,
1347  * and then act accordingly.  In practice we make some effort to avoid
1348  * recursion, in particular by going through "ordinary" nodes (that don't
1349  * need to know whether the rest of the match failed) by a loop instead of
1350  * by recursion.
1351  */
1352 /* 0 failure, 1 success */
1353 static int regmatch(regex_t *preg, const int *prog)
1354 {
1355         const int *scan;        /* Current node. */
1356         const int *next;                /* Next node. */
1357
1358         scan = prog;
1359
1360 #ifdef DEBUG
1361         if (scan != NULL && regnarrate)
1362                 fprintf(stderr, "%s(\n", regprop(scan));
1363 #endif
1364         while (scan != NULL) {
1365                 int n;
1366                 int c;
1367 #ifdef DEBUG
1368                 if (regnarrate) {
1369                         //fprintf(stderr, "%s...\n", regprop(scan));
1370                         int op = OP(scan);
1371                         fprintf(stderr, "%2d{%02x}%s...\n", (int)(scan-preg->program), op, regprop(scan));      /* Where, what. */
1372                 }
1373 #endif
1374                 next = regnext(preg, scan);
1375                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1376
1377                 switch (OP(scan)) {
1378                 case BOL:
1379                         if (preg->reginput != preg->regbol)
1380                                 return(0);
1381                         break;
1382                 case EOL:
1383                         if (!reg_iseol(preg, c)) {
1384                                 return(0);
1385                         }
1386                         break;
1387                 case WORDA:
1388                         /* Must be looking at a letter, digit, or _ */
1389                         if ((!isalnum(UCHAR(c))) && c != '_')
1390                                 return(0);
1391                         /* Prev must be BOL or nonword */
1392                         if (preg->reginput > preg->regbol &&
1393                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1394                                 return(0);
1395                         break;
1396                 case WORDZ:
1397                         /* Can't match at BOL */
1398                         if (preg->reginput > preg->regbol) {
1399                                 /* Current must be EOL or nonword */
1400                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1401                                         c = preg->reginput[-1];
1402                                         /* Previous must be word */
1403                                         if (isalnum(UCHAR(c)) || c == '_') {
1404                                                 break;
1405                                         }
1406                                 }
1407                         }
1408                         /* No */
1409                         return(0);
1410
1411                 case ANY:
1412                         if (reg_iseol(preg, c))
1413                                 return 0;
1414                         preg->reginput += n;
1415                         break;
1416                 case EXACTLY: {
1417                                 const int *opnd;
1418                                 int len;
1419                                 int slen;
1420
1421                                 opnd = OPERAND(scan);
1422                                 len = str_int_len(opnd);
1423
1424                                 slen = prefix_cmp(opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1425                                 if (slen < 0) {
1426                                         return(0);
1427                                 }
1428                                 preg->reginput += slen;
1429                         }
1430                         break;
1431                 case ANYOF:
1432                         if (reg_iseol(preg, c) || reg_range_find(OPERAND(scan), c) == 0) {
1433                                 return(0);
1434                         }
1435                         preg->reginput += n;
1436                         break;
1437                 case ANYBUT:
1438                         if (reg_iseol(preg, c) || reg_range_find(OPERAND(scan), c) != 0) {
1439                                 return(0);
1440                         }
1441                         preg->reginput += n;
1442                         break;
1443                 case NOTHING:
1444                         break;
1445                 case BACK:
1446                         break;
1447                 case BRANCH: {
1448                                 const char *save;
1449
1450                                 if (OP(next) != BRANCH)         /* No choice. */
1451                                         next = OPERAND(scan);   /* Avoid recursion. */
1452                                 else {
1453                                         do {
1454                                                 save = preg->reginput;
1455                                                 if (regmatch(preg, OPERAND(scan))) {
1456                                                         return(1);
1457                                                 }
1458                                                 preg->reginput = save;
1459                                                 scan = regnext(preg, scan);
1460                                         } while (scan != NULL && OP(scan) == BRANCH);
1461                                         return(0);
1462                                         /* NOTREACHED */
1463                                 }
1464                         }
1465                         break;
1466                 case REP:
1467                 case REPMIN:
1468                         return regmatchsimplerepeat(preg, scan, OP(scan) == REPMIN);
1469
1470                 case REPX:
1471                 case REPXMIN:
1472                         return regmatchrepeat(preg, (int *)scan, OP(scan) == REPXMIN);
1473
1474                 case END:
1475                         return(1);      /* Success! */
1476                         break;
1477                 default:
1478                         if (OP(scan) >= OPEN+1 && OP(scan) < CLOSE_END) {
1479                                 const char *save;
1480
1481                                 save = preg->reginput;
1482
1483                                 if (regmatch(preg, next)) {
1484                                         int no;
1485                                         /*
1486                                          * Don't set startp if some later
1487                                          * invocation of the same parentheses
1488                                          * already has.
1489                                          */
1490                                         if (OP(scan) < CLOSE) {
1491                                                 no = OP(scan) - OPEN;
1492                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1493                                                         preg->pmatch[no].rm_so = save - preg->start;
1494                                                 }
1495                                         }
1496                                         else {
1497                                                 no = OP(scan) - CLOSE;
1498                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1499                                                         preg->pmatch[no].rm_eo = save - preg->start;
1500                                                 }
1501                                         }
1502                                         return(1);
1503                                 } else
1504                                         return(0);
1505                         }
1506                         return REG_ERR_INTERNAL;
1507                 }
1508
1509                 scan = next;
1510         }
1511
1512         /*
1513          * We get here only if there's trouble -- normally "case END" is
1514          * the terminating point.
1515          */
1516         return REG_ERR_INTERNAL;
1517 }
1518
1519 /*
1520  - regrepeat - repeatedly match something simple, report how many
1521  */
1522 static int regrepeat(regex_t *preg, const int *p, int max)
1523 {
1524         int count = 0;
1525         const char *scan;
1526         const int *opnd;
1527         int ch;
1528         int n;
1529
1530         scan = preg->reginput;
1531         opnd = OPERAND(p);
1532         switch (OP(p)) {
1533         case ANY:
1534                 /* No need to handle utf8 specially here */
1535                 while (!reg_iseol(preg, *scan) && count < max) {
1536                         count++;
1537                         scan++;
1538                 }
1539                 break;
1540         case EXACTLY:
1541                 while (count < max) {
1542                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1543                         if (*opnd != ch) {
1544                                 break;
1545                         }
1546                         count++;
1547                         scan += n;
1548                 }
1549                 break;
1550         case ANYOF:
1551                 while (count < max) {
1552                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1553                         if (reg_iseol(preg, ch) || reg_range_find(opnd, ch) == 0) {
1554                                 break;
1555                         }
1556                         count++;
1557                         scan += n;
1558                 }
1559                 break;
1560         case ANYBUT:
1561                 while (count < max) {
1562                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1563                         if (reg_iseol(preg, ch) || reg_range_find(opnd, ch) != 0) {
1564                                 break;
1565                         }
1566                         count++;
1567                         scan += n;
1568                 }
1569                 break;
1570         default:                /* Oh dear.  Called inappropriately. */
1571                 preg->err = REG_ERR_INTERNAL;
1572                 count = 0;      /* Best compromise. */
1573                 break;
1574         }
1575         preg->reginput = scan;
1576
1577         return(count);
1578 }
1579
1580 /*
1581  - regnext - dig the "next" pointer out of a node
1582  */
1583 static const int *regnext(regex_t *preg, const int *p )
1584 {
1585         int offset;
1586
1587         if (p == &regdummy)
1588                 return(NULL);
1589
1590         offset = NEXT(p);
1591
1592         if (offset == 0)
1593                 return(NULL);
1594
1595         if (OP(p) == BACK)
1596                 return(p-offset);
1597         else
1598                 return(p+offset);
1599 }
1600
1601 #ifdef DEBUG
1602
1603 /*
1604  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1605  */
1606 static void regdump(regex_t *preg)
1607 {
1608         const int *s;
1609         char op = EXACTLY;      /* Arbitrary non-END op. */
1610         const int *next;
1611         char buf[4];
1612
1613         if (preg->regcode == &regdummy)
1614                 return;
1615
1616         s = preg->program + 1;
1617         while (op != END && s < preg->regcode) {        /* While that wasn't END last time... */
1618                 op = OP(s);
1619                 printf("%2d{%02x}%s", (int)(s-preg->program), op, regprop(s));  /* Where, what. */
1620                 next = regnext(preg, s);
1621                 if (next == NULL)               /* Next ptr. */
1622                         printf("(0)");
1623                 else
1624                         printf("(%d)", (int)((s-preg->program)+(next-s)));
1625                 s += 2;
1626                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1627                         int max = s[0];
1628                         int min = s[1];
1629                         if (max == 65535) {
1630                                 printf("{%d,*}", min);
1631                         }
1632                         else {
1633                                 printf("{%d,%d}", min, max);
1634                         }
1635                         printf(" %d", s[2]);
1636                         s += 3;
1637                 }
1638                 else if (op == ANYOF || op == ANYBUT) {
1639                         /* set of ranges */
1640
1641                         while (*s) {
1642                                 int len = *s++;
1643                                 int first = *s++;
1644                                 buf[utf8_fromunicode(buf, first)] = 0;
1645                                 printf("%s", buf);
1646                                 if (len > 1) {
1647                                         buf[utf8_fromunicode(buf, first + len - 1)] = 0;
1648                                         printf("-%s", buf);
1649                                 }
1650                         }
1651                         s++;
1652                 }
1653                 else if (op == EXACTLY) {
1654                         /* Literal string, where present. */
1655
1656                         while (*s) {
1657                                 buf[utf8_fromunicode(buf, *s)] = 0;
1658                                 printf("%s", buf);
1659                                 s++;
1660                         }
1661                         s++;
1662                 }
1663                 putchar('\n');
1664         }
1665
1666         if (op == END) {
1667                 /* Header fields of interest. */
1668                 if (preg->regstart) {
1669                         buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1670                         printf("start '%s' ", buf);
1671                 }
1672                 if (preg->reganch)
1673                         printf("anchored ");
1674                 if (preg->regmust != NULL) {
1675                         int i;
1676                         printf("must have:");
1677                         for (i = 0; i < preg->regmlen; i++) {
1678                                 putchar(preg->regmust[i]);
1679                         }
1680                         putchar('\n');
1681                 }
1682         }
1683         printf("\n");
1684 }
1685
1686 /*
1687  - regprop - printable representation of opcode
1688  */
1689 static const char *regprop( const int *op )
1690 {
1691         char *p;
1692         static char buf[50];
1693
1694         (void) strcpy(buf, ":");
1695
1696         switch (OP(op)) {
1697         case BOL:
1698                 p = "BOL";
1699                 break;
1700         case EOL:
1701                 p = "EOL";
1702                 break;
1703         case ANY:
1704                 p = "ANY";
1705                 break;
1706         case ANYOF:
1707                 p = "ANYOF";
1708                 break;
1709         case ANYBUT:
1710                 p = "ANYBUT";
1711                 break;
1712         case BRANCH:
1713                 p = "BRANCH";
1714                 break;
1715         case EXACTLY:
1716                 p = "EXACTLY";
1717                 break;
1718         case NOTHING:
1719                 p = "NOTHING";
1720                 break;
1721         case BACK:
1722                 p = "BACK";
1723                 break;
1724         case END:
1725                 p = "END";
1726                 break;
1727         case REP:
1728                 p = "REP";
1729                 break;
1730         case REPMIN:
1731                 p = "REPMIN";
1732                 break;
1733         case REPX:
1734                 p = "REPX";
1735                 break;
1736         case REPXMIN:
1737                 p = "REPXMIN";
1738                 break;
1739         case WORDA:
1740                 p = "WORDA";
1741                 break;
1742         case WORDZ:
1743                 p = "WORDZ";
1744                 break;
1745         default:
1746                 if (OP(op) >= OPEN && OP(op) < CLOSE) {
1747                         sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1748                 }
1749                 else if (OP(op) >= CLOSE && OP(op) < CLOSE_END) {
1750                         sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1751                 }
1752                 else {
1753                         abort();
1754                 }
1755                 p = NULL;
1756                 break;
1757         }
1758         if (p != NULL)
1759                 (void) strcat(buf, p);
1760         return(buf);
1761 }
1762 #endif
1763
1764 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1765 {
1766         static const char *error_strings[] = {
1767                 "success",
1768                 "no match",
1769                 "bad pattern",
1770                 "null argument",
1771                 "unknown error",
1772                 "too big",
1773                 "out of memory",
1774                 "too many ()",
1775                 "parentheses () not balanced",
1776                 "braces {} not balanced",
1777                 "invalid repetition count(s)",
1778                 "extra characters",
1779                 "*+ of empty atom",
1780                 "nested count",
1781                 "internal error",
1782                 "count follows nothing",
1783                 "trailing backslash",
1784                 "corrupted program",
1785                 "contains null char",
1786         };
1787         const char *err;
1788
1789         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1790                 err = "Bad error code";
1791         }
1792         else {
1793                 err = error_strings[errcode];
1794         }
1795
1796         return snprintf(errbuf, errbuf_size, "%s", err);
1797 }
1798
1799 void regfree(regex_t *preg)
1800 {
1801         free(preg->program);
1802 }
1803
1804 #endif