src/regexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *  Copyright (c) 1986 by University of Toronto.
   5  *  Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *  Permission is granted to anyone to use this software for any
   8  *  purpose on any computer system, and to redistribute it freely,
   9  *  subject to the following restrictions:
  10  *
  11  *  1. The author is not responsible for the consequences of use of
  12  *    this software, no matter how awful, even if they arise
  13  *    from defects in it.
  14  *
  15  *  2. The origin of this software must not be misrepresented, either
  16  *    by explicit claim or by omission.
  17  *
  18  *  3. Altered versions must be plainly marked as such, and must not
  19  *    be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  * Beware that some of this code is subtly aware of the way operator
  43  * precedence is structured in regular expressions.  Serious changes in
  44  * regular-expression syntax might require a total rethink.
  45  */
  46 #include "regexp.h"
  47 #include <stdio.h>
  48 #include <ctype.h>
  49 #ifndef ultrix
  50 #include <stdlib.h>
  51 #endif
  52 #include <string.h>
  53
  54 /*
  55  * The "internal use only" fields in regexp.h are present to pass info from
  56  * compile to execute that permits the execute phase to run lots faster on
  57  * simple cases.  They are:
  58  *
  59  * regstart char that must begin a match; '\0' if none obvious
  60  * reganch  is the match anchored (at beginning-of-line only)?
  61  * regmust  string (pointer into program) that match must include, or NULL
  62  * regmlen  length of regmust string
  63  *
  64  * Regstart and reganch permit very fast decisions on suitable starting points
  65  * for a match, cutting down the work a lot.  Regmust permits fast rejection
  66  * of lines that cannot possibly match.  The regmust tests are costly enough
  67  * that regcomp() supplies a regmust only if the r.e. contains something
  68  * potentially expensive (at present, the only such thing detected is * or +
  69  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
  70  * supplied because the test in regexec() needs it and regcomp() is computing
  71  * it anyway.
  72  */
  73
  74 /*
  75  * Structure for regexp "program".  This is essentially a linear encoding
  76  * of a nondeterministic finite-state machine (aka syntax charts or
  77  * "railroad normal form" in parsing technology).  Each node is an opcode
  78  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  79  * all nodes except BRANCH implement concatenation; a "next" pointer with
  80  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  81  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  82  * opposed to a collection of them) is never concatenated with anything
  83  * because of operator precedence.)  The operand of some types of node is
  84  * a literal string; for others, it is a node leading into a sub-FSM.  In
  85  * particular, the operand of a BRANCH node is the first node of the branch.
  86  * (NB this is *not* a tree structure:  the tail of the branch connects
  87  * to the thing following the set of BRANCHes.)  The opcodes are:
  88  */
  89
  90 /* definition number  opnd? meaning */
  91 #define END     0 /* no End of program. */
  92 #define BOL     1 /* no Match "" at beginning of line. */
  93 #define EOL     2 /* no Match "" at end of line. */
  94 #define ANY     3 /* no Match any one character. */
  95 #define ANYOF   4 /* str  Match any character in this string. */
  96 #define ANYBUT  5 /* str  Match any character not in this string. */
  97 #define BRANCH  6 /* node Match this alternative, or the next... */
  98 #define BACK    7 /* no Match "", "next" ptr points backward. */
  99 #define EXACTLY 8 /* str  Match this string. */
 100 #define NOTHING 9 /* no Match empty string. */
 101 #define STAR   10 /* node Match this (simple) thing 0 or more times. */
 102 #define PLUS   11 /* node Match this (simple) thing 1 or more times. */
 103 #define WORDA  12 /* no Match "" at wordchar, where prev is nonword */
 104 #define WORDZ  13 /* no Match "" at nonwordchar, where prev is word */
 105 #define OPEN   20 /* no Mark this point in input as start of #n. */
 106   /*  OPEN+1 is number 1, etc. */
 107 #define CLOSE  30 /* no Analogous to OPEN. */
 108
 109 /*
 110  * Opcode notes:
 111  *
 112  * BRANCH The set of branches constituting a single choice are hooked
 113  *    together with their "next" pointers, since precedence prevents
 114  *    anything being concatenated to any individual branch.  The
 115  *    "next" pointer of the last BRANCH in a choice points to the
 116  *    thing following the whole choice.  This is also where the
 117  *    final "next" pointer of each individual branch points; each
 118  *    branch starts with the operand node of a BRANCH node.
 119  *
 120  * BACK   Normal "next" pointers all implicitly point forward; BACK
 121  *    exists to make loop structures possible.
 122  *
 123  * STAR,PLUS  '?', and complex '*' and '+', are implemented as circular
 124  *    BRANCH structures using BACK.  Simple cases (one character
 125  *    per match) are implemented with STAR and PLUS for speed
 126  *    and to minimize recursive plunges.
 127  *
 128  * OPEN,CLOSE ...are numbered at compile time.
 129  */
 130
 131 /*
 132  * A node is one char of opcode followed by two chars of "next" pointer.
 133  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 134  * value is a positive offset from the opcode of the node containing it.
 135  * An operand, if any, simply follows the node.  (Note that much of the
 136  * code generation knows about this implicit relationship.)
 137  *
 138  * Using two bytes for the "next" pointer is vast overkill for most things,
 139  * but allows patterns to get big without disasters.
 140  */
 141 #define OP(p) (*(p))
 142 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
 143 #define OPERAND(p)  ((p) + 3)
 144
 145 /*
 146  * See regmagic.h for one further detail of program structure.
 147  */
 148
 149
 150 /*
 151  * Utility definitions.
 152  */
 153 #ifndef CHARBITS
 154 #define UCHARAT(p)  ((int)*(unsigned char *)(p))
 155 #else
 156 #define UCHARAT(p)  ((int)*(p)&CHARBITS)
 157 #endif
 158
 159 #define FAIL(m) { regerror(m); return(NULL); }
 160 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
 161
 162 /*
 163  * Flags to be passed up and down.
 164  */
 165 #define HASWIDTH  01  /* Known never to match null string. */
 166 #define SIMPLE    02  /* Simple enough to be STAR/PLUS operand. */
 167 #define SPSTART   04  /* Starts with * or +. */
 168 #define WORST   0 /* Worst case. */
 169
 170 /*
 171  * Global work variables for regcomp().
 172  */
 173 static char *regparse;    /* Input-scan pointer. */
 174 static int regnpar;   /* () count. */
 175 static char regdummy;
 176 static char *regcode;   /* Code-emit pointer; &regdummy = don't. */
 177 static long regsize;    /* Code size. */
 178
 179 /*
 180  * Forward declarations for regcomp()'s friends.
 181  */
 182 #ifndef STATIC
 183 #define STATIC  static
 184 #endif
 185 STATIC char *reg( int paren, int *flagp );
 186 STATIC char *regbranch( int *flagp );
 187 STATIC char *regpiece( int *flagp );
 188 STATIC char *regatom( int *flagp );
 189 STATIC char *regnode( int op );
 190 STATIC char *regnext( register char *p );
 191 STATIC void regc( int b );
 192 STATIC void reginsert( char op, char *opnd );
 193 STATIC void regtail( char *p, char *val );
 194 STATIC void regoptail( char *p, char *val );
 195 #ifdef STRCSPN
 196 STATIC int strcspn();
 197 #endif
 198
 199 /*
 200  - regcomp - compile a regular expression into internal code
 201  *
 202  * We can't allocate space until we know how big the compiled form will be,
 203  * but we can't compile it (and thus know how big it is) until we've got a
 204  * place to put the code.  So we cheat:  we compile it twice, once with code
 205  * generation turned off and size counting turned on, and once "for real".
 206  * This also means that we don't allocate space until we are sure that the
 207  * thing really will compile successfully, and we never have to move the
 208  * code and thus invalidate pointers into it.  (Note that it has to be in
 209  * one piece because free() must be able to free it all.)
 210  *
 211  * Beware that the optimization-preparation code in here knows about some
 212  * of the structure of the compiled regexp.
 213  */
 214 regexp *
 215 regcomp( const char *exp )
 216 {
 217   register regexp *r;
 218   register char *scan;
 219   register char *longest;
 220   register unsigned len;
 221   int flags;
 222
 223   if (exp == NULL)
 224     FAIL("NULL argument");
 225
 226   /* First pass: determine size, legality. */
 227 #ifdef notdef
 228   if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
 229 #endif
 230   regparse = (char *)exp;
 231   regnpar = 1;
 232   regsize = 0L;
 233   regcode = &regdummy;
 234   regc(MAGIC);
 235   if (reg(0, &flags) == NULL)
 236     return(NULL);
 237
 238   /* Small enough for pointer-storage convention? */
 239   if (regsize >= 32767L)    /* Probably could be 65535L. */
 240     FAIL("regexp too big");
 241
 242   /* Allocate space. */
 243   r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
 244   if (r == NULL)
 245     FAIL("out of space");
 246
 247   /* Second pass: emit code. */
 248   regparse = (char *)exp;
 249   regnpar = 1;
 250   regcode = r->program;
 251   regc(MAGIC);
 252   if (reg(0, &flags) == NULL)
 253     return(NULL);
 254
 255   /* Dig out information for optimizations. */
 256   r->regstart = '\0'; /* Worst-case defaults. */
 257   r->reganch = 0;
 258   r->regmust = NULL;
 259   r->regmlen = 0;
 260   scan = r->program+1;      /* First BRANCH. */
 261   if (OP(regnext(scan)) == END) {   /* Only one top-level choice. */
 262     scan = OPERAND(scan);
 263
 264     /* Starting-point info. */
 265     if (OP(scan) == EXACTLY)
 266       r->regstart = *OPERAND(scan);
 267     else if (OP(scan) == BOL)
 268       r->reganch++;
 269
 270     /*
 271      * If there's something expensive in the r.e., find the
 272      * longest literal string that must appear and make it the
 273      * regmust.  Resolve ties in favor of later strings, since
 274      * the regstart check works with the beginning of the r.e.
 275      * and avoiding duplication strengthens checking.  Not a
 276      * strong reason, but sufficient in the absence of others.
 277      */
 278     if (flags&SPSTART) {
 279       longest = NULL;
 280       len = 0;
 281       for (; scan != NULL; scan = regnext(scan))
 282         if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
 283           longest = OPERAND(scan);
 284           len = strlen(OPERAND(scan));
 285         }
 286       r->regmust = longest;
 287       r->regmlen = len;
 288     }
 289   }
 290
 291   return(r);
 292 }
 293
 294 /*
 295  - reg - regular expression, i.e. main body or parenthesized thing
 296  *
 297  * Caller must absorb opening parenthesis.
 298  *
 299  * Combining parenthesis handling with the base level of regular expression
 300  * is a trifle forced, but the need to tie the tails of the branches to what
 301  * follows makes it hard to avoid.
 302  */
 303 static char *
 304 reg(
 305   int paren,      /* Parenthesized? */
 306   int *flagp )
 307 {
 308   register char *ret;
 309   register char *br;
 310   register char *ender;
 311   register int parno = 0; /*k8*/
 312   int flags;
 313
 314   *flagp = HASWIDTH;  /* Tentatively. */
 315
 316   /* Make an OPEN node, if parenthesized. */
 317   if (paren) {
 318     if (regnpar >= NSUBEXP)
 319       FAIL("too many ()");
 320     parno = regnpar;
 321     regnpar++;
 322     ret = regnode(OPEN+parno);
 323   } else
 324     ret = NULL;
 325
 326   /* Pick up the branches, linking them together. */
 327   br = regbranch(&flags);
 328   if (br == NULL)
 329     return(NULL);
 330   if (ret != NULL)
 331     regtail(ret, br); /* OPEN -> first. */
 332   else
 333     ret = br;
 334   if (!(flags&HASWIDTH))
 335     *flagp &= ~HASWIDTH;
 336   *flagp |= flags&SPSTART;
 337   while (*regparse == '|' || *regparse == '\n') {
 338     regparse++;
 339     br = regbranch(&flags);
 340     if (br == NULL)
 341       return(NULL);
 342     regtail(ret, br); /* BRANCH -> BRANCH. */
 343     if (!(flags&HASWIDTH))
 344       *flagp &= ~HASWIDTH;
 345     *flagp |= flags&SPSTART;
 346   }
 347
 348   /* Make a closing node, and hook it on the end. */
 349   ender = regnode((paren) ? CLOSE+parno : END);
 350   regtail(ret, ender);
 351
 352   /* Hook the tails of the branches to the closing node. */
 353   for (br = ret; br != NULL; br = regnext(br))
 354     regoptail(br, ender);
 355
 356   /* Check for proper termination. */
 357   if (paren && *regparse++ != ')') {
 358     FAIL("unmatched ()");
 359   } else if (!paren && *regparse != '\0') {
 360     if (*regparse == ')') {
 361       FAIL("unmatched ()");
 362     } else
 363       FAIL("junk on end");  /* "Can't happen". */
 364     /* NOTREACHED */
 365   }
 366
 367   return(ret);
 368 }
 369
 370 /*
 371  - regbranch - one alternative of an | operator
 372  *
 373  * Implements the concatenation operator.
 374  */
 375 static char *
 376 regbranch( int *flagp )
 377 {
 378   register char *ret;
 379   register char *chain;
 380   register char *latest;
 381   int flags;
 382
 383   *flagp = WORST;   /* Tentatively. */
 384
 385   ret = regnode(BRANCH);
 386   chain = NULL;
 387   while (*regparse != '\0' && *regparse != ')' &&
 388          *regparse != '\n' && *regparse != '|') {
 389     latest = regpiece(&flags);
 390     if (latest == NULL)
 391       return(NULL);
 392     *flagp |= flags&HASWIDTH;
 393     if (chain == NULL)  /* First piece. */
 394       *flagp |= flags&SPSTART;
 395     else
 396       regtail(chain, latest);
 397     chain = latest;
 398   }
 399   if (chain == NULL)  /* Loop ran zero times. */
 400     (void) regnode(NOTHING);
 401
 402   return(ret);
 403 }
 404
 405 /*
 406  - regpiece - something followed by possible [*+?]
 407  *
 408  * Note that the branching code sequences used for ? and the general cases
 409  * of * and + are somewhat optimized:  they use the same NOTHING node as
 410  * both the endmarker for their branch list and the body of the last branch.
 411  * It might seem that this node could be dispensed with entirely, but the
 412  * endmarker role is not redundant.
 413  */
 414 static char *
 415 regpiece( int *flagp )
 416 {
 417   register char *ret;
 418   register char op;
 419   register char *next;
 420   int flags;
 421
 422   ret = regatom(&flags);
 423   if (ret == NULL)
 424     return(NULL);
 425
 426   op = *regparse;
 427   if (!ISMULT(op)) {
 428     *flagp = flags;
 429     return(ret);
 430   }
 431
 432   if (!(flags&HASWIDTH) && op != '?')
 433     FAIL("*+ operand could be empty");
 434   *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
 435
 436   if (op == '*' && (flags&SIMPLE))
 437     reginsert(STAR, ret);
 438   else if (op == '*') {
 439     /* Emit x* as (x&|), where & means "self". */
 440     reginsert(BRANCH, ret);     /* Either x */
 441     regoptail(ret, regnode(BACK));    /* and loop */
 442     regoptail(ret, ret);      /* back */
 443     regtail(ret, regnode(BRANCH));    /* or */
 444     regtail(ret, regnode(NOTHING));   /* null. */
 445   } else if (op == '+' && (flags&SIMPLE)) {
 446     reginsert(PLUS, ret);
 447   } else if (op == '+') {
 448     /* Emit x+ as x(&|), where & means "self". */
 449     next = regnode(BRANCH);     /* Either */
 450     regtail(ret, next);
 451     regtail(regnode(BACK), ret);    /* loop back */
 452     regtail(next, regnode(BRANCH));   /* or */
 453     regtail(ret, regnode(NOTHING));   /* null. */
 454   } else if (op == '?') {
 455     /* Emit x? as (x|) */
 456     reginsert(BRANCH, ret);     /* Either x */
 457     regtail(ret, regnode(BRANCH));    /* or */
 458     next = regnode(NOTHING);    /* null. */
 459     regtail(ret, next);
 460     regoptail(ret, next);
 461   }
 462   regparse++;
 463   if (ISMULT(*regparse))
 464     FAIL("nested *?+");
 465
 466   return(ret);
 467 }
 468
 469 /*
 470  - regatom - the lowest level
 471  *
 472  * Optimization:  gobbles an entire sequence of ordinary characters so that
 473  * it can turn them into a single node, which is smaller to store and
 474  * faster to run.  Backslashed characters are exceptions, each becoming a
 475  * separate node; the code is simpler that way and it's not worth fixing.
 476  */
 477 static char *
 478 regatom( int *flagp )
 479 {
 480   register char *ret;
 481   int flags;
 482
 483   *flagp = WORST;   /* Tentatively. */
 484
 485   switch (*regparse++) {
 486   /* FIXME: these chars only have meaning at beg/end of pat? */
 487   case '^':
 488     ret = regnode(BOL);
 489     break;
 490   case '$':
 491     ret = regnode(EOL);
 492     break;
 493   case '.':
 494     ret = regnode(ANY);
 495     *flagp |= HASWIDTH|SIMPLE;
 496     break;
 497   case '[': {
 498       register int classr;
 499       register int classend;
 500
 501       if (*regparse == '^') { /* Complement of range. */
 502         ret = regnode(ANYBUT);
 503         regparse++;
 504       } else
 505         ret = regnode(ANYOF);
 506       if (*regparse == ']' || *regparse == '-')
 507         regc(*regparse++);
 508       while (*regparse != '\0' && *regparse != ']') {
 509         if (*regparse == '-') {
 510           regparse++;
 511           if (*regparse == ']' || *regparse == '\0')
 512             regc('-');
 513           else {
 514             classr = UCHARAT(regparse-2)+1;
 515             classend = UCHARAT(regparse);
 516             if (classr > classend+1)
 517               FAIL("invalid [] range");
 518             for (; classr <= classend; classr++)
 519               regc(classr);
 520             regparse++;
 521           }
 522         } else
 523           regc(*regparse++);
 524       }
 525       regc('\0');
 526       if (*regparse != ']')
 527         FAIL("unmatched []");
 528       regparse++;
 529       *flagp |= HASWIDTH|SIMPLE;
 530     }
 531     break;
 532   case '(':
 533     ret = reg(1, &flags);
 534     if (ret == NULL)
 535       return(NULL);
 536     *flagp |= flags&(HASWIDTH|SPSTART);
 537     break;
 538   case '\0':
 539   case '|':
 540   case '\n':
 541   case ')':
 542     FAIL("internal urp"); /* Supposed to be caught earlier. */
 543     break;
 544   case '?':
 545   case '+':
 546   case '*':
 547     FAIL("?+* follows nothing");
 548     break;
 549   case '\\':
 550     switch (*regparse++) {
 551     case '\0':
 552       FAIL("trailing \\");
 553       break;
 554     case '<':
 555       ret = regnode(WORDA);
 556       break;
 557     case '>':
 558       ret = regnode(WORDZ);
 559       break;
 560     /* FIXME: Someday handle \1, \2, ... */
 561     default:
 562       /* Handle general quoted chars in exact-match routine */
 563       goto de_fault;
 564     }
 565     break;
 566   de_fault:
 567   default:
 568     /*
 569      * Encode a string of characters to be matched exactly.
 570      *
 571      * This is a bit tricky due to quoted chars and due to
 572      * '*', '+', and '?' taking the SINGLE char previous
 573      * as their operand.
 574      *
 575      * On entry, the char at regparse[-1] is going to go
 576      * into the string, no matter what it is.  (It could be
 577      * following a \ if we are entered from the '\' case.)
 578      *
 579      * Basic idea is to pick up a good char in  ch  and
 580      * examine the next char.  If it's *+? then we twiddle.
 581      * If it's \ then we frozzle.  If it's other magic char
 582      * we push  ch  and terminate the string.  If none of the
 583      * above, we push  ch  on the string and go around again.
 584      *
 585      *  regprev  is used to remember where "the current char"
 586      * starts in the string, if due to a *+? we need to back
 587      * up and put the current char in a separate, 1-char, string.
 588      * When  regprev  is NULL,  ch  is the only char in the
 589      * string; this is used in *+? handling, and in setting
 590      * flags |= SIMPLE at the end.
 591      */
 592     {
 593       char *regprev;
 594       register char ch;
 595
 596       regparse--;     /* Look at cur char */
 597       ret = regnode(EXACTLY);
 598       for ( regprev = 0 ; ; ) {
 599         ch = *regparse++; /* Get current char */
 600         switch (*regparse) {  /* look at next one */
 601
 602         default:
 603           regc(ch); /* Add cur to string */
 604           break;
 605
 606         case '.': case '[': case '(':
 607         case ')': case '|': case '\n':
 608         case '$': case '^':
 609         case '\0':
 610         /* FIXME, $ and ^ should not always be magic */
 611         magic:
 612           regc(ch); /* dump cur char */
 613           goto done;  /* and we are done */
 614
 615         case '?': case '+': case '*':
 616           if (!regprev)   /* If just ch in str, */
 617             goto magic; /* use it */
 618           /* End mult-char string one early */
 619           regparse = regprev; /* Back up parse */
 620           goto done;
 621
 622         case '\\':
 623           regc(ch); /* Cur char OK */
 624           switch (regparse[1]){ /* Look after \ */
 625           case '\0':
 626           case '<':
 627           case '>':
 628           /* FIXME: Someday handle \1, \2, ... */
 629             goto done; /* Not quoted */
 630           default:
 631             /* Backup point is \, scan               * point is after it. */
 632             regprev = regparse;
 633             regparse++;
 634             continue; /* NOT break; */
 635           }
 636         }
 637         regprev = regparse; /* Set backup point */
 638       }
 639     done:
 640       regc('\0');
 641       *flagp |= HASWIDTH;
 642       if (!regprev)   /* One char? */
 643         *flagp |= SIMPLE;
 644     }
 645     break;
 646   }
 647
 648   return(ret);
 649 }
 650
 651 /*
 652  - regnode - emit a node
 653  */
 654 static char *     /* Location. */
 655 regnode( int op )
 656 {
 657   register char *ret;
 658   register char *ptr;
 659
 660   ret = regcode;
 661   if (ret == &regdummy) {
 662     regsize += 3;
 663     return(ret);
 664   }
 665
 666   ptr = ret;
 667   *ptr++ = op;
 668   *ptr++ = '\0';    /* Null "next" pointer. */
 669   *ptr++ = '\0';
 670   regcode = ptr;
 671
 672   return(ret);
 673 }
 674
 675 /*
 676  - regc - emit (if appropriate) a byte of code
 677  */
 678 static void
 679 regc( int b )
 680 {
 681   if (regcode != &regdummy)
 682     *regcode++ = b;
 683   else
 684     regsize++;
 685 }
 686
 687 /*
 688  - reginsert - insert an operator in front of already-emitted operand
 689  *
 690  * Means relocating the operand.
 691  */
 692 static void
 693 reginsert(
 694   char op,
 695   char *opnd )
 696 {
 697   register char *src;
 698   register char *dst;
 699   register char *place;
 700
 701   if (regcode == &regdummy) {
 702     regsize += 3;
 703     return;
 704   }
 705
 706   src = regcode;
 707   regcode += 3;
 708   dst = regcode;
 709   while (src > opnd)
 710     *--dst = *--src;
 711
 712   place = opnd;   /* Op node, where operand used to be. */
 713   *place++ = op;
 714   *place++ = '\0';
 715   *place++ = '\0';
 716 }
 717
 718 /*
 719  - regtail - set the next-pointer at the end of a node chain
 720  */
 721 static void
 722 regtail(
 723   char *p,
 724   char *val )
 725 {
 726   register char *scan;
 727   register char *temp;
 728   register int offset;
 729
 730   if (p == &regdummy)
 731     return;
 732
 733   /* Find last node. */
 734   scan = p;
 735   for (;;) {
 736     temp = regnext(scan);
 737     if (temp == NULL)
 738       break;
 739     scan = temp;
 740   }
 741
 742   if (OP(scan) == BACK)
 743     offset = scan - val;
 744   else
 745     offset = val - scan;
 746   *(scan+1) = (offset>>8)&0377;
 747   *(scan+2) = offset&0377;
 748 }
 749
 750 /*
 751  - regoptail - regtail on operand of first argument; nop if operandless
 752  */
 753
 754 static void
 755 regoptail(
 756   char *p,
 757   char *val )
 758 {
 759   /* "Operandless" and "op != BRANCH" are synonymous in practice. */
 760   if (p == NULL || p == &regdummy || OP(p) != BRANCH)
 761     return;
 762   regtail(OPERAND(p), val);
 763 }
 764
 765 /*
 766  * regexec and friends
 767  */
 768
 769 /*
 770  * Global work variables for regexec().
 771  */
 772 static const char *reginput;  /* String-input pointer. */
 773 static char *regbol;    /* Beginning of input, for ^ check. */
 774 static const char **regstartp;  /* Pointer to startp array. */
 775 static const char **regendp;  /* Ditto for endp. */
 776
 777 /*
 778  * Forwards.
 779  */
 780 STATIC int regtry( regexp *prog, const char *string );
 781 STATIC int regmatch( char *prog );
 782 STATIC int regrepeat( char *p );
 783
 784 #ifdef DEBUG
 785 int regnarrate = 0;
 786 void regdump();
 787 STATIC char *regprop();
 788 #endif
 789
 790 /*
 791  - regexec - match a regexp against a string
 792  */
 793 int
 794 regexec(
 795   register regexp *prog,
 796   register const char *string )
 797 {
 798   register char *s;
 799
 800   /* Be paranoid... */
 801   if (prog == NULL || string == NULL) {
 802     regerror("NULL parameter");
 803     return(0);
 804   }
 805
 806   /* Check validity of program. */
 807   if (UCHARAT(prog->program) != MAGIC) {
 808     regerror("corrupted program");
 809     return(0);
 810   }
 811
 812   /* If there is a "must appear" string, look for it. */
 813   if (prog->regmust != NULL) {
 814     s = (char *)string;
 815     while ((s = strchr(s, prog->regmust[0])) != NULL) {
 816       if (strncmp(s, prog->regmust, prog->regmlen) == 0)
 817         break;  /* Found it. */
 818       s++;
 819     }
 820     if (s == NULL)  /* Not present. */
 821       return(0);
 822   }
 823
 824   /* Mark beginning of line for ^ . */
 825   regbol = (char *)string;
 826
 827   /* Simplest case:  anchored match need be tried only once. */
 828   if (prog->reganch)
 829     return(regtry(prog, string));
 830
 831   /* Messy cases:  unanchored match. */
 832   s = (char *)string;
 833   if (prog->regstart != '\0')
 834     /* We know what char it must start with. */
 835     while ((s = strchr(s, prog->regstart)) != NULL) {
 836       if (regtry(prog, s))
 837         return(1);
 838       s++;
 839     }
 840   else
 841     /* We don't -- general case. */
 842     do {
 843       if (regtry(prog, s))
 844         return(1);
 845     } while (*s++ != '\0');
 846
 847   /* Failure. */
 848   return(0);
 849 }
 850
 851 /*
 852  - regtry - try match at specific point
 853  */
 854 static int      /* 0 failure, 1 success */
 855 regtry(
 856   regexp *prog,
 857   const char *string )
 858 {
 859   register int i;
 860   register const char **sp;
 861   register const char **ep;
 862
 863   reginput = string;
 864   regstartp = prog->startp;
 865   regendp = prog->endp;
 866
 867   sp = prog->startp;
 868   ep = prog->endp;
 869   for (i = NSUBEXP; i > 0; i--) {
 870     *sp++ = NULL;
 871     *ep++ = NULL;
 872   }
 873   if (regmatch(prog->program + 1)) {
 874     prog->startp[0] = string;
 875     prog->endp[0] = reginput;
 876     return(1);
 877   } else
 878     return(0);
 879 }
 880
 881 /*
 882  - regmatch - main matching routine
 883  *
 884  * Conceptually the strategy is simple:  check to see whether the current
 885  * node matches, call self recursively to see whether the rest matches,
 886  * and then act accordingly.  In practice we make some effort to avoid
 887  * recursion, in particular by going through "ordinary" nodes (that don't
 888  * need to know whether the rest of the match failed) by a loop instead of
 889  * by recursion.
 890  */
 891 static int      /* 0 failure, 1 success */
 892 regmatch( char *prog )
 893 {
 894   register char *scan;  /* Current node. */
 895   char *next;   /* Next node. */
 896
 897   scan = prog;
 898 #ifdef DEBUG
 899   if (scan != NULL && regnarrate)
 900     fprintf(stderr, "%s(\n", regprop(scan));
 901 #endif
 902   while (scan != NULL) {
 903 #ifdef DEBUG
 904     if (regnarrate)
 905       fprintf(stderr, "%s...\n", regprop(scan));
 906 #endif
 907     next = regnext(scan);
 908
 909     switch (OP(scan)) {
 910     case BOL:
 911       if (reginput != regbol)
 912         return(0);
 913       break;
 914     case EOL:
 915       if (*reginput != '\0')
 916         return(0);
 917       break;
 918     case WORDA:
 919       /* Must be looking at a letter, digit, or _ */
 920       if ((!isalnum(*reginput)) && *reginput != '_')
 921         return(0);
 922       /* Prev must be BOL or nonword */
 923       if (reginput > regbol &&
 924           (isalnum(reginput[-1]) || reginput[-1] == '_'))
 925         return(0);
 926       break;
 927     case WORDZ:
 928       /* Must be looking at non letter, digit, or _ */
 929       if (isalnum(*reginput) || *reginput == '_')
 930         return(0);
 931       /* We don't care what the previous char was */
 932       break;
 933     case ANY:
 934       if (*reginput == '\0')
 935         return(0);
 936       reginput++;
 937       break;
 938     case EXACTLY: {
 939         register int len;
 940         register char *opnd;
 941
 942         opnd = OPERAND(scan);
 943         /* Inline the first character, for speed. */
 944         if (*opnd != *reginput)
 945           return(0);
 946         len = strlen(opnd);
 947         if (len > 1 && strncmp(opnd, reginput, len) != 0)
 948           return(0);
 949         reginput += len;
 950       }
 951       break;
 952     case ANYOF:
 953       if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
 954         return(0);
 955       reginput++;
 956       break;
 957     case ANYBUT:
 958       if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
 959         return(0);
 960       reginput++;
 961       break;
 962     case NOTHING:
 963       break;
 964     case BACK:
 965       break;
 966     case OPEN+1:
 967     case OPEN+2:
 968     case OPEN+3:
 969     case OPEN+4:
 970     case OPEN+5:
 971     case OPEN+6:
 972     case OPEN+7:
 973     case OPEN+8:
 974     case OPEN+9: {
 975         register int no;
 976         register const char *save;
 977
 978         no = OP(scan) - OPEN;
 979         save = reginput;
 980
 981         if (regmatch(next)) {
 982           /*
 983            * Don't set startp if some later
 984            * invocation of the same parentheses
 985            * already has.
 986            */
 987           if (regstartp[no] == NULL)
 988             regstartp[no] = save;
 989           return(1);
 990         } else
 991           return(0);
 992       }
 993       break;
 994     case CLOSE+1:
 995     case CLOSE+2:
 996     case CLOSE+3:
 997     case CLOSE+4:
 998     case CLOSE+5:
 999     case CLOSE+6:
1000     case CLOSE+7:
1001     case CLOSE+8:
1002     case CLOSE+9: {
1003         register int no;
1004         register const char *save;
1005
1006         no = OP(scan) - CLOSE;
1007         save = reginput;
1008
1009         if (regmatch(next)) {
1010           /*
1011            * Don't set endp if some later
1012            * invocation of the same parentheses
1013            * already has.
1014            */
1015           if (regendp[no] == NULL)
1016             regendp[no] = save;
1017           return(1);
1018         } else
1019           return(0);
1020       }
1021       break;
1022     case BRANCH: {
1023         register const char *save;
1024
1025         if (OP(next) != BRANCH)   /* No choice. */
1026           next = OPERAND(scan); /* Avoid recursion. */
1027         else {
1028           do {
1029             save = reginput;
1030             if (regmatch(OPERAND(scan)))
1031               return(1);
1032             reginput = save;
1033             scan = regnext(scan);
1034           } while (scan != NULL && OP(scan) == BRANCH);
1035           return(0);
1036           /* NOTREACHED */
1037         }
1038       }
1039       break;
1040     case STAR:
1041     case PLUS: {
1042         register char nextch;
1043         register int no;
1044         register const char *save;
1045         register int min;
1046
1047         /*
1048          * Lookahead to avoid useless match attempts
1049          * when we know what character comes next.
1050          */
1051         nextch = '\0';
1052         if (OP(next) == EXACTLY)
1053           nextch = *OPERAND(next);
1054         min = (OP(scan) == STAR) ? 0 : 1;
1055         save = reginput;
1056         no = regrepeat(OPERAND(scan));
1057         while (no >= min) {
1058           /* If it could work, try it. */
1059           if (nextch == '\0' || *reginput == nextch)
1060             if (regmatch(next))
1061               return(1);
1062           /* Couldn't or didn't -- back up. */
1063           no--;
1064           reginput = save + no;
1065         }
1066         return(0);
1067       }
1068       break;
1069     case END:
1070       return(1);  /* Success! */
1071       break;
1072     default:
1073       regerror("memory corruption");
1074       return(0);
1075       break;
1076     }
1077
1078     scan = next;
1079   }
1080
1081   /*
1082    * We get here only if there's trouble -- normally "case END" is
1083    * the terminating point.
1084    */
1085   regerror("corrupted pointers");
1086   return(0);
1087 }
1088
1089 /*
1090  - regrepeat - repeatedly match something simple, report how many
1091  */
1092 static int
1093 regrepeat( char *p )
1094 {
1095   register int count = 0;
1096   register const char *scan;
1097   register char *opnd;
1098
1099   scan = reginput;
1100   opnd = OPERAND(p);
1101   switch (OP(p)) {
1102   case ANY:
1103     count = strlen(scan);
1104     scan += count;
1105     break;
1106   case EXACTLY:
1107     while (*opnd == *scan) {
1108       count++;
1109       scan++;
1110     }
1111     break;
1112   case ANYOF:
1113     while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1114       count++;
1115       scan++;
1116     }
1117     break;
1118   case ANYBUT:
1119     while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1120       count++;
1121       scan++;
1122     }
1123     break;
1124   default:    /* Oh dear.  Called inappropriately. */
1125     regerror("internal foulup");
1126     count = 0;  /* Best compromise. */
1127     break;
1128   }
1129   reginput = scan;
1130
1131   return(count);
1132 }
1133
1134 /*
1135  - regnext - dig the "next" pointer out of a node
1136  */
1137 static char *
1138 regnext( register char *p )
1139 {
1140   register int offset;
1141
1142   if (p == &regdummy)
1143     return(NULL);
1144
1145   offset = NEXT(p);
1146   if (offset == 0)
1147     return(NULL);
1148
1149   if (OP(p) == BACK)
1150     return(p-offset);
1151   else
1152     return(p+offset);
1153 }
1154
1155 #ifdef DEBUG
1156
1157 STATIC char *regprop();
1158
1159 /*
1160  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1161  */
1162 void
1163 regdump( regexp *r )
1164 {
1165   register char *s;
1166   register char op = EXACTLY; /* Arbitrary non-END op. */
1167   register char *next;
1168
1169
1170   s = r->program + 1;
1171   while (op != END) { /* While that wasn't END last time... */
1172     op = OP(s);
1173     printf("%2d%s", s-r->program, regprop(s));  /* Where, what. */
1174     next = regnext(s);
1175     if (next == NULL)   /* Next ptr. */
1176       printf("(0)");
1177     else
1178       printf("(%d)", (s-r->program)+(next-s));
1179     s += 3;
1180     if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1181       /* Literal string, where present. */
1182       while (*s != '\0') {
1183         putchar(*s);
1184         s++;
1185       }
1186       s++;
1187     }
1188     putchar('\n');
1189   }
1190
1191   /* Header fields of interest. */
1192   if (r->regstart != '\0')
1193     printf("start `%c' ", r->regstart);
1194   if (r->reganch)
1195     printf("anchored ");
1196   if (r->regmust != NULL)
1197     printf("must have \"%s\"", r->regmust);
1198   printf("\n");
1199 }
1200
1201 /*
1202  - regprop - printable representation of opcode
1203  */
1204 static char *
1205 regprop( char *op )
1206 {
1207   register char *p;
1208   static char buf[50];
1209
1210   (void) strcpy(buf, ":");
1211
1212   switch (OP(op)) {
1213   case BOL:
1214     p = "BOL";
1215     break;
1216   case EOL:
1217     p = "EOL";
1218     break;
1219   case ANY:
1220     p = "ANY";
1221     break;
1222   case ANYOF:
1223     p = "ANYOF";
1224     break;
1225   case ANYBUT:
1226     p = "ANYBUT";
1227     break;
1228   case BRANCH:
1229     p = "BRANCH";
1230     break;
1231   case EXACTLY:
1232     p = "EXACTLY";
1233     break;
1234   case NOTHING:
1235     p = "NOTHING";
1236     break;
1237   case BACK:
1238     p = "BACK";
1239     break;
1240   case END:
1241     p = "END";
1242     break;
1243   case OPEN+1:
1244   case OPEN+2:
1245   case OPEN+3:
1246   case OPEN+4:
1247   case OPEN+5:
1248   case OPEN+6:
1249   case OPEN+7:
1250   case OPEN+8:
1251   case OPEN+9:
1252     sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1253     p = NULL;
1254     break;
1255   case CLOSE+1:
1256   case CLOSE+2:
1257   case CLOSE+3:
1258   case CLOSE+4:
1259   case CLOSE+5:
1260   case CLOSE+6:
1261   case CLOSE+7:
1262   case CLOSE+8:
1263   case CLOSE+9:
1264     sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1265     p = NULL;
1266     break;
1267   case STAR:
1268     p = "STAR";
1269     break;
1270   case PLUS:
1271     p = "PLUS";
1272     break;
1273   case WORDA:
1274     p = "WORDA";
1275     break;
1276   case WORDZ:
1277     p = "WORDZ";
1278     break;
1279   default:
1280     regerror("corrupted opcode");
1281     break;
1282   }
1283   if (p != NULL)
1284     (void) strcat(buf, p);
1285   return(buf);
1286 }
1287 #endif
1288
1289 /*
1290  * The following is provided for those people who do not have strcspn() in
1291  * their C libraries.  They should get off their butts and do something
1292  * about it; at least one public-domain implementation of those (highly
1293  * useful) string routines has been published on Usenet.
1294  */
1295 #ifdef STRCSPN
1296 /*
1297  * strcspn - find length of initial segment of s1 consisting entirely
1298  * of characters not from s2
1299  */
1300
1301 static int
1302 strcspn(
1303   char *s1,
1304   char *s2 )
1305 {
1306   register char *scan1;
1307   register char *scan2;
1308   register int count;
1309
1310   count = 0;
1311   for (scan1 = s1; *scan1 != '\0'; scan1++) {
1312     for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1313       if (*scan1 == *scan2++)
1314         return(count);
1315     count++;
1316   }
1317   return(count);
1318 }
1319 #endif
1320
1321 void
1322 regerror( const char *s )
1323 {
1324   printf( "re error %s\n", s );
1325 }