src/hsregexp.c

   1 /*
   2  * hsRxCompile and hsRxExec -- regsub and hsRxError are elsewhere
   3  *
   4  *  Copyright (c) 1986 by University of Toronto.
   5  *  Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *  Permission is granted to anyone to use this software for any
   8  *  purpose on any computer system, and to redistribute it freely,
   9  *  subject to the following restrictions:
  10  *
  11  *  1. The author is not responsible for the consequences of use of
  12  *    this software, no matter how awful, even if they arise
  13  *    from defects in it.
  14  *
  15  *  2. The origin of this software must not be misrepresented, either
  16  *    by explicit claim or by omission.
  17  *
  18  *  3. Altered versions must be plainly marked as such, and must not
  19  *    be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a hsRxCompile() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into hsregexp.h, and the include of hsregexp.h now uses "'s
  33  *** to avoid conflicting with the system hsregexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  * Beware that some of this code is subtly aware of the way operator
  43  * precedence is structured in regular expressions.  Serious changes in
  44  * regular-expression syntax might require a total rethink.
  45  */
  46 #include "hsregexp.h"
  47
  48 #include <ctype.h>
  49 #include <stdio.h>
  50 #include <stdlib.h>
  51 #include <string.h>
  52
  53 /*
  54  * The "internal use only" fields in hsregexp.h are present to pass info from
  55  * compile to execute that permits the execute phase to run lots faster on
  56  * simple cases.  They are:
  57  *
  58  * regstart char that must begin a match; '\0' if none obvious
  59  * reganch  is the match anchored (at beginning-of-line only)?
  60  * regmust  string (pointer into program) that match must include, or NULL
  61  * regmlen  length of regmust string
  62  *
  63  * Regstart and reganch permit very fast decisions on suitable starting points
  64  * for a match, cutting down the work a lot.  Regmust permits fast rejection
  65  * of lines that cannot possibly match.  The regmust tests are costly enough
  66  * that hsRxCompile() supplies a regmust only if the r.e. contains something
  67  * potentially expensive (at present, the only such thing detected is * or +
  68  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
  69  * supplied because the test in hsRxExec() needs it and hsRxCompile() is computing
  70  * it anyway.
  71  */
  72
  73 /*
  74  * Structure for regexp "program".  This is essentially a linear encoding
  75  * of a nondeterministic finite-state machine (aka syntax charts or
  76  * "railroad normal form" in parsing technology).  Each node is an opcode
  77  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  78  * all nodes except BRANCH implement concatenation; a "next" pointer with
  79  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  80  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  81  * opposed to a collection of them) is never concatenated with anything
  82  * because of operator precedence.)  The operand of some types of node is
  83  * a literal string; for others, it is a node leading into a sub-FSM.  In
  84  * particular, the operand of a BRANCH node is the first node of the branch.
  85  * (NB this is *not* a tree structure:  the tail of the branch connects
  86  * to the thing following the set of BRANCHes.)  The opcodes are:
  87  */
  88
  89 /* definition opnd? meaning */
  90 enum {
  91   END,     /* no    End of program */
  92   BOL,     /* no    Match "" at beginning of line */
  93   EOL,     /* no    Match "" at end of line */
  94   ANY,     /* no    Match any one character */
  95   ANYOF,   /* str   Match any character in this string */
  96   ANYBUT,  /* str   Match any character not in this string */
  97   BRANCH,  /* node  Match this alternative, or the next */
  98   BACK,    /* no    Match "", "next" ptr points backward */
  99   EXACTLY, /* str   Match this string */
 100   NOTHING, /* no    Match empty string */
 101   STAR,    /* node  Match this (simple) thing 0 or more times */
 102   PLUS,    /* node  Match this (simple) thing 1 or more times */
 103   WORDA,   /* no    Match "" at wordchar, where prev is nonword */
 104   WORDZ,   /* no    Match "" at nonwordchar, where prev is word */
 105   //
 106   OPEN  = 40, /* no Mark this point in input as start of #n */
 107     /* OPEN+1 is number 1, etc. */
 108   CLOSE = 60 /* no  Analogous to OPEN */
 109 };
 110
 111
 112 /*
 113  * Opcode notes:
 114  *
 115  * BRANCH
 116  *   The set of branches constituting a single choice are hooked
 117  *   together with their "next" pointers, since precedence prevents
 118  *   anything being concatenated to any individual branch.
 119  *   The "next" pointer of the last BRANCH in a choice points to the
 120  *   thing following the whole choice. This is also where the
 121  *   final "next" pointer of each individual branch points; each
 122  *   branch starts with the operand node of a BRANCH node.
 123  *
 124  * BACK
 125  *   Normal "next" pointers all implicitly point forward; BACK
 126  *   exists to make loop structures possible.
 127  *
 128  * STAR, PLUS
 129  *   '?' and complex '*' and '+', are implemented as circular
 130  *   BRANCH structures using BACK. Simple cases (one character
 131  *   per match) are implemented with STAR and PLUS for speed
 132  *   and to minimize recursive plunges.
 133  *
 134  * OPEN,CLOSE ...are numbered at compile time.
 135  */
 136
 137 /*
 138  * A node is one char of opcode followed by two chars of "next" pointer.
 139  * "Next" pointers are stored as two 8-bit pieces, high order first.
 140  * The value is a positive offset from the opcode of the node containing it.
 141  * An operand, if any, simply follows the node. (Note that much of the
 142  * code generation knows about this implicit relationship.)
 143  *
 144  * Using two bytes for the "next" pointer is vast overkill for most things,
 145  * but allows patterns to get big without disasters.
 146  */
 147 #define OP(p)       (*(p))
 148 #define NEXT(p)     (((*((p)+1)&0377)<<8)+(*((p)+2)&0377))
 149 #define OPERAND(p)  ((p)+3)
 150
 151
 152 /*
 153  * utility definitions
 154  */
 155 #ifndef CHARBITS
 156 # define UCHARAT(p)  ((int)*(unsigned char *)(p))
 157 #else
 158 # define UCHARAT(p)  ((int)*(p)&CHARBITS)
 159 #endif
 160
 161 #define FAIL(m)    { hsRxError(m); return(NULL); }
 162 #define ISMULT(c)  ((c) == '*' || (c) == '+' || (c) == '?')
 163
 164
 165 /*
 166  * flags to be passed up and down
 167  */
 168 #define WORST     0 /* worst case */
 169 #define HASWIDTH  1 /* known never to match null string */
 170 #define SIMPLE    2 /* simple enough to be STAR/PLUS operand */
 171 #define SPSTART   4 /* starts with * or + */
 172
 173
 174 /*
 175  * The first byte of the regexp internal "program" is actually this magic
 176  * number; the start node begins in the second byte.
 177  */
 178 #define MAGIC  (0234)
 179
 180
 181 /*
 182  * global work variables for hsRxCompile()
 183  */
 184 static char *regparse; /* input-scan pointer */
 185 static int regnpar; /* () count */
 186 static char regdummy;
 187 static char *regcode; /* code-emit pointer; &regdummy = don't */
 188 static long regsize; /* code size */
 189
 190 /*
 191  * forward declarations for hsRxCompile()'s friends
 192  */
 193 static char *reg (int paren, int *flagp);
 194 static char *regbranch (int *flagp);
 195 static char *regpiece (int *flagp);
 196 static char *regatom (int *flagp);
 197 static char *regnode (int op);
 198 static char *regnext (char *p);
 199 static void regc (int b);
 200 static void reginsert (char op, char *opnd);
 201 static void regtail (char *p, char *val);
 202 static void regoptail (char *p, char *val);
 203 #ifdef STRCSPN
 204 static int strcspn (const char *s1, const char *s2);
 205 #endif
 206
 207
 208 /*
 209  - hsRxCompile - compile a regular expression into internal code
 210  *
 211  * We can't allocate space until we know how big the compiled form will be,
 212  * but we can't compile it (and thus know how big it is) until we've got a
 213  * place to put the code.  So we cheat:  we compile it twice, once with code
 214  * generation turned off and size counting turned on, and once "for real".
 215  * This also means that we don't allocate space until we are sure that the
 216  * thing really will compile successfully, and we never have to move the
 217  * code and thus invalidate pointers into it.  (Note that it has to be in
 218  * one piece because free() must be able to free it all.)
 219  *
 220  * Beware that the optimization-preparation code in here knows about some
 221  * of the structure of the compiled regexp.
 222  */
 223 HSRegExp *hsRxCompile (const char *exp) {
 224   HSRegExp *r;
 225   char *scan;
 226   char *longest;
 227   unsigned len;
 228   int flags;
 229   //
 230   if (exp == NULL) FAIL("NULL argument");
 231   /* first pass: determine size, legality */
 232 #ifdef notdef
 233   if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
 234 #endif
 235   regparse = (char *)exp;
 236   regnpar = 1;
 237   regsize = 0L;
 238   regcode = &regdummy;
 239   regc(MAGIC);
 240   if (reg(0, &flags) == NULL) return NULL;
 241   /* small enough for pointer-storage convention? */
 242   if (regsize >= 32767L) FAIL("regexp too big"); /* probably could be 65535L */
 243   /* allocate space. */
 244   r = (HSRegExp *)malloc(sizeof(HSRegExp)+(unsigned)regsize);
 245   if (r == NULL) FAIL("out of space");
 246   /* second pass: emit code */
 247   r->parcnt = regnpar;
 248   regparse = (char *)exp;
 249   regnpar = 1;
 250   regcode = r->program;
 251   regc(MAGIC);
 252   if (reg(0, &flags) == NULL) return NULL;
 253   /* dig out information for optimizations */
 254   r->regstart = '\0'; /* worst-case defaults */
 255   r->reganch = 0;
 256   r->regmust = NULL;
 257   r->regmlen = 0;
 258   scan = r->program+1; /* first BRANCH */
 259   if (OP(regnext(scan)) == END) {
 260     /* only one top-level choice */
 261     scan = OPERAND(scan);
 262     /* starting-point info */
 263     if (OP(scan) == EXACTLY) r->regstart = *OPERAND(scan);
 264     else if (OP(scan) == BOL) ++(r->reganch);
 265     /* if there's something expensive in the r.e., find the longest
 266      * literal string that must appear and make it the regmust */
 267     /* resolve ties in favor of later strings, since the regstart
 268      * check works with the beginning of the r.e. and avoiding
 269      * duplication strengthens checking */
 270     /* not a strong reason, but sufficient in the absence of others */
 271     if (flags&SPSTART) {
 272       longest = NULL;
 273       len = 0;
 274       for (; scan != NULL; scan = regnext(scan)) {
 275         if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
 276           longest = OPERAND(scan);
 277           len = strlen(OPERAND(scan));
 278         }
 279       }
 280       r->regmust = longest;
 281       r->regmlen = len;
 282     }
 283   }
 284   return r;
 285 }
 286
 287
 288 /*
 289  - reg - regular expression, i.e. main body or parenthesized thing
 290  *
 291  * Caller must absorb opening parenthesis.
 292  *
 293  * Combining parenthesis handling with the base level of regular expression
 294  * is a trifle forced, but the need to tie the tails of the branches to what
 295  * follows makes it hard to avoid.
 296  */
 297 static char *reg (int paren/*parenthesized?*/, int *flagp) {
 298   char *ret;
 299   char *br;
 300   char *ender;
 301   int parno = 0; /*k8*/
 302   int flags;
 303   //
 304   *flagp = HASWIDTH; /* tentatively */
 305   /* make an OPEN node, if parenthesized */
 306   if (paren) {
 307     if (regnpar >= NSUBEXP) FAIL("too many ()");
 308     parno = regnpar;
 309     ++regnpar;
 310     ret = regnode(OPEN+parno);
 311   } else {
 312     ret = NULL;
 313   }
 314   /* pick up the branches, linking them together */
 315   br = regbranch(&flags);
 316   if (br == NULL) return NULL;
 317   if (ret != NULL) regtail(ret, br); /* OPEN -> first */ else ret = br;
 318   if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 319   *flagp |= flags&SPSTART;
 320   while (*regparse == '|' || *regparse == '\n') {
 321     ++regparse;
 322     br = regbranch(&flags);
 323     if (br == NULL) return NULL;
 324     regtail(ret, br); /* BRANCH -> BRANCH */
 325     if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
 326     *flagp |= flags&SPSTART;
 327   }
 328   /* make a closing node, and hook it on the end */
 329   ender = regnode(paren?CLOSE+parno:END);
 330   regtail(ret, ender);
 331   /* hook the tails of the branches to the closing node */
 332   for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender);
 333   /* check for proper termination */
 334   if (paren && *regparse++ != ')') {
 335     FAIL("unmatched ()");
 336   } else if (!paren && *regparse != '\0') {
 337     if (*regparse == ')') {
 338       FAIL("unmatched ()");
 339     } else {
 340       FAIL("junk on end"); /* "Can't happen". */
 341     }
 342     /* NOTREACHED */
 343   }
 344   return ret;
 345 }
 346
 347
 348 /*
 349  - regbranch - one alternative of an | operator
 350  *
 351  * Implements the concatenation operator.
 352  */
 353 static char *regbranch (int *flagp) {
 354   char *ret;
 355   char *chain;
 356   char *latest;
 357   int flags;
 358   //
 359   *flagp = WORST; /* tentatively */
 360   ret = regnode(BRANCH);
 361   chain = NULL;
 362   while (*regparse != '\0' && *regparse != ')' && *regparse != '\n' && *regparse != '|') {
 363     latest = regpiece(&flags);
 364     if (latest == NULL) return NULL;
 365     *flagp |= flags&HASWIDTH;
 366     if (chain == NULL) *flagp |= flags&SPSTART; /* first piece */ else regtail(chain, latest);
 367     chain = latest;
 368   }
 369   if (chain == NULL) regnode(NOTHING); /* loop ran zero times */
 370   return ret;
 371 }
 372
 373
 374 /*
 375  - regpiece - something followed by possible [*+?]
 376  *
 377  * Note that the branching code sequences used for ? and the general cases
 378  * of * and + are somewhat optimized:  they use the same NOTHING node as
 379  * both the endmarker for their branch list and the body of the last branch.
 380  * It might seem that this node could be dispensed with entirely, but the
 381  * endmarker role is not redundant.
 382  */
 383 static char *regpiece (int *flagp) {
 384   char *ret;
 385   char op;
 386   char *next;
 387   int flags;
 388   //
 389   ret = regatom(&flags);
 390   if (ret == NULL) return NULL;
 391   op = *regparse;
 392   if (!ISMULT(op)) { *flagp = flags; return ret; }
 393   if (!(flags&HASWIDTH) && op != '?') FAIL("*+ operand could be empty");
 394   *flagp = (op != '+')?(WORST|SPSTART):(WORST|HASWIDTH);
 395   if (op == '*' && (flags&SIMPLE)) {
 396     reginsert(STAR, ret);
 397   } else if (op == '*') {
 398     /* emit x* as (x&|), where & means "self" */
 399     reginsert(BRANCH, ret);         /* either x */
 400     regoptail(ret, regnode(BACK));  /* and loop */
 401     regoptail(ret, ret);            /* back */
 402     regtail(ret, regnode(BRANCH));  /* or */
 403     regtail(ret, regnode(NOTHING)); /* null */
 404   } else if (op == '+' && (flags&SIMPLE)) {
 405     reginsert(PLUS, ret);
 406   } else if (op == '+') {
 407     /* emit x+ as x(&|), where & means "self" */
 408     next = regnode(BRANCH);         /* either */
 409     regtail(ret, next);
 410     regtail(regnode(BACK), ret);    /* loop back */
 411     regtail(next, regnode(BRANCH)); /* or */
 412     regtail(ret, regnode(NOTHING)); /* null */
 413   } else if (op == '?') {
 414     /* emit x? as (x|) */
 415     reginsert(BRANCH, ret);        /* either x */
 416     regtail(ret, regnode(BRANCH)); /* or */
 417     next = regnode(NOTHING);       /* null */
 418     regtail(ret, next);
 419     regoptail(ret, next);
 420   }
 421   ++regparse;
 422   if (ISMULT(*regparse)) FAIL("nested *?+");
 423   return(ret);
 424 }
 425
 426
 427 /*
 428  - regatom - the lowest level
 429  *
 430  * Optimization:  gobbles an entire sequence of ordinary characters so that
 431  * it can turn them into a single node, which is smaller to store and
 432  * faster to run.  Backslashed characters are exceptions, each becoming a
 433  * separate node; the code is simpler that way and it's not worth fixing.
 434  */
 435 static char *regatom (int *flagp) {
 436   char *ret;
 437   int flags;
 438   //
 439   *flagp = WORST; /* tentatively */
 440   switch (*regparse++) {
 441     /* FIXME: these chars only have meaning at beg/end of pat? */
 442     case '^':
 443       ret = regnode(BOL);
 444       break;
 445     case '$':
 446       ret = regnode(EOL);
 447       break;
 448     //
 449     case '.':
 450       ret = regnode(ANY);
 451       *flagp |= HASWIDTH|SIMPLE;
 452       break;
 453     case '[': {
 454       int classr, classend;
 455       //
 456       if (*regparse == '^') {
 457         /* complement of range */
 458         ret = regnode(ANYBUT);
 459         ++regparse;
 460       } else {
 461         ret = regnode(ANYOF);
 462       }
 463       if (*regparse == ']' || *regparse == '-') regc(*regparse++);
 464       while (*regparse != '\0' && *regparse != ']') {
 465         if (*regparse == '-') {
 466           ++regparse;
 467           if (*regparse == ']' || *regparse == '\0') {
 468             regc('-');
 469           } else {
 470             classr = UCHARAT(regparse-2)+1;
 471             classend = UCHARAT(regparse);
 472             if (classr > classend+1) FAIL("invalid [] range");
 473             for (; classr <= classend; ++classr) regc(classr);
 474             ++regparse;
 475           }
 476         } else {
 477           regc(*regparse++);
 478         }
 479       }
 480       regc('\0');
 481       if (*regparse != ']') FAIL("unmatched []");
 482       ++regparse;
 483       *flagp |= HASWIDTH|SIMPLE;
 484       } break;
 485     case '(':
 486       ret = reg(1, &flags);
 487       if (ret == NULL) return NULL;
 488       *flagp |= flags&(HASWIDTH|SPSTART);
 489       break;
 490     case '\0':
 491     case '|':
 492     case '\n':
 493     case ')':
 494       FAIL("internal urp"); /* supposed to be caught earlier */
 495       break;
 496     case '?':
 497     case '+':
 498     case '*':
 499       FAIL("?+* follows nothing");
 500       break;
 501     case '\\':
 502       switch (*regparse++) {
 503         case '\0': FAIL("trailing \\"); break;
 504         case '<': ret = regnode(WORDA); break;
 505         case '>': ret = regnode(WORDZ); break;
 506         /* FIXME: Someday handle \1, \2, ... */
 507         default: goto de_fault; /* Handle general quoted chars in exact-match routine */
 508       }
 509       break;
 510 de_fault:
 511     default:
 512       /*
 513        * Encode a string of characters to be matched exactly.
 514        *
 515        * This is a bit tricky due to quoted chars and due to
 516        * '*', '+', and '?' taking the SINGLE char previous
 517        * as their operand.
 518        *
 519        * On entry, the char at regparse[-1] is going to go
 520        * into the string, no matter what it is.  (It could be
 521        * following a \ if we are entered from the '\' case.)
 522        *
 523        * Basic idea is to pick up a good char in  ch  and
 524        * examine the next char.  If it's *+? then we twiddle.
 525        * If it's \ then we frozzle.  If it's other magic char
 526        * we push  ch  and terminate the string.  If none of the
 527        * above, we push  ch  on the string and go around again.
 528        *
 529        *  regprev  is used to remember where "the current char"
 530        * starts in the string, if due to a *+? we need to back
 531        * up and put the current char in a separate, 1-char, string.
 532        * When  regprev  is NULL,  ch  is the only char in the
 533        * string; this is used in *+? handling, and in setting
 534        * flags |= SIMPLE at the end.
 535        */
 536       {
 537         char *regprev;
 538         char ch;
 539         //
 540         --regparse; /* look at cur char */
 541         ret = regnode(EXACTLY);
 542         for (regprev = 0; ;) {
 543           ch = *regparse++; /* get current char */
 544           /* look at next one */
 545           switch (*regparse) {
 546             case '.': case '[': case '(':
 547             case ')': case '|': case '\n':
 548             case '$': case '^':
 549             case '\0':
 550               /* FIXME, $ and ^ should not always be magic */
 551 magic:        regc(ch); /* dump cur char */
 552               goto done; /* and we are done */
 553             case '?': case '+': case '*':
 554               if (!regprev) goto magic; /* if just ch in str, use it */
 555               /* end mult-char string one early */
 556               regparse = regprev; /* back up parse */
 557               goto done;
 558             case '\\':
 559               regc(ch); /* cur char OK */
 560               /* look after \ */
 561               switch (regparse[1]) {
 562                 case '\0':
 563                 case '<':
 564                 case '>':
 565                   /* FIXME: someday handle \1, \2, ... */
 566                   goto done; /* not quoted */
 567                 default:
 568                   /* backup point is \, scan point is after it */
 569                   regprev = regparse;
 570                   ++regparse;
 571                   continue; /* NOT break; */
 572               }
 573             default:
 574               regc(ch); /* add cur to string */
 575               break;
 576           }
 577           regprev = regparse; /* set backup point */
 578         }
 579 done:   regc('\0');
 580         *flagp |= HASWIDTH;
 581         if (!regprev) *flagp |= SIMPLE; /* one char? */
 582       }
 583       break;
 584   }
 585   return ret;
 586 }
 587
 588
 589 /*
 590  - regnode - emit a node
 591  */
 592 /* return location */
 593 static char *regnode (int op) {
 594   char *ret;
 595   char *ptr;
 596   //
 597   ret = regcode;
 598   if (ret == &regdummy) { regsize += 3; return ret; }
 599   ptr = ret;
 600   *ptr++ = op;
 601   *ptr++ = '\0'; /* null "next" pointer */
 602   *ptr++ = '\0';
 603   regcode = ptr;
 604   return ret;
 605 }
 606
 607
 608 /*
 609  - regc - emit (if appropriate) a byte of code
 610  */
 611 static void regc (int b) {
 612   if (regcode != &regdummy) *regcode++ = b; else ++regsize;
 613 }
 614
 615
 616 /*
 617  - reginsert - insert an operator in front of already-emitted operand
 618  *
 619  * Means relocating the operand.
 620  */
 621 static void reginsert (char op, char *opnd) {
 622   char *src;
 623   char *dst;
 624   char *place;
 625   //
 626   if (regcode == &regdummy) { regsize += 3; return; }
 627   src = regcode;
 628   regcode += 3;
 629   dst = regcode;
 630   while (src > opnd) *--dst = *--src;
 631   place = opnd; /* op node, where operand used to be */
 632   *place++ = op;
 633   *place++ = '\0';
 634   *place++ = '\0';
 635 }
 636
 637
 638 /*
 639  - regtail - set the next-pointer at the end of a node chain
 640  */
 641 static void regtail (char *p, char *val) {
 642   char *scan;
 643   char *temp;
 644   int offset;
 645   //
 646   if (p == &regdummy) return;
 647   /* find last node */
 648   scan = p;
 649   for (;;) {
 650     temp = regnext(scan);
 651     if (temp == NULL) break;
 652     scan = temp;
 653   }
 654   offset = OP(scan)==BACK?scan-val:val-scan;
 655   *(scan+1) = (offset>>8)&0377;
 656   *(scan+2) = offset&0377;
 657 }
 658
 659
 660 /*
 661  - regoptail - regtail on operand of first argument; nop if operandless
 662  */
 663 static void regoptail (char *p, char *val) {
 664   /* "operandless" and "op != BRANCH" are synonymous in practice */
 665   if (p == NULL || p == &regdummy || OP(p) != BRANCH) return;
 666   regtail(OPERAND(p), val);
 667 }
 668
 669
 670 /*
 671  * hsRxExec and friends
 672  */
 673
 674 /*
 675  * global work variables for hsRxExec()
 676  */
 677 static const char *reginput; /* string-input pointer */
 678 static char *regbol; /* beginning of input, for ^ check */
 679 static const char **regstartp; /* pointer to startp array */
 680 static const char **regendp; /* ditto for endp */
 681
 682 /* forwards */
 683 static int regtry (HSRegExp *prog, const char *string);
 684 static int regmatch (char *prog);
 685 static int regrepeat (char *p);
 686
 687 #ifdef RXDEBUG
 688 int hsRxNarrate = 0;
 689 static const char *regprop (const char *op);
 690 #endif
 691
 692
 693 int hsRxFree (HSRegExp *re) {
 694   if (re != NULL) free(re);
 695   return 0;
 696 }
 697
 698
 699 /*
 700  - hsRxExec - match a regexp against a string
 701  */
 702 int hsRxExec (HSRegExp *prog, const char *string) {
 703   char *s;
 704   //
 705   /* be paranoid... */
 706   if (prog == NULL || string == NULL) { hsRxError("NULL parameter"); return 0; }
 707   /* check validity of program */
 708   if (UCHARAT(prog->program) != MAGIC) { hsRxError("corrupted program"); return 0; }
 709   /* if there is a "must appear" string, look for it */
 710   if (prog->regmust != NULL) {
 711     s = (char *)string;
 712     while ((s = strchr(s, prog->regmust[0])) != NULL) {
 713       if (strncmp(s, prog->regmust, prog->regmlen) == 0) break; /* found it */
 714       ++s;
 715     }
 716     if (s == NULL) return 0; /* not present */
 717   }
 718   /* mark beginning of line for ^ */
 719   regbol = (char *)string;
 720   /* simplest case: anchored match need be tried only once */
 721   if (prog->reganch) return regtry(prog, string);
 722   /* messy cases: unanchored match */
 723   s = (char *)string;
 724   if (prog->regstart != '\0') {
 725     /* we know what char it must start with */
 726     while ((s = strchr(s, prog->regstart)) != NULL) {
 727       if (regtry(prog, s)) return 1;
 728       ++s;
 729     }
 730   } else {
 731     /* we don't -- general case */
 732     do {
 733       if (regtry(prog, s)) return 1;
 734     } while (*s++ != '\0');
 735   }
 736   /* failure */
 737   return 0;
 738 }
 739
 740
 741 /*
 742  - regtry - try match at specific point
 743  */
 744 /* 0 failure, 1 success */
 745 static int regtry (HSRegExp *prog, const char *string) {
 746   int i;
 747   const char **sp, **ep;
 748   //
 749   reginput = string;
 750   regstartp = prog->startp;
 751   regendp = prog->endp;
 752   sp = prog->startp;
 753   ep = prog->endp;
 754   for (i = NSUBEXP; i > 0; --i) { *sp++ = NULL; *ep++ = NULL; }
 755   if (regmatch(prog->program+1)) {
 756     prog->startp[0] = string;
 757     prog->endp[0] = reginput;
 758     return 1;
 759   }
 760   return 0;
 761 }
 762
 763
 764 /*
 765  - regmatch - main matching routine
 766  *
 767  * Conceptually the strategy is simple:  check to see whether the current
 768  * node matches, call self recursively to see whether the rest matches,
 769  * and then act accordingly.  In practice we make some effort to avoid
 770  * recursion, in particular by going through "ordinary" nodes (that don't
 771  * need to know whether the rest of the match failed) by a loop instead of
 772  * by recursion.
 773  */
 774 /* 0 failure, 1 success */
 775 static int regmatch (char *prog) {
 776   const char *save;
 777   char *scan; /* current node */
 778   char *next; /* next node */
 779   int no;
 780   //
 781   scan = prog;
 782 #ifdef RXDEBUG
 783   if (scan != NULL && hsRxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
 784 #endif
 785   while (scan != NULL) {
 786 #ifdef RXDEBUG
 787     if (hsRxNarrate) fprintf(stderr, "%s...\n", regprop(scan));
 788 #endif
 789     next = regnext(scan);
 790     if (OP(scan) >= OPEN && OP(scan) < OPEN+NSUBEXP) {
 791       no = OP(scan)-OPEN;
 792       save = reginput;
 793       if (regmatch(next)) {
 794         /* don't set startp if some later invocation of the same parentheses already has */
 795         if (regstartp[no] == NULL || regstartp[no] > save) regstartp[no] = save;
 796         return 1;
 797       }
 798       return 0;
 799     }
 800     if (OP(scan) >= CLOSE && OP(scan) < CLOSE+NSUBEXP) {
 801       no = OP(scan)-CLOSE;
 802       save = reginput;
 803       if (regmatch(next)) {
 804         /* don't set endp if some later invocation of the same parentheses already has */
 805         if (regendp[no] == NULL || regendp[no] < save) regendp[no] = save;
 806         return 1;
 807       }
 808       return 0;
 809     }
 810     switch (OP(scan)) {
 811       case BOL: if (reginput != regbol) return 0; break;
 812       case EOL: if (*reginput != '\0') return 0; break;
 813       case WORDA:
 814         /* must be looking at a letter, digit, or _ */
 815         if ((!isalnum(*reginput)) && *reginput != '_') return 0;
 816         /* prev must be BOL or nonword */
 817         if (reginput > regbol && (isalnum(reginput[-1]) || reginput[-1] == '_')) return 0;
 818         break;
 819       case WORDZ:
 820         /* must be looking at non letter, digit, or _ */
 821         if (isalnum(*reginput) || *reginput == '_') return 0;
 822         /* we don't care what the previous char was */
 823         break;
 824       case ANY:
 825         if (*reginput == '\0') return 0;
 826         ++reginput;
 827         break;
 828       case EXACTLY: {
 829         int len;
 830         char *opnd;
 831         //
 832         opnd = OPERAND(scan);
 833         /* inline the first character, for speed */
 834         if (*opnd != *reginput) return 0;
 835         len = strlen(opnd);
 836         if (len > 1 && strncmp(opnd, reginput, len) != 0) return 0;
 837         reginput += len;
 838         } break;
 839       case ANYOF:
 840         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) return 0;
 841         ++reginput;
 842         break;
 843       case ANYBUT:
 844         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) return 0;
 845         ++reginput;
 846         break;
 847       case NOTHING: break;
 848       case BACK: break;
 849       case BRANCH: {
 850         //
 851         if (OP(next) != BRANCH) {
 852           /* no choice */
 853           /* avoid recursion, i.e. move PC to the next VM instruction */
 854           next = OPERAND(scan);
 855         } else {
 856           do {
 857             save = reginput;
 858             if (regmatch(OPERAND(scan))) return 1; /* execute from next instruction */
 859             reginput = save;
 860             /* perform jump, if failed */
 861             scan = regnext(scan);
 862           } while (scan != NULL && OP(scan) == BRANCH);
 863           return 0;
 864           /* NOTREACHED */
 865         }
 866       } break;
 867       case STAR:
 868       case PLUS: {
 869         char nextch;
 870         int min;
 871         //
 872         /* lookahead to avoid useless match attempts when we know what character comes next */
 873         nextch = '\0';
 874         if (OP(next) == EXACTLY) nextch = *OPERAND(next);
 875         min = (OP(scan) == STAR)?0:1;
 876         save = reginput;
 877         no = regrepeat(OPERAND(scan));
 878         while (no >= min) {
 879           /* if it could work, try it */
 880           if (nextch == '\0' || *reginput == nextch) {
 881             if (regmatch(next)) return 1;
 882           }
 883           /* couldn't or didn't -- back up */
 884           --no;
 885           reginput = save+no;
 886         }
 887         return 0;
 888         } break;
 889       case END: return 1; /* success! */
 890       default: hsRxError("memory corruption"); return 0;
 891     } /* switch */
 892     scan = next;
 893   } /* while */
 894   /* we get here only if there's trouble -- normally "case END" is the terminating point */
 895   hsRxError("corrupted pointers");
 896   return 0;
 897 }
 898
 899
 900 /*
 901  - regrepeat - repeatedly match something simple, report how many
 902  */
 903 static int regrepeat (char *p) {
 904   int count = 0;
 905   const char *scan;
 906   char *opnd;
 907   //
 908   scan = reginput;
 909   opnd = OPERAND(p);
 910   switch (OP(p)) {
 911     case ANY:
 912       count = strlen(scan);
 913       scan += count;
 914       break;
 915     case EXACTLY:
 916       while (*opnd == *scan) { ++count; ++scan; }
 917       break;
 918     case ANYOF:
 919       while (*scan != '\0' && strchr(opnd, *scan) != NULL) { ++count; ++scan; }
 920       break;
 921     case ANYBUT:
 922       while (*scan != '\0' && strchr(opnd, *scan) == NULL) { ++count; ++scan; }
 923       break;
 924     default: /* oh dear! called inappropriately */
 925       hsRxError("internal foulup");
 926       count = 0; /* best compromise */
 927       break;
 928   }
 929   reginput = scan;
 930   return count;
 931 }
 932
 933
 934 /*
 935  - regnext - dig the "next" pointer out of a node
 936  */
 937 static char *regnext (char *p) {
 938   int offset;
 939   //
 940   if (p == &regdummy) return NULL;
 941   offset = NEXT(p);
 942   if (offset == 0) return NULL;
 943   if (OP(p) == BACK) return p-offset;
 944   return p+offset;
 945 }
 946
 947
 948 #ifdef RXDEBUG
 949 /*
 950  - hsRxDump - dump a regexp onto stdout in vaguely comprehensible form
 951  */
 952 void hsRxDump (HSRegExp *r) {
 953   char *s, *next;
 954   char op = EXACTLY; /* arbitrary non-END op */
 955   //
 956   s = r->program+1;
 957   /* while that wasn't END last time... */
 958   while (op != END) {
 959     op = OP(s);
 960     printf("%5d: %s", s-r->program, regprop(s)); /* where, what */
 961     next = regnext(s);
 962     if (next == NULL) printf("%3d", 0); /* next ptr */
 963     else printf("%3d", (s-r->program)+(next-s));
 964     s += 3;
 965     if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
 966       /* literal string, where present */
 967       putchar(' ');
 968       putchar('"');
 969       while (*s != '\0') {
 970         char ch = *s++;
 971         if ((unsigned char)ch < ' ' || ch == 127 || ch == '"') {
 972           switch (ch) {
 973             case '"': printf("\\\""); break;
 974             case '\t': printf("\\t"); break;
 975             case '\n': printf("\\n"); break;
 976             case '\r': printf("\\r"); break;
 977             default: printf("\\x%02x", (unsigned char)ch); break;
 978           }
 979         } else {
 980           putchar(ch);
 981         }
 982       }
 983       putchar('"');
 984       ++s;
 985     }
 986     putchar('\n');
 987   }
 988   /* header fields of interest */
 989   if (r->regstart != '\0') printf("start `%c' ", r->regstart);
 990   if (r->reganch) printf("anchored ");
 991   if (r->regmust != NULL) printf("must have \"%s\"", r->regmust);
 992   printf("\n");
 993 }
 994
 995
 996 /*
 997  - regprop - printable representation of opcode
 998  */
 999 static const char *regprop (const char *op) {
1000   char *p = NULL;
1001   static char buf[50];
1002   //
1003   buf[0] = '\0';
1004   if (OP(op) >= OPEN && OP(op) < OPEN+NSUBEXP) {
1005     sprintf(buf+strlen(buf), "OPEN%02d", OP(op)-OPEN);
1006   } else if (OP(op) >= CLOSE && OP(op) < CLOSE+NSUBEXP) {
1007     sprintf(buf+strlen(buf), "CLOSE%02d", OP(op)-OPEN);
1008   } else {
1009     switch (OP(op)) {
1010       case BOL: p = "BOL"; break;
1011       case EOL: p = "EOL"; break;
1012       case ANY: p = "ANY"; break;
1013       case ANYOF: p = "ANYOF"; break;
1014       case ANYBUT: p = "ANYBUT"; break;
1015       case BRANCH: p = "BRANCH"; break;
1016       case EXACTLY: p = "EXACTLY"; break;
1017       case NOTHING: p = "NOTHING"; break;
1018       case BACK: p = "BACK"; break;
1019       case END: p = "END"; break;
1020       case STAR: p = "STAR"; break;
1021       case PLUS: p = "PLUS"; break;
1022       case WORDA: p = "WORDA"; break;
1023       case WORDZ: p = "WORDZ"; break;
1024       default: hsRxError("corrupted opcode"); break;
1025     }
1026   }
1027   if (p != NULL) strcat(buf, p);
1028   while (strlen(buf) < 10) strcat(buf, " ");
1029   return buf;
1030 }
1031 #endif
1032
1033
1034 /*
1035  * The following is provided for those people who do not have strcspn() in
1036  * their C libraries.  They should get off their butts and do something
1037  * about it; at least one public-domain implementation of those (highly
1038  * useful) string routines has been published on Usenet.
1039  */
1040 #ifdef STRCSPN
1041 /*
1042  * strcspn - find length of initial segment of s1 consisting entirely
1043  * of characters not from s2
1044  */
1045
1046 static int strcspn (const char *s1, const char *s2) {
1047   const char *scan1, *scan2;
1048   int count;
1049   //
1050   count = 0;
1051   for (scan1 = s1; *scan1 != '\0'; ++scan1) {
1052     for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1053       if (*scan1 == *scan2++) return count ;
1054     ++count;
1055   }
1056   return count;
1057 }
1058 #endif
1059
1060
1061 void hsRxError (const char *s) {
1062   printf("re error %s\n", s);
1063 }