compiler/posixc/regex/regcomp.c

   1 /*      $NetBSD: regcomp.c,v 1.28 2007/02/09 23:44:18 junyoung Exp $    */
   2
   3 /*-
   4  * Copyright (c) 1992, 1993, 1994
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Henry Spencer.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  35  */
  36
  37 /*-
  38  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  39  *
  40  * This code is derived from software contributed to Berkeley by
  41  * Henry Spencer.
  42  *
  43  * Redistribution and use in source and binary forms, with or without
  44  * modification, are permitted provided that the following conditions
  45  * are met:
  46  * 1. Redistributions of source code must retain the above copyright
  47  *    notice, this list of conditions and the following disclaimer.
  48  * 2. Redistributions in binary form must reproduce the above copyright
  49  *    notice, this list of conditions and the following disclaimer in the
  50  *    documentation and/or other materials provided with the distribution.
  51  * 3. All advertising materials mentioning features or use of this software
  52  *    must display the following acknowledgement:
  53  *      This product includes software developed by the University of
  54  *      California, Berkeley and its contributors.
  55  * 4. Neither the name of the University nor the names of its contributors
  56  *    may be used to endorse or promote products derived from this software
  57  *    without specific prior written permission.
  58  *
  59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  69  * SUCH DAMAGE.
  70  *
  71  *      @(#)regcomp.c   8.5 (Berkeley) 3/20/94
  72  */
  73
  74 /*
  75 __RCSID("$NetBSD: regcomp.c,v 1.28 2007/02/09 23:44:18 junyoung Exp $");
  76 */
  77
  78 #if defined(__AROS__)
  79 #if !DEBUG
  80 #define NDEBUG
  81 #else
  82 #define REDEBUG
  83 #endif
  84 #endif
  85
  86 #include <sys/types.h>
  87 #include <inttypes.h>
  88
  89 #include <assert.h>
  90 #include <ctype.h>
  91 #include <limits.h>
  92 #include <stdio.h>
  93 #include <stdlib.h>
  94 #include <string.h>
  95 #include <regex.h>
  96
  97 #ifdef __weak_alias
  98 __weak_alias(regcomp,_regcomp)
  99 #endif
 100
 101 #include "utils.h"
 102 #include "regex2.h"
 103
 104 #include "cclass.h"
 105 #include "cname.h"
 106
 107 /*
 108  * parse structure, passed up and down to avoid global variables and
 109  * other clumsinesses
 110  */
 111 struct parse {
 112         const char *next;       /* next character in RE */
 113         const char *end;        /* end of string (-> NUL normally) */
 114         int error;              /* has an error been seen? */
 115         sop *strip;             /* malloced strip */
 116         sopno ssize;            /* malloced strip size (allocated) */
 117         sopno slen;             /* malloced strip length (used) */
 118         int ncsalloc;           /* number of csets allocated */
 119         struct re_guts *g;
 120 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
 121         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
 122         sopno pend[NPAREN];     /* -> ) ([0] unused) */
 123 };
 124
 125 /* ========= begin header generated by ./mkh ========= */
 126 #ifdef __cplusplus
 127 extern "C" {
 128 #endif
 129
 130 /* === regcomp.c === */
 131 static void p_ere(struct parse *p, int stop);
 132 static void p_ere_exp(struct parse *p);
 133 static void p_str(struct parse *p);
 134 static void p_bre(struct parse *p, int end1, int end2);
 135 static int p_simp_re(struct parse *p, int starordinary);
 136 static int p_count(struct parse *p);
 137 static void p_bracket(struct parse *p);
 138 static void p_b_term(struct parse *p, cset *cs);
 139 static void p_b_cclass(struct parse *p, cset *cs);
 140 static void p_b_eclass(struct parse *p, cset *cs);
 141 static char p_b_symbol(struct parse *p);
 142 static char p_b_coll_elem(struct parse *p, int endc);
 143 static int othercase(int ch);
 144 static void bothcases(struct parse *p, int ch);
 145 static void ordinary(struct parse *p, int ch);
 146 static void nonnewline(struct parse *p);
 147 static void repeat(struct parse *p, sopno start, int from, int to);
 148 static int seterr(struct parse *p, int e);
 149 static cset *allocset(struct parse *p);
 150 static void freeset(struct parse *p, cset *cs);
 151 static int freezeset(struct parse *p, cset *cs);
 152 static int firstch(struct parse *p, cset *cs);
 153 static int nch(struct parse *p, cset *cs);
 154 static void mcadd(struct parse *p, cset *cs, const char *cp);
 155 #if 0
 156 static void mcsub(cset *cs, char *cp);
 157 static int mcin(cset *cs, char *cp);
 158 static char *mcfind(cset *cs, char *cp);
 159 #endif
 160 static void mcinvert(struct parse *p, cset *cs);
 161 static void mccase(struct parse *p, cset *cs);
 162 static int isinsets(struct re_guts *g, int c);
 163 static int samesets(struct re_guts *g, int c1, int c2);
 164 static void categorize(struct parse *p, struct re_guts *g);
 165 static sopno dupl(struct parse *p, sopno start, sopno finish);
 166 static void doemit(struct parse *p, sop op, sopno opnd);
 167 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
 168 static void dofwd(struct parse *p, sopno pos, sopno value);
 169 static void enlarge(struct parse *p, sopno size);
 170 static void stripsnug(struct parse *p, struct re_guts *g);
 171 static void findmust(struct parse *p, struct re_guts *g);
 172 static sopno pluscount(struct parse *p, struct re_guts *g);
 173
 174 #ifdef __cplusplus
 175 }
 176 #endif
 177 /* ========= end header generated by ./mkh ========= */
 178
 179 static char nuls[10];           /* place to point scanner in event of error */
 180
 181 /*
 182  * macros for use with parse structure
 183  * BEWARE:  these know that the parse structure is named `p' !!!
 184  */
 185 #define PEEK()  (*p->next)
 186 #define PEEK2() (*(p->next+1))
 187 #define MORE()  (p->next < p->end)
 188 #define MORE2() (p->next+1 < p->end)
 189 #define SEE(c)  (MORE() && PEEK() == (c))
 190 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 191 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 192 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 193 #define NEXT()  (p->next++)
 194 #define NEXT2() (p->next += 2)
 195 #define NEXTn(n)        (p->next += (n))
 196 #define GETNEXT()       (*p->next++)
 197 #define SETERROR(e)     seterr(p, (e))
 198 #define REQUIRE(co, e)  (void) ((co) || SETERROR(e))
 199 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 200 #define MUSTEAT(c, e)   (void) (REQUIRE(MORE() && GETNEXT() == (c), e))
 201 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 202 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd)
 203 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 204 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 205 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 206 #define HERE()          (p->slen)
 207 #define THERE()         (p->slen - 1)
 208 #define THERETHERE()    (p->slen - 2)
 209 #define DROP(n) (p->slen -= (n))
 210
 211 #ifndef NDEBUG
 212 static int never = 0;           /* for use in asserts; shuts lint up */
 213 #else
 214 #define never   0               /* some <assert.h>s have bugs too */
 215 #endif
 216
 217 /*
 218  - regcomp - interface for parser and compilation
 219  = extern int regcomp(regex_t *, const char *, int);
 220  = #define      REG_BASIC       0000
 221  = #define      REG_EXTENDED    0001
 222  = #define      REG_ICASE       0002
 223  = #define      REG_NOSUB       0004
 224  = #define      REG_NEWLINE     0010
 225  = #define      REG_NOSPEC      0020
 226  = #define      REG_PEND        0040
 227  = #define      REG_DUMP        0200
 228  */
 229 int                             /* 0 success, otherwise REG_something */
 230 regcomp(
 231     regex_t *preg,
 232     const char *pattern,
 233     int cflags)
 234 {
 235         struct parse pa;
 236         struct re_guts *g;
 237         struct parse *p = &pa;
 238         int i;
 239         size_t len;
 240 #ifdef REDEBUG
 241 #       define  GOODFLAGS(f)    (f)
 242 #else
 243 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 244 #endif
 245
 246         assert(preg != NULL);
 247         assert(pattern != NULL);
 248
 249         cflags = GOODFLAGS(cflags);
 250         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 251                 return(REG_INVARG);
 252
 253         if (cflags&REG_PEND) {
 254                 if (preg->re_endp < pattern)
 255                         return(REG_INVARG);
 256                 len = preg->re_endp - pattern;
 257         } else
 258                 len = strlen(pattern);
 259
 260         /* do the mallocs early so failure handling is easy */
 261         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 262                                                         (NC-1)*sizeof(cat_t));
 263         if (g == NULL)
 264                 return(REG_ESPACE);
 265         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 266         p->strip = (sop *)malloc(p->ssize * sizeof(sop));
 267         p->slen = 0;
 268         if (p->strip == NULL) {
 269                 free(g);
 270                 return(REG_ESPACE);
 271         }
 272
 273         /* set things up */
 274         p->g = g;
 275         p->next = pattern;
 276         p->end = p->next + len;
 277         p->error = 0;
 278         p->ncsalloc = 0;
 279         for (i = 0; i < NPAREN; i++) {
 280                 p->pbegin[i] = 0;
 281                 p->pend[i] = 0;
 282         }
 283         g->csetsize = NC;
 284         g->sets = NULL;
 285         g->setbits = NULL;
 286         g->ncsets = 0;
 287         g->cflags = cflags;
 288         g->iflags = 0;
 289         g->nbol = 0;
 290         g->neol = 0;
 291         g->must = NULL;
 292         g->mlen = 0;
 293         g->nsub = 0;
 294         g->ncategories = 1;     /* category 0 is "everything else" */
 295         g->categories = &g->catspace[-(CHAR_MIN)];
 296         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 297         g->backrefs = 0;
 298
 299         /* do it */
 300         EMIT(OEND, 0);
 301         g->firststate = THERE();
 302         if (cflags&REG_EXTENDED)
 303                 p_ere(p, OUT);
 304         else if (cflags&REG_NOSPEC)
 305                 p_str(p);
 306         else
 307                 p_bre(p, OUT, OUT);
 308         EMIT(OEND, 0);
 309         g->laststate = THERE();
 310
 311         /* tidy up loose ends and fill things in */
 312         categorize(p, g);
 313         stripsnug(p, g);
 314         findmust(p, g);
 315         g->nplus = pluscount(p, g);
 316         g->magic = MAGIC2;
 317         preg->re_nsub = g->nsub;
 318         preg->re_g = g;
 319         preg->re_magic = MAGIC1;
 320 #ifndef REDEBUG
 321         /* not debugging, so can't rely on the assert() in regexec() */
 322         if (g->iflags&BAD)
 323                 SETERROR(REG_ASSERT);
 324 #endif
 325
 326         /* win or lose, we're done */
 327         if (p->error != 0)      /* lose */
 328                 regfree(preg);
 329         return(p->error);
 330 }
 331
 332 /*
 333  - p_ere - ERE parser top level, concatenation and alternation
 334  == static void p_ere(struct parse *p, int stop);
 335  */
 336 static void
 337 p_ere(
 338     struct parse *p,
 339     int stop)                   /* character this ERE should end at */
 340 {
 341         char c;
 342         sopno prevback = 0;     /* pacify gcc */
 343         sopno prevfwd = 0;      /* pacify gcc */
 344         sopno conc;
 345         int first = 1;          /* is this the first alternative? */
 346
 347         assert(p != NULL);
 348
 349         for (;;) {
 350                 /* do a bunch of concatenated expressions */
 351                 conc = HERE();
 352                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 353                         p_ere_exp(p);
 354                 REQUIRE(HERE() != conc, REG_EMPTY);     /* require nonempty */
 355
 356                 if (!EAT('|'))
 357                         break;          /* NOTE BREAK OUT */
 358
 359                 if (first) {
 360                         INSERT(OCH_, conc);     /* offset is wrong */
 361                         prevfwd = conc;
 362                         prevback = conc;
 363                         first = 0;
 364                 }
 365                 ASTERN(OOR1, prevback);
 366                 prevback = THERE();
 367                 AHEAD(prevfwd);                 /* fix previous offset */
 368                 prevfwd = HERE();
 369                 EMIT(OOR2, 0);                  /* offset is very wrong */
 370         }
 371
 372         if (!first) {           /* tail-end fixups */
 373                 AHEAD(prevfwd);
 374                 ASTERN(O_CH, prevback);
 375         }
 376
 377         assert(!MORE() || SEE(stop));
 378 }
 379
 380 /*
 381  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 382  == static void p_ere_exp(struct parse *p);
 383  */
 384 static void
 385 p_ere_exp(
 386     struct parse *p)
 387 {
 388         char c;
 389         sopno pos;
 390         int count;
 391         int count2;
 392         sopno subno;
 393         int wascaret = 0;
 394
 395         assert(p != NULL);
 396
 397         assert(MORE());         /* caller should have ensured this */
 398         c = GETNEXT();
 399
 400         pos = HERE();
 401         switch (c) {
 402         case '(':
 403                 REQUIRE(MORE(), REG_EPAREN);
 404                 p->g->nsub++;
 405                 subno = p->g->nsub;
 406                 if (subno < NPAREN)
 407                         p->pbegin[subno] = HERE();
 408                 EMIT(OLPAREN, subno);
 409                 if (!SEE(')'))
 410                         p_ere(p, ')');
 411                 if (subno < NPAREN) {
 412                         p->pend[subno] = HERE();
 413                         assert(p->pend[subno] != 0);
 414                 }
 415                 EMIT(ORPAREN, subno);
 416                 MUSTEAT(')', REG_EPAREN);
 417                 break;
 418 #ifndef POSIX_MISTAKE
 419         case ')':               /* happens only if no current unmatched ( */
 420                 /*
 421                  * You may ask, why the ifndef?  Because I didn't notice
 422                  * this until slightly too late for 1003.2, and none of the
 423                  * other 1003.2 regular-expression reviewers noticed it at
 424                  * all.  So an unmatched ) is legal POSIX, at least until
 425                  * we can get it fixed.
 426                  */
 427                 SETERROR(REG_EPAREN);
 428                 break;
 429 #endif
 430         case '^':
 431                 EMIT(OBOL, 0);
 432                 p->g->iflags |= USEBOL;
 433                 p->g->nbol++;
 434                 wascaret = 1;
 435                 break;
 436         case '$':
 437                 EMIT(OEOL, 0);
 438                 p->g->iflags |= USEEOL;
 439                 p->g->neol++;
 440                 break;
 441         case '|':
 442                 SETERROR(REG_EMPTY);
 443                 break;
 444         case '*':
 445         case '+':
 446         case '?':
 447                 SETERROR(REG_BADRPT);
 448                 break;
 449         case '.':
 450                 if (p->g->cflags&REG_NEWLINE)
 451                         nonnewline(p);
 452                 else
 453                         EMIT(OANY, 0);
 454                 break;
 455         case '[':
 456                 p_bracket(p);
 457                 break;
 458         case '\\':
 459                 REQUIRE(MORE(), REG_EESCAPE);
 460                 c = GETNEXT();
 461                 ordinary(p, c);
 462                 break;
 463         case '{':               /* okay as ordinary except if digit follows */
 464                 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT);
 465                 /* FALLTHROUGH */
 466         default:
 467                 ordinary(p, c);
 468                 break;
 469         }
 470
 471         if (!MORE())
 472                 return;
 473         c = PEEK();
 474         /* we call { a repetition if followed by a digit */
 475         if (!( c == '*' || c == '+' || c == '?' ||
 476             (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ))
 477                 return;         /* no repetition, we're done */
 478         NEXT();
 479
 480         REQUIRE(!wascaret, REG_BADRPT);
 481         switch (c) {
 482         case '*':       /* implemented as +? */
 483                 /* this case does not require the (y|) trick, noKLUDGE */
 484                 INSERT(OPLUS_, pos);
 485                 ASTERN(O_PLUS, pos);
 486                 INSERT(OQUEST_, pos);
 487                 ASTERN(O_QUEST, pos);
 488                 break;
 489         case '+':
 490                 INSERT(OPLUS_, pos);
 491                 ASTERN(O_PLUS, pos);
 492                 break;
 493         case '?':
 494                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 495                 INSERT(OCH_, pos);              /* offset slightly wrong */
 496                 ASTERN(OOR1, pos);              /* this one's right */
 497                 AHEAD(pos);                     /* fix the OCH_ */
 498                 EMIT(OOR2, 0);                  /* offset very wrong... */
 499                 AHEAD(THERE());                 /* ...so fix it */
 500                 ASTERN(O_CH, THERETHERE());
 501                 break;
 502         case '{':
 503                 count = p_count(p);
 504                 if (EAT(',')) {
 505                         if (isdigit((unsigned char)PEEK())) {
 506                                 count2 = p_count(p);
 507                                 REQUIRE(count <= count2, REG_BADBR);
 508                         } else          /* single number with comma */
 509                                 count2 = INFINITY;
 510                 } else          /* just a single number */
 511                         count2 = count;
 512                 repeat(p, pos, count, count2);
 513                 if (!EAT('}')) {        /* error heuristics */
 514                         while (MORE() && PEEK() != '}')
 515                                 NEXT();
 516                         REQUIRE(MORE(), REG_EBRACE);
 517                         SETERROR(REG_BADBR);
 518                 }
 519                 break;
 520         }
 521
 522         if (!MORE())
 523                 return;
 524         c = PEEK();
 525         if (!( c == '*' || c == '+' || c == '?' ||
 526             (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) )
 527                 return;
 528         SETERROR(REG_BADRPT);
 529 }
 530
 531 /*
 532  - p_str - string (no metacharacters) "parser"
 533  == static void p_str(struct parse *p);
 534  */
 535 static void
 536 p_str(
 537     struct parse *p)
 538 {
 539
 540         assert(p != NULL);
 541
 542         REQUIRE(MORE(), REG_EMPTY);
 543         while (MORE())
 544                 ordinary(p, GETNEXT());
 545 }
 546
 547 /*
 548  - p_bre - BRE parser top level, anchoring and concatenation
 549  == static void p_bre(struct parse *p, int end1, \
 550  ==     int end2);
 551  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 552  *
 553  * This implementation is a bit of a kludge, in that a trailing $ is first
 554  * taken as an ordinary character and then revised to be an anchor.  The
 555  * only undesirable side effect is that '$' gets included as a character
 556  * category in such cases.  This is fairly harmless; not worth fixing.
 557  * The amount of lookahead needed to avoid this kludge is excessive.
 558  */
 559 static void
 560 p_bre(
 561     struct parse *p,
 562     int end1,           /* first terminating character */
 563     int end2)           /* second terminating character */
 564 {
 565         sopno start;
 566         int first = 1;                  /* first subexpression? */
 567         int wasdollar = 0;
 568
 569         assert(p != NULL);
 570
 571         start = HERE();
 572
 573         if (EAT('^')) {
 574                 EMIT(OBOL, 0);
 575                 p->g->iflags |= USEBOL;
 576                 p->g->nbol++;
 577         }
 578         while (MORE() && !SEETWO(end1, end2)) {
 579                 wasdollar = p_simp_re(p, first);
 580                 first = 0;
 581         }
 582         if (wasdollar) {        /* oops, that was a trailing anchor */
 583                 DROP(1);
 584                 EMIT(OEOL, 0);
 585                 p->g->iflags |= USEEOL;
 586                 p->g->neol++;
 587         }
 588
 589         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 590 }
 591
 592 /*
 593  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 594  == static int p_simp_re(struct parse *p, int starordinary);
 595  */
 596 static int                      /* was the simple RE an unbackslashed $? */
 597 p_simp_re(
 598     struct parse *p,
 599     int starordinary)           /* is a leading * an ordinary character? */
 600 {
 601         int c;
 602         int count;
 603         int count2;
 604         sopno pos;
 605         int i;
 606         sopno subno;
 607 #       define  BACKSL  (1<<CHAR_BIT)
 608
 609         assert(p != NULL);
 610
 611         pos = HERE();           /* repetion op, if any, covers from here */
 612
 613         assert(MORE());         /* caller should have ensured this */
 614         c = GETNEXT();
 615         if (c == '\\') {
 616                 REQUIRE(MORE(), REG_EESCAPE);
 617                 c = BACKSL | (unsigned char)GETNEXT();
 618         }
 619         switch (c) {
 620         case '.':
 621                 if (p->g->cflags&REG_NEWLINE)
 622                         nonnewline(p);
 623                 else
 624                         EMIT(OANY, 0);
 625                 break;
 626         case '[':
 627                 p_bracket(p);
 628                 break;
 629         case BACKSL|'{':
 630                 SETERROR(REG_BADRPT);
 631                 break;
 632         case BACKSL|'(':
 633                 p->g->nsub++;
 634                 subno = p->g->nsub;
 635                 if (subno < NPAREN)
 636                         p->pbegin[subno] = HERE();
 637                 EMIT(OLPAREN, subno);
 638                 /* the MORE here is an error heuristic */
 639                 if (MORE() && !SEETWO('\\', ')'))
 640                         p_bre(p, '\\', ')');
 641                 if (subno < NPAREN) {
 642                         p->pend[subno] = HERE();
 643                         assert(p->pend[subno] != 0);
 644                 }
 645                 EMIT(ORPAREN, subno);
 646                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 647                 break;
 648         case BACKSL|')':        /* should not get here -- must be user */
 649         case BACKSL|'}':
 650                 SETERROR(REG_EPAREN);
 651                 break;
 652         case BACKSL|'1':
 653         case BACKSL|'2':
 654         case BACKSL|'3':
 655         case BACKSL|'4':
 656         case BACKSL|'5':
 657         case BACKSL|'6':
 658         case BACKSL|'7':
 659         case BACKSL|'8':
 660         case BACKSL|'9':
 661                 i = (c&~BACKSL) - '0';
 662                 assert(i < NPAREN);
 663                 if (p->pend[i] != 0) {
 664                         assert(i <= p->g->nsub);
 665                         EMIT(OBACK_, i);
 666                         assert(p->pbegin[i] != 0);
 667                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 668                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 669                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 670                         EMIT(O_BACK, i);
 671                 } else
 672                         SETERROR(REG_ESUBREG);
 673                 p->g->backrefs = 1;
 674                 break;
 675         case '*':
 676                 REQUIRE(starordinary, REG_BADRPT);
 677                 /* FALLTHROUGH */
 678         default:
 679                 ordinary(p, c &~ BACKSL);
 680                 break;
 681         }
 682
 683         if (EAT('*')) {         /* implemented as +? */
 684                 /* this case does not require the (y|) trick, noKLUDGE */
 685                 INSERT(OPLUS_, pos);
 686                 ASTERN(O_PLUS, pos);
 687                 INSERT(OQUEST_, pos);
 688                 ASTERN(O_QUEST, pos);
 689         } else if (EATTWO('\\', '{')) {
 690                 count = p_count(p);
 691                 if (EAT(',')) {
 692                         if (MORE() && isdigit((unsigned char)PEEK())) {
 693                                 count2 = p_count(p);
 694                                 REQUIRE(count <= count2, REG_BADBR);
 695                         } else          /* single number with comma */
 696                                 count2 = INFINITY;
 697                 } else          /* just a single number */
 698                         count2 = count;
 699                 repeat(p, pos, count, count2);
 700                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 701                         while (MORE() && !SEETWO('\\', '}'))
 702                                 NEXT();
 703                         REQUIRE(MORE(), REG_EBRACE);
 704                         SETERROR(REG_BADBR);
 705                 }
 706         } else if (c == (unsigned char)'$')     /* $ (but not \$) ends it */
 707                 return(1);
 708
 709         return(0);
 710 }
 711
 712 /*
 713  - p_count - parse a repetition count
 714  == static int p_count(struct parse *p);
 715  */
 716 static int                      /* the value */
 717 p_count(
 718     struct parse *p)
 719 {
 720         int count = 0;
 721         int ndigits = 0;
 722
 723         assert(p != NULL);
 724
 725         while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) {
 726                 count = count*10 + (GETNEXT() - '0');
 727                 ndigits++;
 728         }
 729
 730         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 731         return(count);
 732 }
 733
 734 /*
 735  - p_bracket - parse a bracketed character list
 736  == static void p_bracket(struct parse *p);
 737  *
 738  * Note a significant property of this code:  if the allocset() did SETERROR,
 739  * no set operations are done.
 740  */
 741 static void
 742 p_bracket(
 743     struct parse *p)
 744 {
 745         cset *cs;
 746         int invert = 0;
 747
 748         assert(p != NULL);
 749
 750         cs = allocset(p);
 751
 752         /* Dept of Truly Sickening Special-Case Kludges */
 753         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]",
 754                                             (size_t)6) == 0) {
 755                 EMIT(OBOW, 0);
 756                 NEXTn(6);
 757                 return;
 758         }
 759         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]",
 760                                             (size_t)6) == 0) {
 761                 EMIT(OEOW, 0);
 762                 NEXTn(6);
 763                 return;
 764         }
 765
 766         if (EAT('^'))
 767                 invert++;       /* make note to invert set at end */
 768         if (EAT(']'))
 769                 CHadd(cs, ']');
 770         else if (EAT('-'))
 771                 CHadd(cs, '-');
 772         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 773                 p_b_term(p, cs);
 774         if (EAT('-'))
 775                 CHadd(cs, '-');
 776         MUSTEAT(']', REG_EBRACK);
 777
 778         if (p->error != 0)      /* don't mess things up further */
 779                 return;
 780
 781         if (p->g->cflags&REG_ICASE) {
 782                 int i;
 783                 int ci;
 784
 785                 for (i = p->g->csetsize - 1; i >= 0; i--)
 786                         if (CHIN(cs, i) && isalpha(i)) {
 787                                 ci = othercase(i);
 788                                 if (ci != i)
 789                                         CHadd(cs, ci);
 790                         }
 791                 if (cs->multis != NULL)
 792                         mccase(p, cs);
 793         }
 794         if (invert) {
 795                 int i;
 796
 797                 for (i = p->g->csetsize - 1; i >= 0; i--)
 798                         if (CHIN(cs, i))
 799                                 CHsub(cs, i);
 800                         else
 801                                 CHadd(cs, i);
 802                 if (p->g->cflags&REG_NEWLINE)
 803                         CHsub(cs, '\n');
 804                 if (cs->multis != NULL)
 805                         mcinvert(p, cs);
 806         }
 807
 808         assert(cs->multis == NULL);             /* xxx */
 809
 810         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 811                 ordinary(p, firstch(p, cs));
 812                 freeset(p, cs);
 813         } else
 814                 EMIT(OANYOF, freezeset(p, cs));
 815 }
 816
 817 /*
 818  - p_b_term - parse one term of a bracketed character list
 819  == static void p_b_term(struct parse *p, cset *cs);
 820  */
 821 static void
 822 p_b_term(
 823     struct parse *p,
 824     cset *cs)
 825 {
 826         char c;
 827         char start, finish;
 828         int i;
 829
 830         assert(p != NULL);
 831         assert(cs != NULL);
 832
 833         /* classify what we've got */
 834         switch ((MORE()) ? PEEK() : '\0') {
 835         case '[':
 836                 c = (MORE2()) ? PEEK2() : '\0';
 837                 break;
 838
 839         case '-':
 840                 SETERROR(REG_ERANGE);
 841                 return;                 /* NOTE RETURN */
 842
 843         default:
 844                 c = '\0';
 845                 break;
 846         }
 847
 848         switch (c) {
 849         case ':':               /* character class */
 850                 NEXT2();
 851                 REQUIRE(MORE(), REG_EBRACK);
 852                 c = PEEK();
 853                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 854                 p_b_cclass(p, cs);
 855                 REQUIRE(MORE(), REG_EBRACK);
 856                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 857                 break;
 858         case '=':               /* equivalence class */
 859                 NEXT2();
 860                 REQUIRE(MORE(), REG_EBRACK);
 861                 c = PEEK();
 862                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 863                 p_b_eclass(p, cs);
 864                 REQUIRE(MORE(), REG_EBRACK);
 865                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 866                 break;
 867         default:                /* symbol, ordinary character, or range */
 868 /* xxx revision needed for multichar stuff */
 869                 start = p_b_symbol(p);
 870                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 871                         /* range */
 872                         NEXT();
 873                         if (EAT('-'))
 874                                 finish = '-';
 875                         else
 876                                 finish = p_b_symbol(p);
 877                 } else
 878                         finish = start;
 879 /* xxx what about signed chars here... */
 880                 REQUIRE(start <= finish, REG_ERANGE);
 881                 for (i = start; i <= finish; i++)
 882                         CHadd(cs, i);
 883                 break;
 884         }
 885 }
 886
 887 /*
 888  - p_b_cclass - parse a character-class name and deal with it
 889  == static void p_b_cclass(struct parse *p, cset *cs);
 890  */
 891 static void
 892 p_b_cclass(
 893     struct parse *p,
 894     cset *cs)
 895 {
 896         const char *sp;
 897         const struct cclass *cp;
 898         size_t len;
 899         const char *u;
 900         char c;
 901
 902         assert(p != NULL);
 903         assert(cs != NULL);
 904
 905         sp = p->next;
 906
 907         while (MORE() && isalpha((unsigned char)PEEK()))
 908                 NEXT();
 909         len = p->next - sp;
 910         for (cp = cclasses; cp->name != NULL; cp++)
 911                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 912                         break;
 913         if (cp->name == NULL) {
 914                 /* oops, didn't find it */
 915                 SETERROR(REG_ECTYPE);
 916                 return;
 917         }
 918
 919         u = cp->chars;
 920         while ((c = *u++) != '\0')
 921                 CHadd(cs, c);
 922         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 923                 MCadd(p, cs, u);
 924 }
 925
 926 /*
 927  - p_b_eclass - parse an equivalence-class name and deal with it
 928  == static void p_b_eclass(struct parse *p, cset *cs);
 929  *
 930  * This implementation is incomplete. xxx
 931  */
 932 static void
 933 p_b_eclass(
 934     struct parse *p,
 935     cset *cs)
 936 {
 937         char c;
 938
 939         assert(p != NULL);
 940         assert(cs != NULL);
 941
 942         c = p_b_coll_elem(p, '=');
 943         CHadd(cs, c);
 944 }
 945
 946 /*
 947  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 948  == static char p_b_symbol(struct parse *p);
 949  */
 950 static char                     /* value of symbol */
 951 p_b_symbol(
 952     struct parse *p)
 953 {
 954         char value;
 955
 956         assert(p != NULL);
 957
 958         REQUIRE(MORE(), REG_EBRACK);
 959         if (!EATTWO('[', '.'))
 960                 return(GETNEXT());
 961
 962         /* collating symbol */
 963         value = p_b_coll_elem(p, '.');
 964         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 965         return(value);
 966 }
 967
 968 /*
 969  - p_b_coll_elem - parse a collating-element name and look it up
 970  == static char p_b_coll_elem(struct parse *p, int endc);
 971  */
 972 static char                     /* value of collating element */
 973 p_b_coll_elem(
 974     struct parse *p,
 975     int endc)                   /* name ended by endc,']' */
 976 {
 977         const char *sp;
 978         const struct cname *cp;
 979         size_t len;
 980
 981         assert(p != NULL);
 982
 983         sp = p->next;
 984
 985         while (MORE() && !SEETWO(endc, ']'))
 986                 NEXT();
 987         if (!MORE()) {
 988                 SETERROR(REG_EBRACK);
 989                 return(0);
 990         }
 991         len = p->next - sp;
 992         for (cp = cnames; cp->name != NULL; cp++)
 993                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 994                         return(cp->code);       /* known name */
 995         if (len == 1)
 996                 return(*sp);    /* single character */
 997         SETERROR(REG_ECOLLATE);                 /* neither */
 998         return(0);
 999 }
1000
/*
 - othercase - return the case counterpart of an alphabetic
 == static int othercase(int ch);
 *
 * ch must satisfy isalpha().  An upper-case letter maps to its
 * lower-case form and vice versa; a letter that is neither is
 * returned unchanged.
 */
static int                      /* if no counterpart, return ch */
othercase(
    int ch)
{
        int counterpart = ch;   /* peculiar, but could happen */

        assert(isalpha(ch));
        if (isupper(ch))
                counterpart = tolower(ch);
        else if (islower(ch))
                counterpart = toupper(ch);
        return(counterpart);
}
1017
1018 /*
1019  - bothcases - emit a dualcase version of a two-case character
1020  == static void bothcases(struct parse *p, int ch);
1021  *
1022  * Boy, is this implementation ever a kludge...
1023  */
1024 static void
1025 bothcases(
1026     struct parse *p,
1027     int ch)
1028 {
1029         const char *oldnext;
1030         const char *oldend;
1031         char bracket[3];
1032
1033         assert(p != NULL);
1034
1035         oldnext = p->next;
1036         oldend = p->end;
1037
1038         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
1039         p->next = bracket;
1040         p->end = bracket+2;
1041         bracket[0] = ch;
1042         bracket[1] = ']';
1043         bracket[2] = '\0';
1044         p_bracket(p);
1045         assert(p->next == bracket+2);
1046         p->next = oldnext;
1047         p->end = oldend;
1048 }
1049
1050 /*
1051  - ordinary - emit an ordinary character
1052  == static void ordinary(struct parse *p, int ch);
1053  */
1054 static void
1055 ordinary(
1056     struct parse *p,
1057     int ch)
1058 {
1059         cat_t *cap;
1060
1061         assert(p != NULL);
1062
1063         cap = p->g->categories;
1064         if ((p->g->cflags&REG_ICASE) && isalpha((unsigned char) ch)
1065             && othercase((unsigned char) ch) != (unsigned char) ch)
1066                 bothcases(p, (unsigned char) ch);
1067         else {
1068                 EMIT(OCHAR, (unsigned char)ch);
1069                 if (cap[ch] == 0)
1070                         cap[ch] = p->g->ncategories++;
1071         }
1072 }
1073
1074 /*
1075  - nonnewline - emit REG_NEWLINE version of OANY
1076  == static void nonnewline(struct parse *p);
1077  *
1078  * Boy, is this implementation ever a kludge...
1079  */
1080 static void
1081 nonnewline(
1082     struct parse *p)
1083 {
1084         const char *oldnext;
1085         const char *oldend;
1086         char bracket[4];
1087
1088         assert(p != NULL);
1089
1090         oldnext = p->next;
1091         oldend = p->end;
1092
1093         p->next = bracket;
1094         p->end = bracket+3;
1095         bracket[0] = '^';
1096         bracket[1] = '\n';
1097         bracket[2] = ']';
1098         bracket[3] = '\0';
1099         p_bracket(p);
1100         assert(p->next == bracket+3);
1101         p->next = oldnext;
1102         p->end = oldend;
1103 }
1104
1105 /*
1106  - repeat - generate code for a bounded repetition, recursively if needed
1107  == static void repeat(struct parse *p, sopno start, int from, int to);
1108  */
1109 static void
1110 repeat(
1111     struct parse *p,
1112     sopno start,                /* operand from here to end of strip */
1113     int from,                   /* repeated from this number */
1114     int to)                     /* to this number of times (maybe INFINITY) */
1115 {
1116         sopno finish;
1117 #       define  N       2
1118 #       define  INF     3
1119 #       define  REP(f, t)       ((f)*8 + (t))
1120 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1121         sopno copy;
1122
1123         assert(p != NULL);
1124
1125         finish = HERE();
1126
1127         if (p->error != 0)      /* head off possible runaway recursion */
1128                 return;
1129
1130         assert(from <= to);
1131
1132         switch (REP(MAP(from), MAP(to))) {
1133         case REP(0, 0):                 /* must be user doing this */
1134                 DROP(finish-start);     /* drop the operand */
1135                 break;
1136         case REP(0, 1):                 /* as x{1,1}? */
1137         case REP(0, N):                 /* as x{1,n}? */
1138         case REP(0, INF):               /* as x{1,}? */
1139                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1140                 INSERT(OCH_, start);            /* offset is wrong... */
1141                 repeat(p, start+1, 1, to);
1142                 ASTERN(OOR1, start);
1143                 AHEAD(start);                   /* ... fix it */
1144                 EMIT(OOR2, 0);
1145                 AHEAD(THERE());
1146                 ASTERN(O_CH, THERETHERE());
1147                 break;
1148         case REP(1, 1):                 /* trivial case */
1149                 /* done */
1150                 break;
1151         case REP(1, N):                 /* as x?x{1,n-1} */
1152                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1153                 INSERT(OCH_, start);
1154                 ASTERN(OOR1, start);
1155                 AHEAD(start);
1156                 EMIT(OOR2, 0);                  /* offset very wrong... */
1157                 AHEAD(THERE());                 /* ...so fix it */
1158                 ASTERN(O_CH, THERETHERE());
1159                 copy = dupl(p, start+1, finish+1);
1160                 assert(copy == finish+4);
1161                 repeat(p, copy, 1, to-1);
1162                 break;
1163         case REP(1, INF):               /* as x+ */
1164                 INSERT(OPLUS_, start);
1165                 ASTERN(O_PLUS, start);
1166                 break;
1167         case REP(N, N):                 /* as xx{m-1,n-1} */
1168                 copy = dupl(p, start, finish);
1169                 repeat(p, copy, from-1, to-1);
1170                 break;
1171         case REP(N, INF):               /* as xx{n-1,INF} */
1172                 copy = dupl(p, start, finish);
1173                 repeat(p, copy, from-1, to);
1174                 break;
1175         default:                        /* "can't happen" */
1176                 SETERROR(REG_ASSERT);   /* just in case */
1177                 break;
1178         }
1179 }
1180
1181 /*
1182  - seterr - set an error condition
1183  == static int seterr(struct parse *p, int e);
1184  */
1185 static int                      /* useless but makes type checking happy */
1186 seterr(
1187     struct parse *p,
1188     int e)
1189 {
1190
1191         assert(p != NULL);
1192
1193         if (p->error == 0)      /* keep earliest error condition */
1194                 p->error = e;
1195         p->next = nuls;         /* try to bring things to a halt */
1196         p->end = nuls;
1197         return(0);              /* make the return value well-defined */
1198 }
1199
1200 /*
1201  - allocset - allocate a set of characters for []
1202  == static cset *allocset(struct parse *p);
1203  */
1204 static cset *
1205 allocset(
1206     struct parse *p)
1207 {
1208         int no;
1209         size_t nc;
1210         size_t nbytes;
1211         cset *cs;
1212         size_t css;
1213         int i;
1214
1215         assert(p != NULL);
1216
1217         no = p->g->ncsets++;
1218         css = (size_t)p->g->csetsize;
1219         if (no >= p->ncsalloc) {        /* need another column of space */
1220                 p->ncsalloc += CHAR_BIT;
1221                 nc = p->ncsalloc;
1222                 assert(nc % CHAR_BIT == 0);
1223                 nbytes = nc / CHAR_BIT * css;
1224                 if (p->g->sets == NULL)
1225                         p->g->sets = malloc(nc * sizeof(cset));
1226                 else
1227                         p->g->sets = realloc(p->g->sets, nc * sizeof(cset));
1228                 if (p->g->setbits == NULL)
1229                         p->g->setbits = malloc(nbytes);
1230                 else {
1231                         p->g->setbits = realloc(p->g->setbits, nbytes);
1232                         /* xxx this isn't right if setbits is now NULL */
1233                         for (i = 0; i < no; i++)
1234                                 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1235                 }
1236                 if (p->g->sets != NULL && p->g->setbits != NULL)
1237                         (void) memset((char *)p->g->setbits + (nbytes - css),
1238                                                                 0, css);
1239                 else {
1240                         no = 0;
1241                         SETERROR(REG_ESPACE);
1242                         /* caller's responsibility not to do set ops */
1243                 }
1244         }
1245
1246         assert(p->g->sets != NULL);     /* xxx */
1247         cs = &p->g->sets[no];
1248         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1249         cs->mask = 1 << ((no) % CHAR_BIT);
1250         cs->hash = 0;
1251         cs->smultis = 0;
1252         cs->multis = NULL;
1253
1254         return(cs);
1255 }
1256
1257 /*
1258  - freeset - free a now-unused set
1259  == static void freeset(struct parse *p, cset *cs);
1260  */
1261 static void
1262 freeset(
1263     struct parse *p,
1264     cset *cs)
1265 {
1266         int i;
1267         cset *top;
1268         size_t css;
1269
1270         assert(p != NULL);
1271         assert(cs != NULL);
1272
1273         top = &p->g->sets[p->g->ncsets];
1274         css = (size_t)p->g->csetsize;
1275
1276         for (i = 0; i < css; i++)
1277                 CHsub(cs, i);
1278         if (cs == top-1)        /* recover only the easy case */
1279                 p->g->ncsets--;
1280 }
1281
1282 /*
1283  - freezeset - final processing on a set of characters
1284  == static int freezeset(struct parse *p, cset *cs);
1285  *
1286  * The main task here is merging identical sets.  This is usually a waste
1287  * of time (although the hash code minimizes the overhead), but can win
1288  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1289  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1290  * the same value!
1291  */
1292 static int                      /* set number */
1293 freezeset(
1294     struct parse *p,
1295     cset *cs)
1296 {
1297         uch h;
1298         int i;
1299         cset *top;
1300         cset *cs2;
1301         size_t css;
1302
1303         assert(p != NULL);
1304         assert(cs != NULL);
1305
1306         h = cs->hash;
1307         top = &p->g->sets[p->g->ncsets];
1308         css = (size_t)p->g->csetsize;
1309
1310         /* look for an earlier one which is the same */
1311         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1312                 if (cs2->hash == h && cs2 != cs) {
1313                         /* maybe */
1314                         for (i = 0; i < css; i++)
1315                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1316                                         break;          /* no */
1317                         if (i == css)
1318                                 break;                  /* yes */
1319                 }
1320
1321         if (cs2 < top) {        /* found one */
1322                 freeset(p, cs);
1323                 cs = cs2;
1324         }
1325
1326         return((int)(cs - p->g->sets));
1327 }
1328
1329 /*
1330  - firstch - return first character in a set (which must have at least one)
1331  == static int firstch(struct parse *p, cset *cs);
1332  */
1333 static int                      /* character; there is no "none" value */
1334 firstch(
1335     struct parse *p,
1336     cset *cs)
1337 {
1338         int i;
1339         size_t css;
1340
1341         assert(p != NULL);
1342         assert(cs != NULL);
1343
1344         css = (size_t)p->g->csetsize;
1345
1346         for (i = 0; i < css; i++)
1347                 if (CHIN(cs, i))
1348                         return((char)i);
1349         assert(never);
1350         return(0);              /* arbitrary */
1351 }
1352
1353 /*
1354  - nch - number of characters in a set
1355  == static int nch(struct parse *p, cset *cs);
1356  */
1357 static int
1358 nch(
1359     struct parse *p,
1360     cset *cs)
1361 {
1362         int i;
1363         size_t css;
1364         int n = 0;
1365
1366         assert(p != NULL);
1367         assert(cs != NULL);
1368
1369         css = (size_t)p->g->csetsize;
1370
1371         for (i = 0; i < css; i++)
1372                 if (CHIN(cs, i))
1373                         n++;
1374         return(n);
1375 }
1376
1377 /*
1378  - mcadd - add a collating element to a cset
1379  == static void mcadd(struct parse *p, cset *cs, \
1380  ==     char *cp);
1381  */
1382 static void
1383 mcadd(
1384     struct parse *p,
1385     cset *cs,
1386     const char *cp)
1387 {
1388         size_t oldend;
1389
1390         assert(p != NULL);
1391         assert(cs != NULL);
1392         assert(cp != NULL);
1393
1394         oldend = cs->smultis;
1395
1396         cs->smultis += strlen(cp) + 1;
1397         if (cs->multis == NULL)
1398                 cs->multis = malloc(cs->smultis);
1399         else
1400                 cs->multis = realloc(cs->multis, cs->smultis);
1401         if (cs->multis == NULL) {
1402                 SETERROR(REG_ESPACE);
1403                 return;
1404         }
1405
1406         (void) strcpy(cs->multis + oldend - 1, cp);
1407         cs->multis[cs->smultis - 1] = '\0';
1408 }
1409
1410 #if 0
1411 /*
1412  - mcsub - subtract a collating element from a cset
1413  == static void mcsub(cset *cs, char *cp);
1414  */
1415 static void
1416 mcsub(
1417     cset *cs,
1418     char *cp)
1419 {
1420         char *fp;
1421         size_t len;
1422
1423         assert(cs != NULL);
1424         assert(cp != NULL);
1425
1426         fp = mcfind(cs, cp);
1427         len = strlen(fp);
1428
1429         assert(fp != NULL);
1430         (void) memmove(fp, fp + len + 1,
1431                                 cs->smultis - (fp + len + 1 - cs->multis));
1432         cs->smultis -= len;
1433
1434         if (cs->smultis == 0) {
1435                 free(cs->multis);
1436                 cs->multis = NULL;
1437                 return;
1438         }
1439
1440         cs->multis = realloc(cs->multis, cs->smultis);
1441         assert(cs->multis != NULL);
1442 }
1443
1444 /*
1445  - mcin - is a collating element in a cset?
1446  == static int mcin(cset *cs, char *cp);
1447  */
1448 static int
1449 mcin(
1450     cset *cs,
1451     char *cp)
1452 {
1453
1454         assert(cs != NULL);
1455         assert(cp != NULL);
1456
1457         return(mcfind(cs, cp) != NULL);
1458 }
1459
1460 /*
1461  - mcfind - find a collating element in a cset
1462  == static char *mcfind(cset *cs, char *cp);
1463  */
1464 static char *
1465 mcfind(
1466     cset *cs,
1467     char *cp)
1468 {
1469         char *p;
1470
1471         assert(cs != NULL);
1472         assert(cp != NULL);
1473
1474         if (cs->multis == NULL)
1475                 return(NULL);
1476         for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1477                 if (strcmp(cp, p) == 0)
1478                         return(p);
1479         return(NULL);
1480 }
1481 #endif
1482
1483 /*
1484  - mcinvert - invert the list of collating elements in a cset
1485  == static void mcinvert(struct parse *p, cset *cs);
1486  *
1487  * This would have to know the set of possibilities.  Implementation
1488  * is deferred.
1489  */
1490 /* ARGSUSED */
1491 static void
1492 mcinvert(
1493     struct parse *p,
1494     cset *cs)
1495 {
1496
1497         assert(p != NULL);
1498         assert(cs != NULL);
1499
1500         assert(cs->multis == NULL);     /* xxx */
1501 }
1502
1503 /*
1504  - mccase - add case counterparts of the list of collating elements in a cset
1505  == static void mccase(struct parse *p, cset *cs);
1506  *
1507  * This would have to know the set of possibilities.  Implementation
1508  * is deferred.
1509  */
1510 /* ARGSUSED */
1511 static void
1512 mccase(
1513     struct parse *p,
1514     cset *cs)
1515 {
1516
1517         assert(p != NULL);
1518         assert(cs != NULL);
1519
1520         assert(cs->multis == NULL);     /* xxx */
1521 }
1522
1523 /*
1524  - isinsets - is this character in any sets?
1525  == static int isinsets(struct re_guts *g, int c);
1526  */
1527 static int                      /* predicate */
1528 isinsets(
1529     struct re_guts *g,
1530     int c)
1531 {
1532         uch *col;
1533         int i;
1534         int ncols;
1535         unsigned uc = (unsigned char)c;
1536
1537         assert(g != NULL);
1538
1539         ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1540
1541         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1542                 if (col[uc] != 0)
1543                         return(1);
1544         return(0);
1545 }
1546
1547 /*
1548  - samesets - are these two characters in exactly the same sets?
1549  == static int samesets(struct re_guts *g, int c1, int c2);
1550  */
1551 static int                      /* predicate */
1552 samesets(
1553     struct re_guts *g,
1554     int c1,
1555     int c2)
1556 {
1557         uch *col;
1558         int i;
1559         int ncols;
1560         unsigned uc1 = (unsigned char)c1;
1561         unsigned uc2 = (unsigned char)c2;
1562
1563         assert(g != NULL);
1564
1565         ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1566
1567         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1568                 if (col[uc1] != col[uc2])
1569                         return(0);
1570         return(1);
1571 }
1572
1573 /*
1574  - categorize - sort out character categories
1575  == static void categorize(struct parse *p, struct re_guts *g);
1576  */
1577 static void
1578 categorize(
1579     struct parse *p,
1580     struct re_guts *g)
1581 {
1582         cat_t *cats;
1583         int c;
1584         int c2;
1585         cat_t cat;
1586
1587         assert(p != NULL);
1588         assert(g != NULL);
1589
1590         cats = g->categories;
1591
1592         /* avoid making error situations worse */
1593         if (p->error != 0)
1594                 return;
1595
1596         for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1597                 if (cats[c] == 0 && isinsets(g, c)) {
1598                         cat = g->ncategories++;
1599                         cats[c] = cat;
1600                         for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1601                                 if (cats[c2] == 0 && samesets(g, c, c2))
1602                                         cats[c2] = cat;
1603                 }
1604 }
1605
1606 /*
1607  - dupl - emit a duplicate of a bunch of sops
1608  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1609  */
1610 static sopno                    /* start of duplicate */
1611 dupl(
1612     struct parse *p,
1613     sopno start,                        /* from here */
1614     sopno finish)                       /* to this less one */
1615 {
1616         sopno ret;
1617         sopno len = finish - start;
1618
1619         assert(p != NULL);
1620
1621         ret = HERE();
1622
1623         assert(finish >= start);
1624         if (len == 0)
1625                 return(ret);
1626         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1627         assert(p->ssize >= p->slen + len);
1628         (void)memcpy(p->strip + p->slen, p->strip + start,
1629             (size_t)len * sizeof(sop));
1630         p->slen += len;
1631         return(ret);
1632 }
1633
1634 /*
1635  - doemit - emit a strip operator
1636  == static void doemit(struct parse *p, sop op, size_t opnd);
1637  *
1638  * It might seem better to implement this as a macro with a function as
1639  * hard-case backup, but it's just too big and messy unless there are
1640  * some changes to the data structures.  Maybe later.
1641  */
1642 static void
1643 doemit(
1644     struct parse *p,
1645     sop op,
1646     sopno opnd)
1647 {
1648
1649         assert(p != NULL);
1650
1651         /* avoid making error situations worse */
1652         if (p->error != 0)
1653                 return;
1654
1655         /* deal with oversize operands ("can't happen", more or less) */
1656         assert(opnd < 1<<OPSHIFT);
1657
1658         /* deal with undersized strip */
1659         if (p->slen >= p->ssize)
1660                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1661         assert(p->slen < p->ssize);
1662
1663         /* finally, it's all reduced to the easy case */
1664         p->strip[p->slen++] = SOP(op, opnd);
1665 }
1666
1667 /*
1668  - doinsert - insert a sop into the strip
1669  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1670  */
1671 static void
1672 doinsert(
1673     struct parse *p,
1674     sop op,
1675     sopno opnd,
1676     sopno pos)
1677 {
1678         sopno sn;
1679         sop s;
1680         int i;
1681
1682         assert(p != NULL);
1683
1684         /* avoid making error situations worse */
1685         if (p->error != 0)
1686                 return;
1687
1688         sn = HERE();
1689         EMIT(op, opnd);         /* do checks, ensure space */
1690         assert(HERE() == sn+1);
1691         s = p->strip[sn];
1692
1693         /* adjust paren pointers */
1694         assert(pos > 0);
1695         for (i = 1; i < NPAREN; i++) {
1696                 if (p->pbegin[i] >= pos) {
1697                         p->pbegin[i]++;
1698                 }
1699                 if (p->pend[i] >= pos) {
1700                         p->pend[i]++;
1701                 }
1702         }
1703
1704         memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop));
1705         p->strip[pos] = s;
1706 }
1707
1708 /*
1709  - dofwd - complete a forward reference
1710  == static void dofwd(struct parse *p, sopno pos, sop value);
1711  */
1712 static void
1713 dofwd(
1714     struct parse *p,
1715     sopno pos,
1716     sopno value)
1717 {
1718
1719         assert(p != NULL);
1720
1721         /* avoid making error situations worse */
1722         if (p->error != 0)
1723                 return;
1724
1725         assert(value < 1<<OPSHIFT);
1726         p->strip[pos] = OP(p->strip[pos]) | value;
1727 }
1728
1729 /*
1730  - enlarge - enlarge the strip
1731  == static void enlarge(struct parse *p, sopno size);
1732  */
1733 static void
1734 enlarge(
1735     struct parse *p,
1736     sopno size)
1737 {
1738         sop *sp;
1739
1740         assert(p != NULL);
1741
1742         if (p->ssize >= size)
1743                 return;
1744
1745         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1746         if (sp == NULL) {
1747                 SETERROR(REG_ESPACE);
1748                 return;
1749         }
1750         p->strip = sp;
1751         p->ssize = size;
1752 }
1753
1754 /*
1755  - stripsnug - compact the strip
1756  == static void stripsnug(struct parse *p, struct re_guts *g);
1757  */
1758 static void
1759 stripsnug(
1760     struct parse *p,
1761     struct re_guts *g)
1762 {
1763
1764         assert(p != NULL);
1765         assert(g != NULL);
1766
1767         g->nstates = p->slen;
1768         g->strip = realloc(p->strip, p->slen * sizeof(sop));
1769         if (g->strip == NULL) {
1770                 SETERROR(REG_ESPACE);
1771                 g->strip = p->strip;
1772         }
1773 }
1774
1775 /*
1776  - findmust - fill in must and mlen with longest mandatory literal string
1777  == static void findmust(struct parse *p, struct re_guts *g);
1778  *
1779  * This algorithm could do fancy things like analyzing the operands of |
1780  * for common subsequences.  Someday.  This code is simple and finds most
1781  * of the interesting cases.
1782  *
1783  * Note that must and mlen got initialized during setup.
1784  */
1785 static void
1786 findmust(
1787     struct parse *p,
1788     struct re_guts *g)
1789 {
1790         sop *scan;
1791         sop *start = NULL;
1792         sop *newstart = NULL;
1793         sopno newlen;
1794         sop s;
1795         char *cp;
1796         sopno i;
1797
1798         assert(p != NULL);
1799         assert(g != NULL);
1800
1801         /* avoid making error situations worse */
1802         if (p->error != 0)
1803                 return;
1804
1805         /* find the longest OCHAR sequence in strip */
1806         newlen = 0;
1807         scan = g->strip + 1;
1808         do {
1809                 s = *scan++;
1810                 switch (OP(s)) {
1811                 case OCHAR:             /* sequence member */
1812                         if (newlen == 0)                /* new sequence */
1813                                 newstart = scan - 1;
1814                         newlen++;
1815                         break;
1816                 case OPLUS_:            /* things that don't break one */
1817                 case OLPAREN:
1818                 case ORPAREN:
1819                         break;
1820                 case OQUEST_:           /* things that must be skipped */
1821                 case OCH_:
1822                         scan--;
1823                         do {
1824                                 scan += OPND(s);
1825                                 s = *scan;
1826                                 /* assert() interferes w debug printouts */
1827                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1828                                                         OP(s) != OOR2) {
1829                                         g->iflags |= BAD;
1830                                         return;
1831                                 }
1832                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1833                         /* FALLTHROUGH */
1834                 default:                /* things that break a sequence */
1835                         if (newlen > g->mlen) {         /* ends one */
1836                                 start = newstart;
1837                                 g->mlen = newlen;
1838                         }
1839                         newlen = 0;
1840                         break;
1841                 }
1842         } while (OP(s) != OEND);
1843
1844         if (start == NULL)
1845                 g->mlen = 0;
1846
1847         if (g->mlen == 0)       /* there isn't one */
1848                 return;
1849
1850         /* turn it into a character string */
1851         g->must = malloc((size_t)g->mlen + 1);
1852         if (g->must == NULL) {          /* argh; just forget it */
1853                 g->mlen = 0;
1854                 return;
1855         }
1856         cp = g->must;
1857         scan = start;
1858         for (i = g->mlen; i > 0; i--) {
1859                 while (OP(s = *scan++) != OCHAR)
1860                         continue;
1861                 assert(cp < g->must + g->mlen);
1862                 *cp++ = (char)OPND(s);
1863         }
1864         assert(cp == g->must + g->mlen);
1865         *cp++ = '\0';           /* just on general principles */
1866 }
1867
1868 /*
1869  - pluscount - count + nesting
1870  == static sopno pluscount(struct parse *p, struct re_guts *g);
1871  */
1872 static sopno                    /* nesting depth */
1873 pluscount(
1874     struct parse *p,
1875     struct re_guts *g)
1876 {
1877         sop *scan;
1878         sop s;
1879         sopno plusnest = 0;
1880         sopno maxnest = 0;
1881
1882         assert(p != NULL);
1883         assert(g != NULL);
1884
1885         if (p->error != 0)
1886                 return(0);      /* there may not be an OEND */
1887
1888         scan = g->strip + 1;
1889         do {
1890                 s = *scan++;
1891                 switch (OP(s)) {
1892                 case OPLUS_:
1893                         plusnest++;
1894                         break;
1895                 case O_PLUS:
1896                         if (plusnest > maxnest)
1897                                 maxnest = plusnest;
1898                         plusnest--;
1899                         break;
1900                 }
1901         } while (OP(s) != OEND);
1902         if (plusnest != 0)
1903                 g->iflags |= BAD;
1904         return(maxnest);
1905 }