adopted an updated path to a test script.
[AROS.git] / compiler / posixc / regex / regcomp.c
blobaba26005b380c87855a4f5b952a95a2eb6aa38e1
1 /* $NetBSD: regcomp.c,v 1.28 2007/02/09 23:44:18 junyoung Exp $ */
3 /*-
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
37 /*-
38 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
40 * This code is derived from software contributed to Berkeley by
41 * Henry Spencer.
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 * must display the following acknowledgement:
53 * This product includes software developed by the University of
54 * California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 * may be used to endorse or promote products derived from this software
57 * without specific prior written permission.
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
75 __RCSID("$NetBSD: regcomp.c,v 1.28 2007/02/09 23:44:18 junyoung Exp $");
78 #if defined(__AROS__)
79 #if !DEBUG
80 #define NDEBUG
81 #else
82 #define REDEBUG
83 #endif
84 #endif
86 #include <sys/types.h>
87 #include <inttypes.h>
89 #include <assert.h>
90 #include <ctype.h>
91 #include <limits.h>
92 #include <stdio.h>
93 #include <stdlib.h>
94 #include <string.h>
95 #include <regex.h>
97 #ifdef __weak_alias
98 __weak_alias(regcomp,_regcomp)
99 #endif
101 #include "utils.h"
102 #include "regex2.h"
104 #include "cclass.h"
105 #include "cname.h"
108 * parse structure, passed up and down to avoid global variables and
109 * other clumsinesses
111 struct parse {
112 const char *next; /* next character in RE */
113 const char *end; /* end of string (-> NUL normally) */
114 int error; /* has an error been seen? */
115 sop *strip; /* malloced strip */
116 sopno ssize; /* malloced strip size (allocated) */
117 sopno slen; /* malloced strip length (used) */
118 int ncsalloc; /* number of csets allocated */
119 struct re_guts *g;
120 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
121 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
122 sopno pend[NPAREN]; /* -> ) ([0] unused) */
125 /* ========= begin header generated by ./mkh ========= */
126 #ifdef __cplusplus
127 extern "C" {
128 #endif
130 /* === regcomp.c === */
131 static void p_ere(struct parse *p, int stop);
132 static void p_ere_exp(struct parse *p);
133 static void p_str(struct parse *p);
134 static void p_bre(struct parse *p, int end1, int end2);
135 static int p_simp_re(struct parse *p, int starordinary);
136 static int p_count(struct parse *p);
137 static void p_bracket(struct parse *p);
138 static void p_b_term(struct parse *p, cset *cs);
139 static void p_b_cclass(struct parse *p, cset *cs);
140 static void p_b_eclass(struct parse *p, cset *cs);
141 static char p_b_symbol(struct parse *p);
142 static char p_b_coll_elem(struct parse *p, int endc);
143 static int othercase(int ch);
144 static void bothcases(struct parse *p, int ch);
145 static void ordinary(struct parse *p, int ch);
146 static void nonnewline(struct parse *p);
147 static void repeat(struct parse *p, sopno start, int from, int to);
148 static int seterr(struct parse *p, int e);
149 static cset *allocset(struct parse *p);
150 static void freeset(struct parse *p, cset *cs);
151 static int freezeset(struct parse *p, cset *cs);
152 static int firstch(struct parse *p, cset *cs);
153 static int nch(struct parse *p, cset *cs);
154 static void mcadd(struct parse *p, cset *cs, const char *cp);
155 #if 0
156 static void mcsub(cset *cs, char *cp);
157 static int mcin(cset *cs, char *cp);
158 static char *mcfind(cset *cs, char *cp);
159 #endif
160 static void mcinvert(struct parse *p, cset *cs);
161 static void mccase(struct parse *p, cset *cs);
162 static int isinsets(struct re_guts *g, int c);
163 static int samesets(struct re_guts *g, int c1, int c2);
164 static void categorize(struct parse *p, struct re_guts *g);
165 static sopno dupl(struct parse *p, sopno start, sopno finish);
166 static void doemit(struct parse *p, sop op, sopno opnd);
167 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
168 static void dofwd(struct parse *p, sopno pos, sopno value);
169 static void enlarge(struct parse *p, sopno size);
170 static void stripsnug(struct parse *p, struct re_guts *g);
171 static void findmust(struct parse *p, struct re_guts *g);
172 static sopno pluscount(struct parse *p, struct re_guts *g);
174 #ifdef __cplusplus
176 #endif
177 /* ========= end header generated by ./mkh ========= */
179 static char nuls[10]; /* place to point scanner in event of error */
182 * macros for use with parse structure
183 * BEWARE: these know that the parse structure is named `p' !!!
185 #define PEEK() (*p->next)
186 #define PEEK2() (*(p->next+1))
187 #define MORE() (p->next < p->end)
188 #define MORE2() (p->next+1 < p->end)
189 #define SEE(c) (MORE() && PEEK() == (c))
190 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
191 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
192 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
193 #define NEXT() (p->next++)
194 #define NEXT2() (p->next += 2)
195 #define NEXTn(n) (p->next += (n))
196 #define GETNEXT() (*p->next++)
197 #define SETERROR(e) seterr(p, (e))
198 #define REQUIRE(co, e) (void) ((co) || SETERROR(e))
199 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
200 #define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e))
201 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
202 #define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd)
203 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
204 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
205 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
206 #define HERE() (p->slen)
207 #define THERE() (p->slen - 1)
208 #define THERETHERE() (p->slen - 2)
209 #define DROP(n) (p->slen -= (n))
211 #ifndef NDEBUG
212 static int never = 0; /* for use in asserts; shuts lint up */
213 #else
214 #define never 0 /* some <assert.h>s have bugs too */
215 #endif
218 - regcomp - interface for parser and compilation
219 = extern int regcomp(regex_t *, const char *, int);
220 = #define REG_BASIC 0000
221 = #define REG_EXTENDED 0001
222 = #define REG_ICASE 0002
223 = #define REG_NOSUB 0004
224 = #define REG_NEWLINE 0010
225 = #define REG_NOSPEC 0020
226 = #define REG_PEND 0040
227 = #define REG_DUMP 0200
229 int /* 0 success, otherwise REG_something */
230 regcomp(
231 regex_t *preg,
232 const char *pattern,
233 int cflags)
235 struct parse pa;
236 struct re_guts *g;
237 struct parse *p = &pa;
238 int i;
239 size_t len;
240 #ifdef REDEBUG
241 # define GOODFLAGS(f) (f)
242 #else
243 # define GOODFLAGS(f) ((f)&~REG_DUMP)
244 #endif
246 assert(preg != NULL);
247 assert(pattern != NULL);
249 cflags = GOODFLAGS(cflags);
250 if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
251 return(REG_INVARG);
253 if (cflags&REG_PEND) {
254 if (preg->re_endp < pattern)
255 return(REG_INVARG);
256 len = preg->re_endp - pattern;
257 } else
258 len = strlen(pattern);
260 /* do the mallocs early so failure handling is easy */
261 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
262 (NC-1)*sizeof(cat_t));
263 if (g == NULL)
264 return(REG_ESPACE);
265 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
266 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
267 p->slen = 0;
268 if (p->strip == NULL) {
269 free(g);
270 return(REG_ESPACE);
273 /* set things up */
274 p->g = g;
275 p->next = pattern;
276 p->end = p->next + len;
277 p->error = 0;
278 p->ncsalloc = 0;
279 for (i = 0; i < NPAREN; i++) {
280 p->pbegin[i] = 0;
281 p->pend[i] = 0;
283 g->csetsize = NC;
284 g->sets = NULL;
285 g->setbits = NULL;
286 g->ncsets = 0;
287 g->cflags = cflags;
288 g->iflags = 0;
289 g->nbol = 0;
290 g->neol = 0;
291 g->must = NULL;
292 g->mlen = 0;
293 g->nsub = 0;
294 g->ncategories = 1; /* category 0 is "everything else" */
295 g->categories = &g->catspace[-(CHAR_MIN)];
296 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
297 g->backrefs = 0;
299 /* do it */
300 EMIT(OEND, 0);
301 g->firststate = THERE();
302 if (cflags&REG_EXTENDED)
303 p_ere(p, OUT);
304 else if (cflags&REG_NOSPEC)
305 p_str(p);
306 else
307 p_bre(p, OUT, OUT);
308 EMIT(OEND, 0);
309 g->laststate = THERE();
311 /* tidy up loose ends and fill things in */
312 categorize(p, g);
313 stripsnug(p, g);
314 findmust(p, g);
315 g->nplus = pluscount(p, g);
316 g->magic = MAGIC2;
317 preg->re_nsub = g->nsub;
318 preg->re_g = g;
319 preg->re_magic = MAGIC1;
320 #ifndef REDEBUG
321 /* not debugging, so can't rely on the assert() in regexec() */
322 if (g->iflags&BAD)
323 SETERROR(REG_ASSERT);
324 #endif
326 /* win or lose, we're done */
327 if (p->error != 0) /* lose */
328 regfree(preg);
329 return(p->error);
333 - p_ere - ERE parser top level, concatenation and alternation
334 == static void p_ere(struct parse *p, int stop);
336 static void
337 p_ere(
338 struct parse *p,
339 int stop) /* character this ERE should end at */
341 char c;
342 sopno prevback = 0; /* pacify gcc */
343 sopno prevfwd = 0; /* pacify gcc */
344 sopno conc;
345 int first = 1; /* is this the first alternative? */
347 assert(p != NULL);
349 for (;;) {
350 /* do a bunch of concatenated expressions */
351 conc = HERE();
352 while (MORE() && (c = PEEK()) != '|' && c != stop)
353 p_ere_exp(p);
354 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
356 if (!EAT('|'))
357 break; /* NOTE BREAK OUT */
359 if (first) {
360 INSERT(OCH_, conc); /* offset is wrong */
361 prevfwd = conc;
362 prevback = conc;
363 first = 0;
365 ASTERN(OOR1, prevback);
366 prevback = THERE();
367 AHEAD(prevfwd); /* fix previous offset */
368 prevfwd = HERE();
369 EMIT(OOR2, 0); /* offset is very wrong */
372 if (!first) { /* tail-end fixups */
373 AHEAD(prevfwd);
374 ASTERN(O_CH, prevback);
377 assert(!MORE() || SEE(stop));
381 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
382 == static void p_ere_exp(struct parse *p);
384 static void
385 p_ere_exp(
386 struct parse *p)
388 char c;
389 sopno pos;
390 int count;
391 int count2;
392 sopno subno;
393 int wascaret = 0;
395 assert(p != NULL);
397 assert(MORE()); /* caller should have ensured this */
398 c = GETNEXT();
400 pos = HERE();
401 switch (c) {
402 case '(':
403 REQUIRE(MORE(), REG_EPAREN);
404 p->g->nsub++;
405 subno = p->g->nsub;
406 if (subno < NPAREN)
407 p->pbegin[subno] = HERE();
408 EMIT(OLPAREN, subno);
409 if (!SEE(')'))
410 p_ere(p, ')');
411 if (subno < NPAREN) {
412 p->pend[subno] = HERE();
413 assert(p->pend[subno] != 0);
415 EMIT(ORPAREN, subno);
416 MUSTEAT(')', REG_EPAREN);
417 break;
418 #ifndef POSIX_MISTAKE
419 case ')': /* happens only if no current unmatched ( */
421 * You may ask, why the ifndef? Because I didn't notice
422 * this until slightly too late for 1003.2, and none of the
423 * other 1003.2 regular-expression reviewers noticed it at
424 * all. So an unmatched ) is legal POSIX, at least until
425 * we can get it fixed.
427 SETERROR(REG_EPAREN);
428 break;
429 #endif
430 case '^':
431 EMIT(OBOL, 0);
432 p->g->iflags |= USEBOL;
433 p->g->nbol++;
434 wascaret = 1;
435 break;
436 case '$':
437 EMIT(OEOL, 0);
438 p->g->iflags |= USEEOL;
439 p->g->neol++;
440 break;
441 case '|':
442 SETERROR(REG_EMPTY);
443 break;
444 case '*':
445 case '+':
446 case '?':
447 SETERROR(REG_BADRPT);
448 break;
449 case '.':
450 if (p->g->cflags&REG_NEWLINE)
451 nonnewline(p);
452 else
453 EMIT(OANY, 0);
454 break;
455 case '[':
456 p_bracket(p);
457 break;
458 case '\\':
459 REQUIRE(MORE(), REG_EESCAPE);
460 c = GETNEXT();
461 ordinary(p, c);
462 break;
463 case '{': /* okay as ordinary except if digit follows */
464 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT);
465 /* FALLTHROUGH */
466 default:
467 ordinary(p, c);
468 break;
471 if (!MORE())
472 return;
473 c = PEEK();
474 /* we call { a repetition if followed by a digit */
475 if (!( c == '*' || c == '+' || c == '?' ||
476 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ))
477 return; /* no repetition, we're done */
478 NEXT();
480 REQUIRE(!wascaret, REG_BADRPT);
481 switch (c) {
482 case '*': /* implemented as +? */
483 /* this case does not require the (y|) trick, noKLUDGE */
484 INSERT(OPLUS_, pos);
485 ASTERN(O_PLUS, pos);
486 INSERT(OQUEST_, pos);
487 ASTERN(O_QUEST, pos);
488 break;
489 case '+':
490 INSERT(OPLUS_, pos);
491 ASTERN(O_PLUS, pos);
492 break;
493 case '?':
494 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
495 INSERT(OCH_, pos); /* offset slightly wrong */
496 ASTERN(OOR1, pos); /* this one's right */
497 AHEAD(pos); /* fix the OCH_ */
498 EMIT(OOR2, 0); /* offset very wrong... */
499 AHEAD(THERE()); /* ...so fix it */
500 ASTERN(O_CH, THERETHERE());
501 break;
502 case '{':
503 count = p_count(p);
504 if (EAT(',')) {
505 if (isdigit((unsigned char)PEEK())) {
506 count2 = p_count(p);
507 REQUIRE(count <= count2, REG_BADBR);
508 } else /* single number with comma */
509 count2 = INFINITY;
510 } else /* just a single number */
511 count2 = count;
512 repeat(p, pos, count, count2);
513 if (!EAT('}')) { /* error heuristics */
514 while (MORE() && PEEK() != '}')
515 NEXT();
516 REQUIRE(MORE(), REG_EBRACE);
517 SETERROR(REG_BADBR);
519 break;
522 if (!MORE())
523 return;
524 c = PEEK();
525 if (!( c == '*' || c == '+' || c == '?' ||
526 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) )
527 return;
528 SETERROR(REG_BADRPT);
532 - p_str - string (no metacharacters) "parser"
533 == static void p_str(struct parse *p);
535 static void
536 p_str(
537 struct parse *p)
540 assert(p != NULL);
542 REQUIRE(MORE(), REG_EMPTY);
543 while (MORE())
544 ordinary(p, GETNEXT());
548 - p_bre - BRE parser top level, anchoring and concatenation
549 == static void p_bre(struct parse *p, int end1, \
550 == int end2);
551 * Giving end1 as OUT essentially eliminates the end1/end2 check.
553 * This implementation is a bit of a kludge, in that a trailing $ is first
554 * taken as an ordinary character and then revised to be an anchor. The
555 * only undesirable side effect is that '$' gets included as a character
556 * category in such cases. This is fairly harmless; not worth fixing.
557 * The amount of lookahead needed to avoid this kludge is excessive.
559 static void
560 p_bre(
561 struct parse *p,
562 int end1, /* first terminating character */
563 int end2) /* second terminating character */
565 sopno start;
566 int first = 1; /* first subexpression? */
567 int wasdollar = 0;
569 assert(p != NULL);
571 start = HERE();
573 if (EAT('^')) {
574 EMIT(OBOL, 0);
575 p->g->iflags |= USEBOL;
576 p->g->nbol++;
578 while (MORE() && !SEETWO(end1, end2)) {
579 wasdollar = p_simp_re(p, first);
580 first = 0;
582 if (wasdollar) { /* oops, that was a trailing anchor */
583 DROP(1);
584 EMIT(OEOL, 0);
585 p->g->iflags |= USEEOL;
586 p->g->neol++;
589 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
593 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
594 == static int p_simp_re(struct parse *p, int starordinary);
596 static int /* was the simple RE an unbackslashed $? */
597 p_simp_re(
598 struct parse *p,
599 int starordinary) /* is a leading * an ordinary character? */
601 int c;
602 int count;
603 int count2;
604 sopno pos;
605 int i;
606 sopno subno;
607 # define BACKSL (1<<CHAR_BIT)
609 assert(p != NULL);
611 pos = HERE(); /* repetion op, if any, covers from here */
613 assert(MORE()); /* caller should have ensured this */
614 c = GETNEXT();
615 if (c == '\\') {
616 REQUIRE(MORE(), REG_EESCAPE);
617 c = BACKSL | (unsigned char)GETNEXT();
619 switch (c) {
620 case '.':
621 if (p->g->cflags&REG_NEWLINE)
622 nonnewline(p);
623 else
624 EMIT(OANY, 0);
625 break;
626 case '[':
627 p_bracket(p);
628 break;
629 case BACKSL|'{':
630 SETERROR(REG_BADRPT);
631 break;
632 case BACKSL|'(':
633 p->g->nsub++;
634 subno = p->g->nsub;
635 if (subno < NPAREN)
636 p->pbegin[subno] = HERE();
637 EMIT(OLPAREN, subno);
638 /* the MORE here is an error heuristic */
639 if (MORE() && !SEETWO('\\', ')'))
640 p_bre(p, '\\', ')');
641 if (subno < NPAREN) {
642 p->pend[subno] = HERE();
643 assert(p->pend[subno] != 0);
645 EMIT(ORPAREN, subno);
646 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
647 break;
648 case BACKSL|')': /* should not get here -- must be user */
649 case BACKSL|'}':
650 SETERROR(REG_EPAREN);
651 break;
652 case BACKSL|'1':
653 case BACKSL|'2':
654 case BACKSL|'3':
655 case BACKSL|'4':
656 case BACKSL|'5':
657 case BACKSL|'6':
658 case BACKSL|'7':
659 case BACKSL|'8':
660 case BACKSL|'9':
661 i = (c&~BACKSL) - '0';
662 assert(i < NPAREN);
663 if (p->pend[i] != 0) {
664 assert(i <= p->g->nsub);
665 EMIT(OBACK_, i);
666 assert(p->pbegin[i] != 0);
667 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
668 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
669 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
670 EMIT(O_BACK, i);
671 } else
672 SETERROR(REG_ESUBREG);
673 p->g->backrefs = 1;
674 break;
675 case '*':
676 REQUIRE(starordinary, REG_BADRPT);
677 /* FALLTHROUGH */
678 default:
679 ordinary(p, c &~ BACKSL);
680 break;
683 if (EAT('*')) { /* implemented as +? */
684 /* this case does not require the (y|) trick, noKLUDGE */
685 INSERT(OPLUS_, pos);
686 ASTERN(O_PLUS, pos);
687 INSERT(OQUEST_, pos);
688 ASTERN(O_QUEST, pos);
689 } else if (EATTWO('\\', '{')) {
690 count = p_count(p);
691 if (EAT(',')) {
692 if (MORE() && isdigit((unsigned char)PEEK())) {
693 count2 = p_count(p);
694 REQUIRE(count <= count2, REG_BADBR);
695 } else /* single number with comma */
696 count2 = INFINITY;
697 } else /* just a single number */
698 count2 = count;
699 repeat(p, pos, count, count2);
700 if (!EATTWO('\\', '}')) { /* error heuristics */
701 while (MORE() && !SEETWO('\\', '}'))
702 NEXT();
703 REQUIRE(MORE(), REG_EBRACE);
704 SETERROR(REG_BADBR);
706 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
707 return(1);
709 return(0);
713 - p_count - parse a repetition count
714 == static int p_count(struct parse *p);
716 static int /* the value */
717 p_count(
718 struct parse *p)
720 int count = 0;
721 int ndigits = 0;
723 assert(p != NULL);
725 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) {
726 count = count*10 + (GETNEXT() - '0');
727 ndigits++;
730 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
731 return(count);
735 - p_bracket - parse a bracketed character list
736 == static void p_bracket(struct parse *p);
738 * Note a significant property of this code: if the allocset() did SETERROR,
739 * no set operations are done.
741 static void
742 p_bracket(
743 struct parse *p)
745 cset *cs;
746 int invert = 0;
748 assert(p != NULL);
750 cs = allocset(p);
752 /* Dept of Truly Sickening Special-Case Kludges */
753 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]",
754 (size_t)6) == 0) {
755 EMIT(OBOW, 0);
756 NEXTn(6);
757 return;
759 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]",
760 (size_t)6) == 0) {
761 EMIT(OEOW, 0);
762 NEXTn(6);
763 return;
766 if (EAT('^'))
767 invert++; /* make note to invert set at end */
768 if (EAT(']'))
769 CHadd(cs, ']');
770 else if (EAT('-'))
771 CHadd(cs, '-');
772 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
773 p_b_term(p, cs);
774 if (EAT('-'))
775 CHadd(cs, '-');
776 MUSTEAT(']', REG_EBRACK);
778 if (p->error != 0) /* don't mess things up further */
779 return;
781 if (p->g->cflags&REG_ICASE) {
782 int i;
783 int ci;
785 for (i = p->g->csetsize - 1; i >= 0; i--)
786 if (CHIN(cs, i) && isalpha(i)) {
787 ci = othercase(i);
788 if (ci != i)
789 CHadd(cs, ci);
791 if (cs->multis != NULL)
792 mccase(p, cs);
794 if (invert) {
795 int i;
797 for (i = p->g->csetsize - 1; i >= 0; i--)
798 if (CHIN(cs, i))
799 CHsub(cs, i);
800 else
801 CHadd(cs, i);
802 if (p->g->cflags&REG_NEWLINE)
803 CHsub(cs, '\n');
804 if (cs->multis != NULL)
805 mcinvert(p, cs);
808 assert(cs->multis == NULL); /* xxx */
810 if (nch(p, cs) == 1) { /* optimize singleton sets */
811 ordinary(p, firstch(p, cs));
812 freeset(p, cs);
813 } else
814 EMIT(OANYOF, freezeset(p, cs));
818 - p_b_term - parse one term of a bracketed character list
819 == static void p_b_term(struct parse *p, cset *cs);
821 static void
822 p_b_term(
823 struct parse *p,
824 cset *cs)
826 char c;
827 char start, finish;
828 int i;
830 assert(p != NULL);
831 assert(cs != NULL);
833 /* classify what we've got */
834 switch ((MORE()) ? PEEK() : '\0') {
835 case '[':
836 c = (MORE2()) ? PEEK2() : '\0';
837 break;
839 case '-':
840 SETERROR(REG_ERANGE);
841 return; /* NOTE RETURN */
843 default:
844 c = '\0';
845 break;
848 switch (c) {
849 case ':': /* character class */
850 NEXT2();
851 REQUIRE(MORE(), REG_EBRACK);
852 c = PEEK();
853 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
854 p_b_cclass(p, cs);
855 REQUIRE(MORE(), REG_EBRACK);
856 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
857 break;
858 case '=': /* equivalence class */
859 NEXT2();
860 REQUIRE(MORE(), REG_EBRACK);
861 c = PEEK();
862 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
863 p_b_eclass(p, cs);
864 REQUIRE(MORE(), REG_EBRACK);
865 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
866 break;
867 default: /* symbol, ordinary character, or range */
868 /* xxx revision needed for multichar stuff */
869 start = p_b_symbol(p);
870 if (SEE('-') && MORE2() && PEEK2() != ']') {
871 /* range */
872 NEXT();
873 if (EAT('-'))
874 finish = '-';
875 else
876 finish = p_b_symbol(p);
877 } else
878 finish = start;
879 /* xxx what about signed chars here... */
880 REQUIRE(start <= finish, REG_ERANGE);
881 for (i = start; i <= finish; i++)
882 CHadd(cs, i);
883 break;
888 - p_b_cclass - parse a character-class name and deal with it
889 == static void p_b_cclass(struct parse *p, cset *cs);
891 static void
892 p_b_cclass(
893 struct parse *p,
894 cset *cs)
896 const char *sp;
897 const struct cclass *cp;
898 size_t len;
899 const char *u;
900 char c;
902 assert(p != NULL);
903 assert(cs != NULL);
905 sp = p->next;
907 while (MORE() && isalpha((unsigned char)PEEK()))
908 NEXT();
909 len = p->next - sp;
910 for (cp = cclasses; cp->name != NULL; cp++)
911 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
912 break;
913 if (cp->name == NULL) {
914 /* oops, didn't find it */
915 SETERROR(REG_ECTYPE);
916 return;
919 u = cp->chars;
920 while ((c = *u++) != '\0')
921 CHadd(cs, c);
922 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
923 MCadd(p, cs, u);
927 - p_b_eclass - parse an equivalence-class name and deal with it
928 == static void p_b_eclass(struct parse *p, cset *cs);
930 * This implementation is incomplete. xxx
932 static void
933 p_b_eclass(
934 struct parse *p,
935 cset *cs)
937 char c;
939 assert(p != NULL);
940 assert(cs != NULL);
942 c = p_b_coll_elem(p, '=');
943 CHadd(cs, c);
947 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
948 == static char p_b_symbol(struct parse *p);
950 static char /* value of symbol */
951 p_b_symbol(
952 struct parse *p)
954 char value;
956 assert(p != NULL);
958 REQUIRE(MORE(), REG_EBRACK);
959 if (!EATTWO('[', '.'))
960 return(GETNEXT());
962 /* collating symbol */
963 value = p_b_coll_elem(p, '.');
964 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
965 return(value);
969 - p_b_coll_elem - parse a collating-element name and look it up
970 == static char p_b_coll_elem(struct parse *p, int endc);
972 static char /* value of collating element */
973 p_b_coll_elem(
974 struct parse *p,
975 int endc) /* name ended by endc,']' */
977 const char *sp;
978 const struct cname *cp;
979 size_t len;
981 assert(p != NULL);
983 sp = p->next;
985 while (MORE() && !SEETWO(endc, ']'))
986 NEXT();
987 if (!MORE()) {
988 SETERROR(REG_EBRACK);
989 return(0);
991 len = p->next - sp;
992 for (cp = cnames; cp->name != NULL; cp++)
993 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
994 return(cp->code); /* known name */
995 if (len == 1)
996 return(*sp); /* single character */
997 SETERROR(REG_ECOLLATE); /* neither */
998 return(0);
1002 - othercase - return the case counterpart of an alphabetic
1003 == static int othercase(int ch);
1005 static int /* if no counterpart, return ch */
1006 othercase(
1007 int ch)
1009 assert(isalpha(ch));
1010 if (isupper(ch))
1011 return(tolower(ch));
1012 else if (islower(ch))
1013 return(toupper(ch));
1014 else /* peculiar, but could happen */
1015 return(ch);
1019 - bothcases - emit a dualcase version of a two-case character
1020 == static void bothcases(struct parse *p, int ch);
1022 * Boy, is this implementation ever a kludge...
1024 static void
1025 bothcases(
1026 struct parse *p,
1027 int ch)
1029 const char *oldnext;
1030 const char *oldend;
1031 char bracket[3];
1033 assert(p != NULL);
1035 oldnext = p->next;
1036 oldend = p->end;
1038 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1039 p->next = bracket;
1040 p->end = bracket+2;
1041 bracket[0] = ch;
1042 bracket[1] = ']';
1043 bracket[2] = '\0';
1044 p_bracket(p);
1045 assert(p->next == bracket+2);
1046 p->next = oldnext;
1047 p->end = oldend;
1051 - ordinary - emit an ordinary character
1052 == static void ordinary(struct parse *p, int ch);
1054 static void
1055 ordinary(
1056 struct parse *p,
1057 int ch)
1059 cat_t *cap;
1061 assert(p != NULL);
1063 cap = p->g->categories;
1064 if ((p->g->cflags&REG_ICASE) && isalpha((unsigned char) ch)
1065 && othercase((unsigned char) ch) != (unsigned char) ch)
1066 bothcases(p, (unsigned char) ch);
1067 else {
1068 EMIT(OCHAR, (unsigned char)ch);
1069 if (cap[ch] == 0)
1070 cap[ch] = p->g->ncategories++;
1075 - nonnewline - emit REG_NEWLINE version of OANY
1076 == static void nonnewline(struct parse *p);
1078 * Boy, is this implementation ever a kludge...
1080 static void
1081 nonnewline(
1082 struct parse *p)
1084 const char *oldnext;
1085 const char *oldend;
1086 char bracket[4];
1088 assert(p != NULL);
1090 oldnext = p->next;
1091 oldend = p->end;
1093 p->next = bracket;
1094 p->end = bracket+3;
1095 bracket[0] = '^';
1096 bracket[1] = '\n';
1097 bracket[2] = ']';
1098 bracket[3] = '\0';
1099 p_bracket(p);
1100 assert(p->next == bracket+3);
1101 p->next = oldnext;
1102 p->end = oldend;
1106 - repeat - generate code for a bounded repetition, recursively if needed
1107 == static void repeat(struct parse *p, sopno start, int from, int to);
1109 static void
1110 repeat(
1111 struct parse *p,
1112 sopno start, /* operand from here to end of strip */
1113 int from, /* repeated from this number */
1114 int to) /* to this number of times (maybe INFINITY) */
1116 sopno finish;
1117 # define N 2
1118 # define INF 3
1119 # define REP(f, t) ((f)*8 + (t))
1120 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1121 sopno copy;
1123 assert(p != NULL);
1125 finish = HERE();
1127 if (p->error != 0) /* head off possible runaway recursion */
1128 return;
1130 assert(from <= to);
1132 switch (REP(MAP(from), MAP(to))) {
1133 case REP(0, 0): /* must be user doing this */
1134 DROP(finish-start); /* drop the operand */
1135 break;
1136 case REP(0, 1): /* as x{1,1}? */
1137 case REP(0, N): /* as x{1,n}? */
1138 case REP(0, INF): /* as x{1,}? */
1139 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1140 INSERT(OCH_, start); /* offset is wrong... */
1141 repeat(p, start+1, 1, to);
1142 ASTERN(OOR1, start);
1143 AHEAD(start); /* ... fix it */
1144 EMIT(OOR2, 0);
1145 AHEAD(THERE());
1146 ASTERN(O_CH, THERETHERE());
1147 break;
1148 case REP(1, 1): /* trivial case */
1149 /* done */
1150 break;
1151 case REP(1, N): /* as x?x{1,n-1} */
1152 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1153 INSERT(OCH_, start);
1154 ASTERN(OOR1, start);
1155 AHEAD(start);
1156 EMIT(OOR2, 0); /* offset very wrong... */
1157 AHEAD(THERE()); /* ...so fix it */
1158 ASTERN(O_CH, THERETHERE());
1159 copy = dupl(p, start+1, finish+1);
1160 assert(copy == finish+4);
1161 repeat(p, copy, 1, to-1);
1162 break;
1163 case REP(1, INF): /* as x+ */
1164 INSERT(OPLUS_, start);
1165 ASTERN(O_PLUS, start);
1166 break;
1167 case REP(N, N): /* as xx{m-1,n-1} */
1168 copy = dupl(p, start, finish);
1169 repeat(p, copy, from-1, to-1);
1170 break;
1171 case REP(N, INF): /* as xx{n-1,INF} */
1172 copy = dupl(p, start, finish);
1173 repeat(p, copy, from-1, to);
1174 break;
1175 default: /* "can't happen" */
1176 SETERROR(REG_ASSERT); /* just in case */
1177 break;
1182 - seterr - set an error condition
1183 == static int seterr(struct parse *p, int e);
1185 static int /* useless but makes type checking happy */
1186 seterr(
1187 struct parse *p,
1188 int e)
1191 assert(p != NULL);
1193 if (p->error == 0) /* keep earliest error condition */
1194 p->error = e;
1195 p->next = nuls; /* try to bring things to a halt */
1196 p->end = nuls;
1197 return(0); /* make the return value well-defined */
1201 - allocset - allocate a set of characters for []
1202 == static cset *allocset(struct parse *p);
1204 static cset *
1205 allocset(
1206 struct parse *p)
1208 int no;
1209 size_t nc;
1210 size_t nbytes;
1211 cset *cs;
1212 size_t css;
1213 int i;
1215 assert(p != NULL);
1217 no = p->g->ncsets++;
1218 css = (size_t)p->g->csetsize;
1219 if (no >= p->ncsalloc) { /* need another column of space */
1220 p->ncsalloc += CHAR_BIT;
1221 nc = p->ncsalloc;
1222 assert(nc % CHAR_BIT == 0);
1223 nbytes = nc / CHAR_BIT * css;
1224 if (p->g->sets == NULL)
1225 p->g->sets = malloc(nc * sizeof(cset));
1226 else
1227 p->g->sets = realloc(p->g->sets, nc * sizeof(cset));
1228 if (p->g->setbits == NULL)
1229 p->g->setbits = malloc(nbytes);
1230 else {
1231 p->g->setbits = realloc(p->g->setbits, nbytes);
1232 /* xxx this isn't right if setbits is now NULL */
1233 for (i = 0; i < no; i++)
1234 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1236 if (p->g->sets != NULL && p->g->setbits != NULL)
1237 (void) memset((char *)p->g->setbits + (nbytes - css),
1238 0, css);
1239 else {
1240 no = 0;
1241 SETERROR(REG_ESPACE);
1242 /* caller's responsibility not to do set ops */
1246 assert(p->g->sets != NULL); /* xxx */
1247 cs = &p->g->sets[no];
1248 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1249 cs->mask = 1 << ((no) % CHAR_BIT);
1250 cs->hash = 0;
1251 cs->smultis = 0;
1252 cs->multis = NULL;
1254 return(cs);
1258 - freeset - free a now-unused set
1259 == static void freeset(struct parse *p, cset *cs);
1261 static void
1262 freeset(
1263 struct parse *p,
1264 cset *cs)
1266 int i;
1267 cset *top;
1268 size_t css;
1270 assert(p != NULL);
1271 assert(cs != NULL);
1273 top = &p->g->sets[p->g->ncsets];
1274 css = (size_t)p->g->csetsize;
1276 for (i = 0; i < css; i++)
1277 CHsub(cs, i);
1278 if (cs == top-1) /* recover only the easy case */
1279 p->g->ncsets--;
1283 - freezeset - final processing on a set of characters
1284 == static int freezeset(struct parse *p, cset *cs);
1286 * The main task here is merging identical sets. This is usually a waste
1287 * of time (although the hash code minimizes the overhead), but can win
1288 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1289 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1290 * the same value!
1292 static int /* set number */
1293 freezeset(
1294 struct parse *p,
1295 cset *cs)
1297 uch h;
1298 int i;
1299 cset *top;
1300 cset *cs2;
1301 size_t css;
1303 assert(p != NULL);
1304 assert(cs != NULL);
1306 h = cs->hash;
1307 top = &p->g->sets[p->g->ncsets];
1308 css = (size_t)p->g->csetsize;
1310 /* look for an earlier one which is the same */
1311 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1312 if (cs2->hash == h && cs2 != cs) {
1313 /* maybe */
1314 for (i = 0; i < css; i++)
1315 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1316 break; /* no */
1317 if (i == css)
1318 break; /* yes */
1321 if (cs2 < top) { /* found one */
1322 freeset(p, cs);
1323 cs = cs2;
1326 return((int)(cs - p->g->sets));
1330 - firstch - return first character in a set (which must have at least one)
1331 == static int firstch(struct parse *p, cset *cs);
1333 static int /* character; there is no "none" value */
1334 firstch(
1335 struct parse *p,
1336 cset *cs)
1338 int i;
1339 size_t css;
1341 assert(p != NULL);
1342 assert(cs != NULL);
1344 css = (size_t)p->g->csetsize;
1346 for (i = 0; i < css; i++)
1347 if (CHIN(cs, i))
1348 return((char)i);
1349 assert(never);
1350 return(0); /* arbitrary */
1354 - nch - number of characters in a set
1355 == static int nch(struct parse *p, cset *cs);
1357 static int
1358 nch(
1359 struct parse *p,
1360 cset *cs)
1362 int i;
1363 size_t css;
1364 int n = 0;
1366 assert(p != NULL);
1367 assert(cs != NULL);
1369 css = (size_t)p->g->csetsize;
1371 for (i = 0; i < css; i++)
1372 if (CHIN(cs, i))
1373 n++;
1374 return(n);
1378 - mcadd - add a collating element to a cset
1379 == static void mcadd(struct parse *p, cset *cs, \
1380 == char *cp);
1382 static void
1383 mcadd(
1384 struct parse *p,
1385 cset *cs,
1386 const char *cp)
1388 size_t oldend;
1390 assert(p != NULL);
1391 assert(cs != NULL);
1392 assert(cp != NULL);
1394 oldend = cs->smultis;
1396 cs->smultis += strlen(cp) + 1;
1397 if (cs->multis == NULL)
1398 cs->multis = malloc(cs->smultis);
1399 else
1400 cs->multis = realloc(cs->multis, cs->smultis);
1401 if (cs->multis == NULL) {
1402 SETERROR(REG_ESPACE);
1403 return;
1406 (void) strcpy(cs->multis + oldend - 1, cp);
1407 cs->multis[cs->smultis - 1] = '\0';
1410 #if 0
1412 - mcsub - subtract a collating element from a cset
1413 == static void mcsub(cset *cs, char *cp);
1415 static void
1416 mcsub(
1417 cset *cs,
1418 char *cp)
1420 char *fp;
1421 size_t len;
1423 assert(cs != NULL);
1424 assert(cp != NULL);
1426 fp = mcfind(cs, cp);
1427 len = strlen(fp);
1429 assert(fp != NULL);
1430 (void) memmove(fp, fp + len + 1,
1431 cs->smultis - (fp + len + 1 - cs->multis));
1432 cs->smultis -= len;
1434 if (cs->smultis == 0) {
1435 free(cs->multis);
1436 cs->multis = NULL;
1437 return;
1440 cs->multis = realloc(cs->multis, cs->smultis);
1441 assert(cs->multis != NULL);
1445 - mcin - is a collating element in a cset?
1446 == static int mcin(cset *cs, char *cp);
1448 static int
1449 mcin(
1450 cset *cs,
1451 char *cp)
1454 assert(cs != NULL);
1455 assert(cp != NULL);
1457 return(mcfind(cs, cp) != NULL);
1461 - mcfind - find a collating element in a cset
1462 == static char *mcfind(cset *cs, char *cp);
1464 static char *
1465 mcfind(
1466 cset *cs,
1467 char *cp)
1469 char *p;
1471 assert(cs != NULL);
1472 assert(cp != NULL);
1474 if (cs->multis == NULL)
1475 return(NULL);
1476 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1477 if (strcmp(cp, p) == 0)
1478 return(p);
1479 return(NULL);
1481 #endif
1484 - mcinvert - invert the list of collating elements in a cset
1485 == static void mcinvert(struct parse *p, cset *cs);
1487 * This would have to know the set of possibilities. Implementation
1488 * is deferred.
1490 /* ARGSUSED */
1491 static void
1492 mcinvert(
1493 struct parse *p,
1494 cset *cs)
1497 assert(p != NULL);
1498 assert(cs != NULL);
1500 assert(cs->multis == NULL); /* xxx */
1504 - mccase - add case counterparts of the list of collating elements in a cset
1505 == static void mccase(struct parse *p, cset *cs);
1507 * This would have to know the set of possibilities. Implementation
1508 * is deferred.
1510 /* ARGSUSED */
1511 static void
1512 mccase(
1513 struct parse *p,
1514 cset *cs)
1517 assert(p != NULL);
1518 assert(cs != NULL);
1520 assert(cs->multis == NULL); /* xxx */
1524 - isinsets - is this character in any sets?
1525 == static int isinsets(struct re_guts *g, int c);
1527 static int /* predicate */
1528 isinsets(
1529 struct re_guts *g,
1530 int c)
1532 uch *col;
1533 int i;
1534 int ncols;
1535 unsigned uc = (unsigned char)c;
1537 assert(g != NULL);
1539 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1541 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1542 if (col[uc] != 0)
1543 return(1);
1544 return(0);
1548 - samesets - are these two characters in exactly the same sets?
1549 == static int samesets(struct re_guts *g, int c1, int c2);
1551 static int /* predicate */
1552 samesets(
1553 struct re_guts *g,
1554 int c1,
1555 int c2)
1557 uch *col;
1558 int i;
1559 int ncols;
1560 unsigned uc1 = (unsigned char)c1;
1561 unsigned uc2 = (unsigned char)c2;
1563 assert(g != NULL);
1565 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1567 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1568 if (col[uc1] != col[uc2])
1569 return(0);
1570 return(1);
1574 - categorize - sort out character categories
1575 == static void categorize(struct parse *p, struct re_guts *g);
1577 static void
1578 categorize(
1579 struct parse *p,
1580 struct re_guts *g)
1582 cat_t *cats;
1583 int c;
1584 int c2;
1585 cat_t cat;
1587 assert(p != NULL);
1588 assert(g != NULL);
1590 cats = g->categories;
1592 /* avoid making error situations worse */
1593 if (p->error != 0)
1594 return;
1596 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1597 if (cats[c] == 0 && isinsets(g, c)) {
1598 cat = g->ncategories++;
1599 cats[c] = cat;
1600 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1601 if (cats[c2] == 0 && samesets(g, c, c2))
1602 cats[c2] = cat;
1607 - dupl - emit a duplicate of a bunch of sops
1608 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1610 static sopno /* start of duplicate */
1611 dupl(
1612 struct parse *p,
1613 sopno start, /* from here */
1614 sopno finish) /* to this less one */
1616 sopno ret;
1617 sopno len = finish - start;
1619 assert(p != NULL);
1621 ret = HERE();
1623 assert(finish >= start);
1624 if (len == 0)
1625 return(ret);
1626 enlarge(p, p->ssize + len); /* this many unexpected additions */
1627 assert(p->ssize >= p->slen + len);
1628 (void)memcpy(p->strip + p->slen, p->strip + start,
1629 (size_t)len * sizeof(sop));
1630 p->slen += len;
1631 return(ret);
1635 - doemit - emit a strip operator
1636 == static void doemit(struct parse *p, sop op, size_t opnd);
1638 * It might seem better to implement this as a macro with a function as
1639 * hard-case backup, but it's just too big and messy unless there are
1640 * some changes to the data structures. Maybe later.
1642 static void
1643 doemit(
1644 struct parse *p,
1645 sop op,
1646 sopno opnd)
1649 assert(p != NULL);
1651 /* avoid making error situations worse */
1652 if (p->error != 0)
1653 return;
1655 /* deal with oversize operands ("can't happen", more or less) */
1656 assert(opnd < 1<<OPSHIFT);
1658 /* deal with undersized strip */
1659 if (p->slen >= p->ssize)
1660 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1661 assert(p->slen < p->ssize);
1663 /* finally, it's all reduced to the easy case */
1664 p->strip[p->slen++] = SOP(op, opnd);
1668 - doinsert - insert a sop into the strip
1669 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1671 static void
1672 doinsert(
1673 struct parse *p,
1674 sop op,
1675 sopno opnd,
1676 sopno pos)
1678 sopno sn;
1679 sop s;
1680 int i;
1682 assert(p != NULL);
1684 /* avoid making error situations worse */
1685 if (p->error != 0)
1686 return;
1688 sn = HERE();
1689 EMIT(op, opnd); /* do checks, ensure space */
1690 assert(HERE() == sn+1);
1691 s = p->strip[sn];
1693 /* adjust paren pointers */
1694 assert(pos > 0);
1695 for (i = 1; i < NPAREN; i++) {
1696 if (p->pbegin[i] >= pos) {
1697 p->pbegin[i]++;
1699 if (p->pend[i] >= pos) {
1700 p->pend[i]++;
1704 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop));
1705 p->strip[pos] = s;
1709 - dofwd - complete a forward reference
1710 == static void dofwd(struct parse *p, sopno pos, sop value);
1712 static void
1713 dofwd(
1714 struct parse *p,
1715 sopno pos,
1716 sopno value)
1719 assert(p != NULL);
1721 /* avoid making error situations worse */
1722 if (p->error != 0)
1723 return;
1725 assert(value < 1<<OPSHIFT);
1726 p->strip[pos] = OP(p->strip[pos]) | value;
1730 - enlarge - enlarge the strip
1731 == static void enlarge(struct parse *p, sopno size);
1733 static void
1734 enlarge(
1735 struct parse *p,
1736 sopno size)
1738 sop *sp;
1740 assert(p != NULL);
1742 if (p->ssize >= size)
1743 return;
1745 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1746 if (sp == NULL) {
1747 SETERROR(REG_ESPACE);
1748 return;
1750 p->strip = sp;
1751 p->ssize = size;
1755 - stripsnug - compact the strip
1756 == static void stripsnug(struct parse *p, struct re_guts *g);
1758 static void
1759 stripsnug(
1760 struct parse *p,
1761 struct re_guts *g)
1764 assert(p != NULL);
1765 assert(g != NULL);
1767 g->nstates = p->slen;
1768 g->strip = realloc(p->strip, p->slen * sizeof(sop));
1769 if (g->strip == NULL) {
1770 SETERROR(REG_ESPACE);
1771 g->strip = p->strip;
1776 - findmust - fill in must and mlen with longest mandatory literal string
1777 == static void findmust(struct parse *p, struct re_guts *g);
1779 * This algorithm could do fancy things like analyzing the operands of |
1780 * for common subsequences. Someday. This code is simple and finds most
1781 * of the interesting cases.
1783 * Note that must and mlen got initialized during setup.
1785 static void
1786 findmust(
1787 struct parse *p,
1788 struct re_guts *g)
1790 sop *scan;
1791 sop *start = NULL;
1792 sop *newstart = NULL;
1793 sopno newlen;
1794 sop s;
1795 char *cp;
1796 sopno i;
1798 assert(p != NULL);
1799 assert(g != NULL);
1801 /* avoid making error situations worse */
1802 if (p->error != 0)
1803 return;
1805 /* find the longest OCHAR sequence in strip */
1806 newlen = 0;
1807 scan = g->strip + 1;
1808 do {
1809 s = *scan++;
1810 switch (OP(s)) {
1811 case OCHAR: /* sequence member */
1812 if (newlen == 0) /* new sequence */
1813 newstart = scan - 1;
1814 newlen++;
1815 break;
1816 case OPLUS_: /* things that don't break one */
1817 case OLPAREN:
1818 case ORPAREN:
1819 break;
1820 case OQUEST_: /* things that must be skipped */
1821 case OCH_:
1822 scan--;
1823 do {
1824 scan += OPND(s);
1825 s = *scan;
1826 /* assert() interferes w debug printouts */
1827 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1828 OP(s) != OOR2) {
1829 g->iflags |= BAD;
1830 return;
1832 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1833 /* FALLTHROUGH */
1834 default: /* things that break a sequence */
1835 if (newlen > g->mlen) { /* ends one */
1836 start = newstart;
1837 g->mlen = newlen;
1839 newlen = 0;
1840 break;
1842 } while (OP(s) != OEND);
1844 if (start == NULL)
1845 g->mlen = 0;
1847 if (g->mlen == 0) /* there isn't one */
1848 return;
1850 /* turn it into a character string */
1851 g->must = malloc((size_t)g->mlen + 1);
1852 if (g->must == NULL) { /* argh; just forget it */
1853 g->mlen = 0;
1854 return;
1856 cp = g->must;
1857 scan = start;
1858 for (i = g->mlen; i > 0; i--) {
1859 while (OP(s = *scan++) != OCHAR)
1860 continue;
1861 assert(cp < g->must + g->mlen);
1862 *cp++ = (char)OPND(s);
1864 assert(cp == g->must + g->mlen);
1865 *cp++ = '\0'; /* just on general principles */
1869 - pluscount - count + nesting
1870 == static sopno pluscount(struct parse *p, struct re_guts *g);
1872 static sopno /* nesting depth */
1873 pluscount(
1874 struct parse *p,
1875 struct re_guts *g)
1877 sop *scan;
1878 sop s;
1879 sopno plusnest = 0;
1880 sopno maxnest = 0;
1882 assert(p != NULL);
1883 assert(g != NULL);
1885 if (p->error != 0)
1886 return(0); /* there may not be an OEND */
1888 scan = g->strip + 1;
1889 do {
1890 s = *scan++;
1891 switch (OP(s)) {
1892 case OPLUS_:
1893 plusnest++;
1894 break;
1895 case O_PLUS:
1896 if (plusnest > maxnest)
1897 maxnest = plusnest;
1898 plusnest--;
1899 break;
1901 } while (OP(s) != OEND);
1902 if (plusnest != 0)
1903 g->iflags |= BAD;
1904 return(maxnest);