amd64: declare initializecpu outside of SMP
[dragonfly.git] / lib / libc / regex / engine.c
blob5e589f033153eec7cac779cfa7152ca037f514e6
1 /*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
33 * @(#)engine.c 8.5 (Berkeley) 3/20/94
34 * $FreeBSD: src/lib/libc/regex/engine.c,v 1.21 2007/05/25 12:44:58 delphij Exp $
35 * $DragonFly: src/lib/libc/regex/engine.c,v 1.7 2005/11/20 09:18:37 swildner Exp $
39 * The matching engine and friends. This file is #included by regexec.c
40 * after suitable #defines of a variety of macros used herein, so that
41 * different state representations can be used without duplicating masses
42 * of code.
45 #ifdef SNAMES
46 #define matcher smatcher
47 #define fast sfast
48 #define slow sslow
49 #define dissect sdissect
50 #define backref sbackref
51 #define step sstep
52 #define print sprint
53 #define at sat
54 #define match smat
55 #endif
56 #ifdef LNAMES
57 #define matcher lmatcher
58 #define fast lfast
59 #define slow lslow
60 #define dissect ldissect
61 #define backref lbackref
62 #define step lstep
63 #define print lprint
64 #define at lat
65 #define match lmat
66 #endif
67 #ifdef MNAMES
68 #define matcher mmatcher
69 #define fast mfast
70 #define slow mslow
71 #define dissect mdissect
72 #define backref mbackref
73 #define step mstep
74 #define print mprint
75 #define at mat
76 #define match mmat
77 #endif
79 /* another structure passed up and down to avoid zillions of parameters */
80 struct match {
81 struct re_guts *g;
82 int eflags;
83 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
84 const char *offp; /* offsets work from here */
85 const char *beginp; /* start of string -- virtual NUL precedes */
86 const char *endp; /* end of string -- virtual NUL here */
87 const char *coldp; /* can be no match starting before here */
88 const char **lastpos; /* [nplus+1] */
89 STATEVARS;
90 states st; /* current states */
91 states fresh; /* states for a fresh start */
92 states tmp; /* temporary */
93 states empty; /* empty set of states */
94 mbstate_t mbs; /* multibyte conversion state */
97 /* ========= begin header generated by ./mkh ========= */
98 #ifdef __cplusplus
99 extern "C" {
100 #endif
102 /* === engine.c === */
103 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
104 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
105 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
106 static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
107 static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
108 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
109 #define MAX_RECURSION 100
110 #define BOL (OUT-1)
111 #define EOL (BOL-1)
112 #define BOLEOL (BOL-2)
113 #define NOTHING (BOL-3)
114 #define BOW (BOL-4)
115 #define EOW (BOL-5)
116 #define BADCHAR (BOL-6)
117 #define NONCHAR(c) ((c) <= OUT)
118 #ifdef REDEBUG
119 static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
120 #endif
121 #ifdef REDEBUG
122 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
123 #endif
124 #ifdef REDEBUG
125 static const char *pchar(int ch);
126 #endif
128 #ifdef __cplusplus
130 #endif
131 /* ========= end header generated by ./mkh ========= */
133 #ifdef REDEBUG
134 #define SP(t, s, c) print(m, t, s, c, stdout)
135 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
136 #define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
137 #else
138 #define SP(t, s, c) /* nothing */
139 #define AT(t, p1, p2, s1, s2) /* nothing */
140 #define NOTE(s) /* nothing */
141 #endif
144 - matcher - the actual matching engine
145 == static int matcher(struct re_guts *g, const char *string, \
146 == size_t nmatch, regmatch_t pmatch[], int eflags);
148 static int /* 0 success, REG_NOMATCH failure */
149 matcher(struct re_guts *g,
150 const char *string,
151 size_t nmatch,
152 regmatch_t pmatch[],
153 int eflags)
155 const char *endp;
156 int i;
157 struct match mv;
158 struct match *m = &mv;
159 const char *dp;
160 const sopno gf = g->firststate+1; /* +1 for OEND */
161 const sopno gl = g->laststate;
162 const char *start;
163 const char *stop;
164 /* Boyer-Moore algorithms variables */
165 const char *pp;
166 int cj, mj;
167 const char *mustfirst;
168 const char *mustlast;
169 int *matchjump;
170 int *charjump;
172 /* simplify the situation where possible */
173 if (g->cflags&REG_NOSUB)
174 nmatch = 0;
175 if (eflags&REG_STARTEND) {
176 start = string + pmatch[0].rm_so;
177 stop = string + pmatch[0].rm_eo;
178 } else {
179 start = string;
180 stop = start + strlen(start);
182 if (stop < start)
183 return(REG_INVARG);
185 /* prescreening; this does wonders for this rather slow code */
186 if (g->must != NULL) {
187 if (g->charjump != NULL && g->matchjump != NULL) {
188 mustfirst = g->must;
189 mustlast = g->must + g->mlen - 1;
190 charjump = g->charjump;
191 matchjump = g->matchjump;
192 pp = mustlast;
193 for (dp = start+g->mlen-1; dp < stop;) {
194 /* Fast skip non-matches */
195 while (dp < stop && charjump[(int)*dp])
196 dp += charjump[(int)*dp];
198 if (dp >= stop)
199 break;
201 /* Greedy matcher */
202 /* We depend on not being used for
203 * for strings of length 1
205 while (*--dp == *--pp && pp != mustfirst);
207 if (*dp == *pp)
208 break;
210 /* Jump to next possible match */
211 mj = matchjump[pp - mustfirst];
212 cj = charjump[(int)*dp];
213 dp += (cj < mj ? mj : cj);
214 pp = mustlast;
216 if (pp != mustfirst)
217 return(REG_NOMATCH);
218 } else {
219 for (dp = start; dp < stop; dp++)
220 if (*dp == g->must[0] &&
221 stop - dp >= g->mlen &&
222 memcmp(dp, g->must, (size_t)g->mlen) == 0)
223 break;
224 if (dp == stop) /* we didn't find g->must */
225 return(REG_NOMATCH);
229 /* match struct setup */
230 m->g = g;
231 m->eflags = eflags;
232 m->pmatch = NULL;
233 m->lastpos = NULL;
234 m->offp = string;
235 m->beginp = start;
236 m->endp = stop;
237 STATESETUP(m, 4);
238 SETUP(m->st);
239 SETUP(m->fresh);
240 SETUP(m->tmp);
241 SETUP(m->empty);
242 CLEAR(m->empty);
243 ZAPSTATE(&m->mbs);
245 /* Adjust start according to moffset, to speed things up */
246 if (g->moffset > -1)
247 start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
249 /* this loop does only one repetition except for backrefs */
250 for (;;) {
251 endp = fast(m, start, stop, gf, gl);
252 if (endp == NULL) { /* a miss */
253 if (m->pmatch != NULL)
254 free((char *)m->pmatch);
255 if (m->lastpos != NULL)
256 free((char *)m->lastpos);
257 STATETEARDOWN(m);
258 return(REG_NOMATCH);
260 if (nmatch == 0 && !g->backrefs)
261 break; /* no further info needed */
263 /* where? */
264 assert(m->coldp != NULL);
265 for (;;) {
266 NOTE("finding start");
267 endp = slow(m, m->coldp, stop, gf, gl);
268 if (endp != NULL)
269 break;
270 assert(m->coldp < m->endp);
271 m->coldp += XMBRTOWC(NULL, m->coldp,
272 m->endp - m->coldp, &m->mbs, 0);
274 if (nmatch == 1 && !g->backrefs)
275 break; /* no further info needed */
277 /* oh my, he wants the subexpressions... */
278 if (m->pmatch == NULL)
279 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
280 sizeof(regmatch_t));
281 if (m->pmatch == NULL) {
282 STATETEARDOWN(m);
283 return(REG_ESPACE);
285 for (i = 1; i <= m->g->nsub; i++)
286 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
287 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
288 NOTE("dissecting");
289 dp = dissect(m, m->coldp, endp, gf, gl);
290 } else {
291 if (g->nplus > 0 && m->lastpos == NULL)
292 m->lastpos = malloc((g->nplus+1) *
293 sizeof(const char *));
294 if (g->nplus > 0 && m->lastpos == NULL) {
295 free(m->pmatch);
296 STATETEARDOWN(m);
297 return(REG_ESPACE);
299 NOTE("backref dissect");
300 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
302 if (dp != NULL)
303 break;
305 /* uh-oh... we couldn't find a subexpression-level match */
306 assert(g->backrefs); /* must be back references doing it */
307 assert(g->nplus == 0 || m->lastpos != NULL);
308 for (;;) {
309 if (dp != NULL || endp <= m->coldp)
310 break; /* defeat */
311 NOTE("backoff");
312 endp = slow(m, m->coldp, endp-1, gf, gl);
313 if (endp == NULL)
314 break; /* defeat */
315 /* try it on a shorter possibility */
316 #ifndef NDEBUG
317 for (i = 1; i <= m->g->nsub; i++) {
318 assert(m->pmatch[i].rm_so == -1);
319 assert(m->pmatch[i].rm_eo == -1);
321 #endif
322 NOTE("backoff dissect");
323 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
325 assert(dp == NULL || dp == endp);
326 if (dp != NULL) /* found a shorter one */
327 break;
329 /* despite initial appearances, there is no match here */
330 NOTE("false alarm");
331 /* recycle starting later */
332 start = m->coldp + XMBRTOWC(NULL, m->coldp,
333 stop - m->coldp, &m->mbs, 0);
334 assert(start <= stop);
337 /* fill in the details if requested */
338 if (nmatch > 0) {
339 pmatch[0].rm_so = m->coldp - m->offp;
340 pmatch[0].rm_eo = endp - m->offp;
342 if (nmatch > 1) {
343 assert(m->pmatch != NULL);
344 for (i = 1; i < nmatch; i++)
345 if (i <= m->g->nsub)
346 pmatch[i] = m->pmatch[i];
347 else {
348 pmatch[i].rm_so = -1;
349 pmatch[i].rm_eo = -1;
353 if (m->pmatch != NULL)
354 free((char *)m->pmatch);
355 if (m->lastpos != NULL)
356 free((char *)m->lastpos);
357 STATETEARDOWN(m);
358 return(0);
362 - dissect - figure out what matched what, no back references
363 == static const char *dissect(struct match *m, const char *start, \
364 == const char *stop, sopno startst, sopno stopst);
366 static const char * /* == stop (success) always */
367 dissect(struct match *m,
368 const char *start,
369 const char *stop,
370 sopno startst,
371 sopno stopst)
373 int i;
374 sopno ss; /* start sop of current subRE */
375 sopno es; /* end sop of current subRE */
376 const char *sp; /* start of string matched by it */
377 const char *stp; /* string matched by it cannot pass here */
378 const char *rest; /* start of rest of string */
379 const char *tail; /* string unmatched by rest of RE */
380 sopno ssub; /* start sop of subsubRE */
381 sopno esub; /* end sop of subsubRE */
382 const char *ssp; /* start of string matched by subsubRE */
383 const char *sep; /* end of string matched by subsubRE */
384 const char *oldssp; /* previous ssp */
385 const char *dp;
387 AT("diss", start, stop, startst, stopst);
388 sp = start;
389 for (ss = startst; ss < stopst; ss = es) {
390 /* identify end of subRE */
391 es = ss;
392 switch (OP(m->g->strip[es])) {
393 case OPLUS_:
394 case OQUEST_:
395 es += OPND(m->g->strip[es]);
396 break;
397 case OCH_:
398 while (OP(m->g->strip[es]) != O_CH)
399 es += OPND(m->g->strip[es]);
400 break;
402 es++;
404 /* figure out what it matched */
405 switch (OP(m->g->strip[ss])) {
406 case OEND:
407 assert(nope);
408 break;
409 case OCHAR:
410 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
411 break;
412 case OBOL:
413 case OEOL:
414 case OBOW:
415 case OEOW:
416 break;
417 case OANY:
418 case OANYOF:
419 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
420 break;
421 case OBACK_:
422 case O_BACK:
423 assert(nope);
424 break;
425 /* cases where length of match is hard to find */
426 case OQUEST_:
427 stp = stop;
428 for (;;) {
429 /* how long could this one be? */
430 rest = slow(m, sp, stp, ss, es);
431 assert(rest != NULL); /* it did match */
432 /* could the rest match the rest? */
433 tail = slow(m, rest, stop, es, stopst);
434 if (tail == stop)
435 break; /* yes! */
436 /* no -- try a shorter match for this one */
437 stp = rest - 1;
438 assert(stp >= sp); /* it did work */
440 ssub = ss + 1;
441 esub = es - 1;
442 /* did innards match? */
443 if (slow(m, sp, rest, ssub, esub) != NULL) {
444 dp = dissect(m, sp, rest, ssub, esub);
445 assert(dp == rest);
446 } else /* no */
447 assert(sp == rest);
448 sp = rest;
449 break;
450 case OPLUS_:
451 stp = stop;
452 for (;;) {
453 /* how long could this one be? */
454 rest = slow(m, sp, stp, ss, es);
455 assert(rest != NULL); /* it did match */
456 /* could the rest match the rest? */
457 tail = slow(m, rest, stop, es, stopst);
458 if (tail == stop)
459 break; /* yes! */
460 /* no -- try a shorter match for this one */
461 stp = rest - 1;
462 assert(stp >= sp); /* it did work */
464 ssub = ss + 1;
465 esub = es - 1;
466 ssp = sp;
467 oldssp = ssp;
468 for (;;) { /* find last match of innards */
469 sep = slow(m, ssp, rest, ssub, esub);
470 if (sep == NULL || sep == ssp)
471 break; /* failed or matched null */
472 oldssp = ssp; /* on to next try */
473 ssp = sep;
475 if (sep == NULL) {
476 /* last successful match */
477 sep = ssp;
478 ssp = oldssp;
480 assert(sep == rest); /* must exhaust substring */
481 assert(slow(m, ssp, sep, ssub, esub) == rest);
482 dp = dissect(m, ssp, sep, ssub, esub);
483 assert(dp == sep);
484 sp = rest;
485 break;
486 case OCH_:
487 stp = stop;
488 for (;;) {
489 /* how long could this one be? */
490 rest = slow(m, sp, stp, ss, es);
491 assert(rest != NULL); /* it did match */
492 /* could the rest match the rest? */
493 tail = slow(m, rest, stop, es, stopst);
494 if (tail == stop)
495 break; /* yes! */
496 /* no -- try a shorter match for this one */
497 stp = rest - 1;
498 assert(stp >= sp); /* it did work */
500 ssub = ss + 1;
501 esub = ss + OPND(m->g->strip[ss]) - 1;
502 assert(OP(m->g->strip[esub]) == OOR1);
503 for (;;) { /* find first matching branch */
504 if (slow(m, sp, rest, ssub, esub) == rest)
505 break; /* it matched all of it */
506 /* that one missed, try next one */
507 assert(OP(m->g->strip[esub]) == OOR1);
508 esub++;
509 assert(OP(m->g->strip[esub]) == OOR2);
510 ssub = esub + 1;
511 esub += OPND(m->g->strip[esub]);
512 if (OP(m->g->strip[esub]) == OOR2)
513 esub--;
514 else
515 assert(OP(m->g->strip[esub]) == O_CH);
517 dp = dissect(m, sp, rest, ssub, esub);
518 assert(dp == rest);
519 sp = rest;
520 break;
521 case O_PLUS:
522 case O_QUEST:
523 case OOR1:
524 case OOR2:
525 case O_CH:
526 assert(nope);
527 break;
528 case OLPAREN:
529 i = OPND(m->g->strip[ss]);
530 assert(0 < i && i <= m->g->nsub);
531 m->pmatch[i].rm_so = sp - m->offp;
532 break;
533 case ORPAREN:
534 i = OPND(m->g->strip[ss]);
535 assert(0 < i && i <= m->g->nsub);
536 m->pmatch[i].rm_eo = sp - m->offp;
537 break;
538 default: /* uh oh */
539 assert(nope);
540 break;
544 assert(sp == stop);
545 return(sp);
549 - backref - figure out what matched what, figuring in back references
550 == static const char *backref(struct match *m, const char *start, \
551 == const char *stop, sopno startst, sopno stopst, sopno lev);
553 static const char * /* == stop (success) or NULL (failure) */
554 backref(struct match *m,
555 const char *start,
556 const char *stop,
557 sopno startst,
558 sopno stopst,
559 sopno lev, /* PLUS nesting level */
560 int rec)
562 int i;
563 sopno ss; /* start sop of current subRE */
564 const char *sp; /* start of string matched by it */
565 sopno ssub; /* start sop of subsubRE */
566 sopno esub; /* end sop of subsubRE */
567 const char *ssp; /* start of string matched by subsubRE */
568 const char *dp;
569 size_t len;
570 int hard;
571 sop s;
572 regoff_t offsave;
573 cset *cs;
574 wint_t wc;
576 AT("back", start, stop, startst, stopst);
577 sp = start;
579 /* get as far as we can with easy stuff */
580 hard = 0;
581 for (ss = startst; !hard && ss < stopst; ss++)
582 switch (OP(s = m->g->strip[ss])) {
583 case OCHAR:
584 if (sp == stop)
585 return(NULL);
586 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
587 if (wc != OPND(s))
588 return(NULL);
589 break;
590 case OANY:
591 if (sp == stop)
592 return(NULL);
593 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
594 if (wc == BADCHAR)
595 return (NULL);
596 break;
597 case OANYOF:
598 if (sp == stop)
599 return (NULL);
600 cs = &m->g->sets[OPND(s)];
601 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
602 if (wc == BADCHAR || !CHIN(cs, wc))
603 return(NULL);
604 break;
605 case OBOL:
606 if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
607 (sp < m->endp && *(sp-1) == '\n' &&
608 (m->g->cflags&REG_NEWLINE)) )
609 { /* yes */ }
610 else
611 return(NULL);
612 break;
613 case OEOL:
614 if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
615 (sp < m->endp && *sp == '\n' &&
616 (m->g->cflags&REG_NEWLINE)) )
617 { /* yes */ }
618 else
619 return(NULL);
620 break;
621 case OBOW:
622 if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
623 (sp < m->endp && *(sp-1) == '\n' &&
624 (m->g->cflags&REG_NEWLINE)) ||
625 (sp > m->beginp &&
626 !ISWORD(*(sp-1))) ) &&
627 (sp < m->endp && ISWORD(*sp)) )
628 { /* yes */ }
629 else
630 return(NULL);
631 break;
632 case OEOW:
633 if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
634 (sp < m->endp && *sp == '\n' &&
635 (m->g->cflags&REG_NEWLINE)) ||
636 (sp < m->endp && !ISWORD(*sp)) ) &&
637 (sp > m->beginp && ISWORD(*(sp-1))) )
638 { /* yes */ }
639 else
640 return(NULL);
641 break;
642 case O_QUEST:
643 break;
644 case OOR1: /* matches null but needs to skip */
645 ss++;
646 s = m->g->strip[ss];
647 do {
648 assert(OP(s) == OOR2);
649 ss += OPND(s);
650 } while (OP(s = m->g->strip[ss]) != O_CH);
651 /* note that the ss++ gets us past the O_CH */
652 break;
653 default: /* have to make a choice */
654 hard = 1;
655 break;
657 if (!hard) { /* that was it! */
658 if (sp != stop)
659 return(NULL);
660 return(sp);
662 ss--; /* adjust for the for's final increment */
664 /* the hard stuff */
665 AT("hard", sp, stop, ss, stopst);
666 s = m->g->strip[ss];
667 switch (OP(s)) {
668 case OBACK_: /* the vilest depths */
669 i = OPND(s);
670 assert(0 < i && i <= m->g->nsub);
671 if (m->pmatch[i].rm_eo == -1)
672 return(NULL);
673 assert(m->pmatch[i].rm_so != -1);
674 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
675 if (len == 0 && rec++ > MAX_RECURSION)
676 return(NULL);
677 assert(stop - m->beginp >= len);
678 if (sp > stop - len)
679 return(NULL); /* not enough left to match */
680 ssp = m->offp + m->pmatch[i].rm_so;
681 if (memcmp(sp, ssp, len) != 0)
682 return(NULL);
683 while (m->g->strip[ss] != SOP(O_BACK, i))
684 ss++;
685 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
686 break;
687 case OQUEST_: /* to null or not */
688 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
689 if (dp != NULL)
690 return(dp); /* not */
691 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
692 break;
693 case OPLUS_:
694 assert(m->lastpos != NULL);
695 assert(lev+1 <= m->g->nplus);
696 m->lastpos[lev+1] = sp;
697 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
698 break;
699 case O_PLUS:
700 if (sp == m->lastpos[lev]) /* last pass matched null */
701 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
702 /* try another pass */
703 m->lastpos[lev] = sp;
704 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
705 if (dp == NULL)
706 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
707 else
708 return(dp);
709 break;
710 case OCH_: /* find the right one, if any */
711 ssub = ss + 1;
712 esub = ss + OPND(s) - 1;
713 assert(OP(m->g->strip[esub]) == OOR1);
714 for (;;) { /* find first matching branch */
715 dp = backref(m, sp, stop, ssub, esub, lev, rec);
716 if (dp != NULL)
717 return(dp);
718 /* that one missed, try next one */
719 if (OP(m->g->strip[esub]) == O_CH)
720 return(NULL); /* there is none */
721 esub++;
722 assert(OP(m->g->strip[esub]) == OOR2);
723 ssub = esub + 1;
724 esub += OPND(m->g->strip[esub]);
725 if (OP(m->g->strip[esub]) == OOR2)
726 esub--;
727 else
728 assert(OP(m->g->strip[esub]) == O_CH);
730 break;
731 case OLPAREN: /* must undo assignment if rest fails */
732 i = OPND(s);
733 assert(0 < i && i <= m->g->nsub);
734 offsave = m->pmatch[i].rm_so;
735 m->pmatch[i].rm_so = sp - m->offp;
736 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
737 if (dp != NULL)
738 return(dp);
739 m->pmatch[i].rm_so = offsave;
740 return(NULL);
741 break;
742 case ORPAREN: /* must undo assignment if rest fails */
743 i = OPND(s);
744 assert(0 < i && i <= m->g->nsub);
745 offsave = m->pmatch[i].rm_eo;
746 m->pmatch[i].rm_eo = sp - m->offp;
747 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
748 if (dp != NULL)
749 return(dp);
750 m->pmatch[i].rm_eo = offsave;
751 return(NULL);
752 break;
753 default: /* uh oh */
754 assert(nope);
755 break;
758 /* "can't happen" */
759 assert(nope);
760 /* NOTREACHED */
761 return "shut up gcc";
765 - fast - step through the string at top speed
766 == static const char *fast(struct match *m, const char *start, \
767 == const char *stop, sopno startst, sopno stopst);
769 static const char * /* where tentative match ended, or NULL */
770 fast( struct match *m,
771 const char *start,
772 const char *stop,
773 sopno startst,
774 sopno stopst)
776 states st = m->st;
777 states fresh = m->fresh;
778 states tmp = m->tmp;
779 const char *p = start;
780 wint_t c;
781 wint_t lastc; /* previous c */
782 wint_t flagch;
783 int i;
784 const char *coldp; /* last p after which no match was underway */
785 size_t clen;
787 CLEAR(st);
788 SET1(st, startst);
789 st = step(m->g, startst, stopst, st, NOTHING, st);
790 ASSIGN(fresh, st);
791 SP("start", st, *p);
792 coldp = NULL;
793 if (start == m->beginp)
794 c = OUT;
795 else {
797 * XXX Wrong if the previous character was multi-byte.
798 * Newline never is (in supported encodings),
799 * so this only breaks the ISWORD tests below.
801 c = (uch)*(start - 1);
803 for (;;) {
804 /* next character */
805 lastc = c;
806 if (p == m->endp) {
807 clen = 0;
808 c = OUT;
809 } else
810 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
811 if (EQ(st, fresh))
812 coldp = p;
814 /* is there an EOL and/or BOL between lastc and c? */
815 flagch = '\0';
816 i = 0;
817 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
818 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
819 flagch = BOL;
820 i = m->g->nbol;
822 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
823 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
824 flagch = (flagch == BOL) ? BOLEOL : EOL;
825 i += m->g->neol;
827 if (i != 0) {
828 for (; i > 0; i--)
829 st = step(m->g, startst, stopst, st, flagch, st);
830 SP("boleol", st, c);
833 /* how about a word boundary? */
834 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
835 (c != OUT && ISWORD(c)) ) {
836 flagch = BOW;
838 if ( (lastc != OUT && ISWORD(lastc)) &&
839 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
840 flagch = EOW;
842 if (flagch == BOW || flagch == EOW) {
843 st = step(m->g, startst, stopst, st, flagch, st);
844 SP("boweow", st, c);
847 /* are we done? */
848 if (ISSET(st, stopst) || p == stop || clen > stop - p)
849 break; /* NOTE BREAK OUT */
851 /* no, we must deal with this character */
852 ASSIGN(tmp, st);
853 ASSIGN(st, fresh);
854 assert(c != OUT);
855 st = step(m->g, startst, stopst, tmp, c, st);
856 SP("aft", st, c);
857 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
858 p += clen;
861 assert(coldp != NULL);
862 m->coldp = coldp;
863 if (ISSET(st, stopst))
864 return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
865 else
866 return(NULL);
870 - slow - step through the string more deliberately
871 == static const char *slow(struct match *m, const char *start, \
872 == const char *stop, sopno startst, sopno stopst);
874 static const char * /* where it ended */
875 slow( struct match *m,
876 const char *start,
877 const char *stop,
878 sopno startst,
879 sopno stopst)
881 states st = m->st;
882 states empty = m->empty;
883 states tmp = m->tmp;
884 const char *p = start;
885 wint_t c;
886 wint_t lastc; /* previous c */
887 wint_t flagch;
888 int i;
889 const char *matchp; /* last p at which a match ended */
890 size_t clen;
892 AT("slow", start, stop, startst, stopst);
893 CLEAR(st);
894 SET1(st, startst);
895 SP("sstart", st, *p);
896 st = step(m->g, startst, stopst, st, NOTHING, st);
897 matchp = NULL;
898 if (start == m->beginp)
899 c = OUT;
900 else {
902 * XXX Wrong if the previous character was multi-byte.
903 * Newline never is (in supported encodings),
904 * so this only breaks the ISWORD tests below.
906 c = (uch)*(start - 1);
908 for (;;) {
909 /* next character */
910 lastc = c;
911 if (p == m->endp) {
912 c = OUT;
913 clen = 0;
914 } else
915 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
917 /* is there an EOL and/or BOL between lastc and c? */
918 flagch = '\0';
919 i = 0;
920 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
921 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
922 flagch = BOL;
923 i = m->g->nbol;
925 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
926 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
927 flagch = (flagch == BOL) ? BOLEOL : EOL;
928 i += m->g->neol;
930 if (i != 0) {
931 for (; i > 0; i--)
932 st = step(m->g, startst, stopst, st, flagch, st);
933 SP("sboleol", st, c);
936 /* how about a word boundary? */
937 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
938 (c != OUT && ISWORD(c)) ) {
939 flagch = BOW;
941 if ( (lastc != OUT && ISWORD(lastc)) &&
942 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
943 flagch = EOW;
945 if (flagch == BOW || flagch == EOW) {
946 st = step(m->g, startst, stopst, st, flagch, st);
947 SP("sboweow", st, c);
950 /* are we done? */
951 if (ISSET(st, stopst))
952 matchp = p;
953 if (EQ(st, empty) || p == stop || clen > stop - p)
954 break; /* NOTE BREAK OUT */
956 /* no, we must deal with this character */
957 ASSIGN(tmp, st);
958 ASSIGN(st, empty);
959 assert(c != OUT);
960 st = step(m->g, startst, stopst, tmp, c, st);
961 SP("saft", st, c);
962 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
963 p += clen;
966 return(matchp);
971 - step - map set of states reachable before char to set reachable after
972 == static states step(struct re_guts *g, sopno start, sopno stop, \
973 == states bef, int ch, states aft);
974 == #define BOL (OUT-1)
975 == #define EOL (BOL-1)
976 == #define BOLEOL (BOL-2)
977 == #define NOTHING (BOL-3)
978 == #define BOW (BOL-4)
979 == #define EOW (BOL-5)
980 == #define BADCHAR (BOL-6)
981 == #define NONCHAR(c) ((c) <= OUT)
983 static states
984 step(struct re_guts *g,
985 sopno start, /* start state within strip */
986 sopno stop, /* state after stop state within strip */
987 states bef, /* states reachable before */
988 wint_t ch, /* character or NONCHAR code */
989 states aft) /* states already known reachable after */
991 cset *cs;
992 sop s;
993 sopno pc;
994 onestate here; /* note, macros know this name */
995 sopno look;
996 int i;
998 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
999 s = g->strip[pc];
1000 switch (OP(s)) {
1001 case OEND:
1002 assert(pc == stop-1);
1003 break;
1004 case OCHAR:
1005 /* only characters can match */
1006 assert(!NONCHAR(ch) || ch != OPND(s));
1007 if (ch == OPND(s))
1008 FWD(aft, bef, 1);
1009 break;
1010 case OBOL:
1011 if (ch == BOL || ch == BOLEOL)
1012 FWD(aft, bef, 1);
1013 break;
1014 case OEOL:
1015 if (ch == EOL || ch == BOLEOL)
1016 FWD(aft, bef, 1);
1017 break;
1018 case OBOW:
1019 if (ch == BOW)
1020 FWD(aft, bef, 1);
1021 break;
1022 case OEOW:
1023 if (ch == EOW)
1024 FWD(aft, bef, 1);
1025 break;
1026 case OANY:
1027 if (!NONCHAR(ch))
1028 FWD(aft, bef, 1);
1029 break;
1030 case OANYOF:
1031 cs = &g->sets[OPND(s)];
1032 if (!NONCHAR(ch) && CHIN(cs, ch))
1033 FWD(aft, bef, 1);
1034 break;
1035 case OBACK_: /* ignored here */
1036 case O_BACK:
1037 FWD(aft, aft, 1);
1038 break;
1039 case OPLUS_: /* forward, this is just an empty */
1040 FWD(aft, aft, 1);
1041 break;
1042 case O_PLUS: /* both forward and back */
1043 FWD(aft, aft, 1);
1044 i = ISSETBACK(aft, OPND(s));
1045 BACK(aft, aft, OPND(s));
1046 if (!i && ISSETBACK(aft, OPND(s))) {
1047 /* oho, must reconsider loop body */
1048 pc -= OPND(s) + 1;
1049 INIT(here, pc);
1051 break;
1052 case OQUEST_: /* two branches, both forward */
1053 FWD(aft, aft, 1);
1054 FWD(aft, aft, OPND(s));
1055 break;
1056 case O_QUEST: /* just an empty */
1057 FWD(aft, aft, 1);
1058 break;
1059 case OLPAREN: /* not significant here */
1060 case ORPAREN:
1061 FWD(aft, aft, 1);
1062 break;
1063 case OCH_: /* mark the first two branches */
1064 FWD(aft, aft, 1);
1065 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1066 FWD(aft, aft, OPND(s));
1067 break;
1068 case OOR1: /* done a branch, find the O_CH */
1069 if (ISSTATEIN(aft, here)) {
1070 for (look = 1;
1071 OP(s = g->strip[pc+look]) != O_CH;
1072 look += OPND(s))
1073 assert(OP(s) == OOR2);
1074 FWD(aft, aft, look);
1076 break;
1077 case OOR2: /* propagate OCH_'s marking */
1078 FWD(aft, aft, 1);
1079 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1080 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1081 FWD(aft, aft, OPND(s));
1083 break;
1084 case O_CH: /* just empty */
1085 FWD(aft, aft, 1);
1086 break;
1087 default: /* ooooops... */
1088 assert(nope);
1089 break;
1093 return(aft);
1096 #ifdef REDEBUG
1098 - print - print a set of states
1099 == #ifdef REDEBUG
1100 == static void print(struct match *m, const char *caption, states st, \
1101 == int ch, FILE *d);
1102 == #endif
1104 static void
1105 print(struct match *m,
1106 const char *caption,
1107 states st,
1108 int ch,
1109 FILE *d)
1111 struct re_guts *g = m->g;
1112 int i;
1113 int first = 1;
1115 if (!(m->eflags&REG_TRACE))
1116 return;
1118 fprintf(d, "%s", caption);
1119 if (ch != '\0')
1120 fprintf(d, " %s", pchar(ch));
1121 for (i = 0; i < g->nstates; i++)
1122 if (ISSET(st, i)) {
1123 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1124 first = 0;
1126 fprintf(d, "\n");
1130 - at - print current situation
1131 == #ifdef REDEBUG
1132 == static void at(struct match *m, const char *title, const char *start, \
1133 == const char *stop, sopno startst, sopno stopst);
1134 == #endif
1136 static void
1137 at( struct match *m,
1138 const char *title,
1139 const char *start,
1140 const char *stop,
1141 sopno startst,
1142 sopno stopst)
1144 if (!(m->eflags&REG_TRACE))
1145 return;
1147 printf("%s %s-", title, pchar(*start));
1148 printf("%s ", pchar(*stop));
1149 printf("%ld-%ld\n", (long)startst, (long)stopst);
1152 #ifndef PCHARDONE
1153 #define PCHARDONE /* never again */
1155 - pchar - make a character printable
1156 == #ifdef REDEBUG
1157 == static const char *pchar(int ch);
1158 == #endif
1160 * Is this identical to regchar() over in debug.c? Well, yes. But a
1161 * duplicate here avoids having a debugging-capable regexec.o tied to
1162 * a matching debug.o, and this is convenient. It all disappears in
1163 * the non-debug compilation anyway, so it doesn't matter much.
1165 static const char * /* -> representation */
1166 pchar(int ch)
1168 static char pbuf[10];
1170 if (isprint((uch)ch) || ch == ' ')
1171 sprintf(pbuf, "%c", ch);
1172 else
1173 sprintf(pbuf, "\\%o", ch);
1174 return(pbuf);
1176 #endif
1177 #endif
1179 #undef matcher
1180 #undef fast
1181 #undef slow
1182 #undef dissect
1183 #undef backref
1184 #undef step
1185 #undef print
1186 #undef at
1187 #undef match