ex: convert line input buffer from file encoding to internal encoding
[nvi.git] / regex / engine.c
blob2a3059a6d132c8c87a7a214d84f88e4f7ab9f801
1 /*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer of the University of Toronto.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
37 * @(#)engine.c 8.4 (Berkeley) 3/19/94
41 * The matching engine and friends. This file is #included by regexec.c
42 * after suitable #defines of a variety of macros used herein, so that
43 * different state representations can be used without duplicating masses
44 * of code.
47 #ifdef SNAMES
48 #define matcher smatcher
49 #define fast sfast
50 #define slow sslow
51 #define dissect sdissect
52 #define backref sbackref
53 #define step sstep
54 #define print sprint
55 #define at sat
56 #define match smat
57 #endif
58 #ifdef LNAMES
59 #define matcher lmatcher
60 #define fast lfast
61 #define slow lslow
62 #define dissect ldissect
63 #define backref lbackref
64 #define step lstep
65 #define print lprint
66 #define at lat
67 #define match lmat
68 #endif
70 /* another structure passed up and down to avoid zillions of parameters */
71 struct match {
72 struct re_guts *g;
73 int eflags;
74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
75 RCHAR_T *offp; /* offsets work from here */
76 RCHAR_T *beginp; /* start of string -- virtual NUL precedes */
77 RCHAR_T *endp; /* end of string -- virtual NUL here */
78 RCHAR_T *coldp; /* can be no match starting before here */
79 RCHAR_T **lastpos; /* [nplus+1] */
80 STATEVARS;
81 states st; /* current states */
82 states fresh; /* states for a fresh start */
83 states tmp; /* temporary */
84 states empty; /* empty set of states */
87 /* ========= begin header generated by ./mkh ========= */
88 #ifdef __cplusplus
89 extern "C" {
90 #endif
92 /* === engine.c === */
93 static int matcher __P((struct re_guts *g, RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags));
94 static RCHAR_T *dissect __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
95 static RCHAR_T *backref __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst, sopno lev));
96 static RCHAR_T *fast __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
97 static RCHAR_T *slow __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
98 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
99 #define BOL (OUT+1)
100 #define EOL (BOL+1)
101 #define BOLEOL (BOL+2)
102 #define NOTHING (BOL+3)
103 #define BOW (BOL+4)
104 #define EOW (BOL+5)
105 #define CODEMAX (BOL+5) /* highest code used */
106 #define NONCHAR(c) ((c) > RCHAR_T_MAX)
107 #define NNONCHAR (CODEMAX-CHAR_MAX)
108 #ifdef REDEBUG
109 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
110 #endif
111 #ifdef REDEBUG
112 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
113 #endif
114 #ifdef REDEBUG
115 static char *pchar __P((int ch));
116 #endif
118 #ifdef __cplusplus
120 #endif
121 /* ========= end header generated by ./mkh ========= */
123 #ifdef REDEBUG
124 #define SP(t, s, c) print(m, t, s, c, stdout)
125 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
126 #define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
127 #else
128 #define SP(t, s, c) /* nothing */
129 #define AT(t, p1, p2, s1, s2) /* nothing */
130 #define NOTE(s) /* nothing */
131 #endif
134 - matcher - the actual matching engine
135 == static int matcher(register struct re_guts *g, RCHAR_T *string, \
136 == size_t nmatch, regmatch_t pmatch[], int eflags);
138 static int /* 0 success, REG_NOMATCH failure */
139 matcher(g, string, nmatch, pmatch, eflags)
140 register struct re_guts *g;
141 RCHAR_T *string;
142 size_t nmatch;
143 regmatch_t pmatch[];
144 int eflags;
146 register RCHAR_T *endp;
147 register int i;
148 struct match mv;
149 register struct match *m = &mv;
150 register RCHAR_T *dp;
151 const register sopno gf = g->firststate+1; /* +1 for OEND */
152 const register sopno gl = g->laststate;
153 RCHAR_T *start;
154 RCHAR_T *stop;
156 /* simplify the situation where possible */
157 if (g->cflags&REG_NOSUB)
158 nmatch = 0;
159 if (eflags&REG_STARTEND) {
160 start = string + pmatch[0].rm_so;
161 stop = string + pmatch[0].rm_eo;
162 } else {
163 start = string;
164 stop = start + STRLEN(start);
166 if (stop < start)
167 return(REG_INVARG);
169 /* prescreening; this does wonders for this rather slow code */
170 if (g->must != NULL) {
171 for (dp = start; dp < stop; dp++)
172 if (*dp == g->must[0] && stop - dp >= g->mlen &&
173 MEMCMP(dp, g->must, (size_t)g->mlen) == 0)
174 break;
175 if (dp == stop) /* we didn't find g->must */
176 return(REG_NOMATCH);
179 /* match struct setup */
180 m->g = g;
181 m->eflags = eflags;
182 m->pmatch = NULL;
183 m->lastpos = NULL;
184 m->offp = string;
185 m->beginp = start;
186 m->endp = stop;
187 STATESETUP(m, 4);
188 SETUP(m->st);
189 SETUP(m->fresh);
190 SETUP(m->tmp);
191 SETUP(m->empty);
192 CLEAR(m->empty);
194 /* this loop does only one repetition except for backrefs */
195 for (;;) {
196 endp = fast(m, start, stop, gf, gl);
197 if (endp == NULL) { /* a miss */
198 STATETEARDOWN(m);
199 return(REG_NOMATCH);
201 if (nmatch == 0 && !g->backrefs)
202 break; /* no further info needed */
204 /* where? */
205 assert(m->coldp != NULL);
206 for (;;) {
207 NOTE("finding start");
208 endp = slow(m, m->coldp, stop, gf, gl);
209 if (endp != NULL)
210 break;
211 assert(m->coldp < m->endp);
212 m->coldp++;
214 if (nmatch == 1 && !g->backrefs)
215 break; /* no further info needed */
217 /* oh my, he wants the subexpressions... */
218 if (m->pmatch == NULL)
219 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
220 sizeof(regmatch_t));
221 if (m->pmatch == NULL) {
222 STATETEARDOWN(m);
223 return(REG_ESPACE);
225 for (i = 1; i <= m->g->nsub; i++)
226 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
227 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
228 NOTE("dissecting");
229 dp = dissect(m, m->coldp, endp, gf, gl);
230 } else {
231 if (g->nplus > 0 && m->lastpos == NULL)
232 m->lastpos = (RCHAR_T **)malloc((g->nplus+1) *
233 sizeof(RCHAR_T *));
234 if (g->nplus > 0 && m->lastpos == NULL) {
235 free(m->pmatch);
236 STATETEARDOWN(m);
237 return(REG_ESPACE);
239 NOTE("backref dissect");
240 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
242 if (dp != NULL)
243 break;
245 /* uh-oh... we couldn't find a subexpression-level match */
246 assert(g->backrefs); /* must be back references doing it */
247 assert(g->nplus == 0 || m->lastpos != NULL);
248 for (;;) {
249 if (dp != NULL || endp <= m->coldp)
250 break; /* defeat */
251 NOTE("backoff");
252 endp = slow(m, m->coldp, endp-1, gf, gl);
253 if (endp == NULL)
254 break; /* defeat */
255 /* try it on a shorter possibility */
256 #ifndef NDEBUG
257 for (i = 1; i <= m->g->nsub; i++) {
258 assert(m->pmatch[i].rm_so == -1);
259 assert(m->pmatch[i].rm_eo == -1);
261 #endif
262 NOTE("backoff dissect");
263 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
265 assert(dp == NULL || dp == endp);
266 if (dp != NULL) /* found a shorter one */
267 break;
269 /* despite initial appearances, there is no match here */
270 NOTE("false alarm");
271 start = m->coldp + 1; /* recycle starting later */
272 assert(start <= stop);
275 /* fill in the details if requested */
276 if (nmatch > 0) {
277 pmatch[0].rm_so = m->coldp - m->offp;
278 pmatch[0].rm_eo = endp - m->offp;
280 if (nmatch > 1) {
281 assert(m->pmatch != NULL);
282 for (i = 1; i < nmatch; i++)
283 if (i <= m->g->nsub)
284 pmatch[i] = m->pmatch[i];
285 else {
286 pmatch[i].rm_so = -1;
287 pmatch[i].rm_eo = -1;
291 if (m->pmatch != NULL)
292 free((char *)m->pmatch);
293 if (m->lastpos != NULL)
294 free((char *)m->lastpos);
295 STATETEARDOWN(m);
296 return(0);
300 - dissect - figure out what matched what, no back references
301 == static RCHAR_T *dissect(register struct match *m, RCHAR_T *start, \
302 == RCHAR_T *stop, sopno startst, sopno stopst);
304 static RCHAR_T * /* == stop (success) always */
305 dissect(m, start, stop, startst, stopst)
306 register struct match *m;
307 RCHAR_T *start;
308 RCHAR_T *stop;
309 sopno startst;
310 sopno stopst;
312 register int i;
313 register sopno ss; /* start sop of current subRE */
314 register sopno es; /* end sop of current subRE */
315 register RCHAR_T *sp; /* start of string matched by it */
316 register RCHAR_T *stp; /* string matched by it cannot pass here */
317 register RCHAR_T *rest; /* start of rest of string */
318 register RCHAR_T *tail; /* string unmatched by rest of RE */
319 register sopno ssub; /* start sop of subsubRE */
320 register sopno esub; /* end sop of subsubRE */
321 register RCHAR_T *ssp; /* start of string matched by subsubRE */
322 register RCHAR_T *sep; /* end of string matched by subsubRE */
323 register RCHAR_T *oldssp; /* previous ssp */
324 register RCHAR_T *dp;
326 AT("diss", start, stop, startst, stopst);
327 sp = start;
328 for (ss = startst; ss < stopst; ss = es) {
329 /* identify end of subRE */
330 es = ss;
331 switch (OP(m->g->strip[es])) {
332 case OPLUS_:
333 case OQUEST_:
334 es += OPND(m->g->strip[es]);
335 break;
336 case OCH_:
337 while (OP(m->g->strip[es]) != O_CH)
338 es += OPND(m->g->strip[es]);
339 break;
341 es++;
343 /* figure out what it matched */
344 switch (OP(m->g->strip[ss])) {
345 case OEND:
346 assert(nope);
347 break;
348 case OCHAR:
349 sp++;
350 break;
351 case OBOL:
352 case OEOL:
353 case OBOW:
354 case OEOW:
355 break;
356 case OANY:
357 case OANYOF:
358 sp++;
359 break;
360 case OBACK_:
361 case O_BACK:
362 assert(nope);
363 break;
364 /* cases where length of match is hard to find */
365 case OQUEST_:
366 stp = stop;
367 for (;;) {
368 /* how long could this one be? */
369 rest = slow(m, sp, stp, ss, es);
370 assert(rest != NULL); /* it did match */
371 /* could the rest match the rest? */
372 tail = slow(m, rest, stop, es, stopst);
373 if (tail == stop)
374 break; /* yes! */
375 /* no -- try a shorter match for this one */
376 stp = rest - 1;
377 assert(stp >= sp); /* it did work */
379 ssub = ss + 1;
380 esub = es - 1;
381 /* did innards match? */
382 if (slow(m, sp, rest, ssub, esub) != NULL) {
383 dp = dissect(m, sp, rest, ssub, esub);
384 assert(dp == rest);
385 } else /* no */
386 assert(sp == rest);
387 sp = rest;
388 break;
389 case OPLUS_:
390 stp = stop;
391 for (;;) {
392 /* how long could this one be? */
393 rest = slow(m, sp, stp, ss, es);
394 assert(rest != NULL); /* it did match */
395 /* could the rest match the rest? */
396 tail = slow(m, rest, stop, es, stopst);
397 if (tail == stop)
398 break; /* yes! */
399 /* no -- try a shorter match for this one */
400 stp = rest - 1;
401 assert(stp >= sp); /* it did work */
403 ssub = ss + 1;
404 esub = es - 1;
405 ssp = sp;
406 oldssp = ssp;
407 for (;;) { /* find last match of innards */
408 sep = slow(m, ssp, rest, ssub, esub);
409 if (sep == NULL || sep == ssp)
410 break; /* failed or matched null */
411 oldssp = ssp; /* on to next try */
412 ssp = sep;
414 if (sep == NULL) {
415 /* last successful match */
416 sep = ssp;
417 ssp = oldssp;
419 assert(sep == rest); /* must exhaust substring */
420 assert(slow(m, ssp, sep, ssub, esub) == rest);
421 dp = dissect(m, ssp, sep, ssub, esub);
422 assert(dp == sep);
423 sp = rest;
424 break;
425 case OCH_:
426 stp = stop;
427 for (;;) {
428 /* how long could this one be? */
429 rest = slow(m, sp, stp, ss, es);
430 assert(rest != NULL); /* it did match */
431 /* could the rest match the rest? */
432 tail = slow(m, rest, stop, es, stopst);
433 if (tail == stop)
434 break; /* yes! */
435 /* no -- try a shorter match for this one */
436 stp = rest - 1;
437 assert(stp >= sp); /* it did work */
439 ssub = ss + 1;
440 esub = ss + OPND(m->g->strip[ss]) - 1;
441 assert(OP(m->g->strip[esub]) == OOR1);
442 for (;;) { /* find first matching branch */
443 if (slow(m, sp, rest, ssub, esub) == rest)
444 break; /* it matched all of it */
445 /* that one missed, try next one */
446 assert(OP(m->g->strip[esub]) == OOR1);
447 esub++;
448 assert(OP(m->g->strip[esub]) == OOR2);
449 ssub = esub + 1;
450 esub += OPND(m->g->strip[esub]);
451 if (OP(m->g->strip[esub]) == OOR2)
452 esub--;
453 else
454 assert(OP(m->g->strip[esub]) == O_CH);
456 dp = dissect(m, sp, rest, ssub, esub);
457 assert(dp == rest);
458 sp = rest;
459 break;
460 case O_PLUS:
461 case O_QUEST:
462 case OOR1:
463 case OOR2:
464 case O_CH:
465 assert(nope);
466 break;
467 case OLPAREN:
468 i = OPND(m->g->strip[ss]);
469 assert(0 < i && i <= m->g->nsub);
470 m->pmatch[i].rm_so = sp - m->offp;
471 break;
472 case ORPAREN:
473 i = OPND(m->g->strip[ss]);
474 assert(0 < i && i <= m->g->nsub);
475 m->pmatch[i].rm_eo = sp - m->offp;
476 break;
477 default: /* uh oh */
478 assert(nope);
479 break;
483 assert(sp == stop);
484 return(sp);
488 - backref - figure out what matched what, figuring in back references
489 == static RCHAR_T *backref(register struct match *m, RCHAR_T *start, \
490 == RCHAR_T *stop, sopno startst, sopno stopst, sopno lev);
492 static RCHAR_T * /* == stop (success) or NULL (failure) */
493 backref(m, start, stop, startst, stopst, lev)
494 register struct match *m;
495 RCHAR_T *start;
496 RCHAR_T *stop;
497 sopno startst;
498 sopno stopst;
499 sopno lev; /* PLUS nesting level */
501 register int i;
502 register sopno ss; /* start sop of current subRE */
503 register RCHAR_T *sp; /* start of string matched by it */
504 register sopno ssub; /* start sop of subsubRE */
505 register sopno esub; /* end sop of subsubRE */
506 register RCHAR_T *ssp; /* start of string matched by subsubRE */
507 register RCHAR_T *dp;
508 register size_t len;
509 register int hard;
510 register sop s;
511 register regoff_t offsave;
512 register cset *cs;
514 AT("back", start, stop, startst, stopst);
515 sp = start;
517 /* get as far as we can with easy stuff */
518 hard = 0;
519 for (ss = startst; !hard && ss < stopst; ss++)
520 switch (OP(s = m->g->strip[ss])) {
521 case OCHAR:
522 if (sp == stop || *sp++ != (RCHAR_T)OPND(s))
523 return(NULL);
524 break;
525 case OANY:
526 if (sp == stop)
527 return(NULL);
528 sp++;
529 break;
530 case OANYOF:
531 cs = &m->g->sets[OPND(s)];
532 if (sp == stop || !CHIN(cs, *sp++))
533 return(NULL);
534 break;
535 case OBOL:
536 if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
537 (sp < m->endp && *(sp-1) == '\n' &&
538 (m->g->cflags&REG_NEWLINE)) )
539 { /* yes */ }
540 else
541 return(NULL);
542 break;
543 case OEOL:
544 if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
545 (sp < m->endp && *sp == '\n' &&
546 (m->g->cflags&REG_NEWLINE)) )
547 { /* yes */ }
548 else
549 return(NULL);
550 break;
551 case OBOW:
552 if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
553 (sp < m->endp && *(sp-1) == '\n' &&
554 (m->g->cflags&REG_NEWLINE)) ||
555 (sp > m->beginp &&
556 !ISWORD(*(sp-1))) ) &&
557 (sp < m->endp && ISWORD(*sp)) )
558 { /* yes */ }
559 else
560 return(NULL);
561 break;
562 case OEOW:
563 if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
564 (sp < m->endp && *sp == '\n' &&
565 (m->g->cflags&REG_NEWLINE)) ||
566 (sp < m->endp && !ISWORD(*sp)) ) &&
567 (sp > m->beginp && ISWORD(*(sp-1))) )
568 { /* yes */ }
569 else
570 return(NULL);
571 break;
572 case O_QUEST:
573 break;
574 case OOR1: /* matches null but needs to skip */
575 ss++;
576 s = m->g->strip[ss];
577 do {
578 assert(OP(s) == OOR2);
579 ss += OPND(s);
580 } while (OP(s = m->g->strip[ss]) != O_CH);
581 /* note that the ss++ gets us past the O_CH */
582 break;
583 default: /* have to make a choice */
584 hard = 1;
585 break;
587 if (!hard) { /* that was it! */
588 if (sp != stop)
589 return(NULL);
590 return(sp);
592 ss--; /* adjust for the for's final increment */
594 /* the hard stuff */
595 AT("hard", sp, stop, ss, stopst);
596 s = m->g->strip[ss];
597 switch (OP(s)) {
598 case OBACK_: /* the vilest depths */
599 i = OPND(s);
600 assert(0 < i && i <= m->g->nsub);
601 if (m->pmatch[i].rm_eo == -1)
602 return(NULL);
603 assert(m->pmatch[i].rm_so != -1);
604 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
605 assert(stop - m->beginp >= len);
606 if (sp > stop - len)
607 return(NULL); /* not enough left to match */
608 ssp = m->offp + m->pmatch[i].rm_so;
609 if (memcmp(sp, ssp, len) != 0)
610 return(NULL);
611 while (m->g->strip[ss] != SOP(O_BACK, i))
612 ss++;
613 return(backref(m, sp+len, stop, ss+1, stopst, lev));
614 break;
615 case OQUEST_: /* to null or not */
616 dp = backref(m, sp, stop, ss+1, stopst, lev);
617 if (dp != NULL)
618 return(dp); /* not */
619 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
620 break;
621 case OPLUS_:
622 assert(m->lastpos != NULL);
623 assert(lev+1 <= m->g->nplus);
624 m->lastpos[lev+1] = sp;
625 return(backref(m, sp, stop, ss+1, stopst, lev+1));
626 break;
627 case O_PLUS:
628 if (sp == m->lastpos[lev]) /* last pass matched null */
629 return(backref(m, sp, stop, ss+1, stopst, lev-1));
630 /* try another pass */
631 m->lastpos[lev] = sp;
632 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
633 if (dp == NULL)
634 return(backref(m, sp, stop, ss+1, stopst, lev-1));
635 else
636 return(dp);
637 break;
638 case OCH_: /* find the right one, if any */
639 ssub = ss + 1;
640 esub = ss + OPND(s) - 1;
641 assert(OP(m->g->strip[esub]) == OOR1);
642 for (;;) { /* find first matching branch */
643 dp = backref(m, sp, stop, ssub, esub, lev);
644 if (dp != NULL)
645 return(dp);
646 /* that one missed, try next one */
647 if (OP(m->g->strip[esub]) == O_CH)
648 return(NULL); /* there is none */
649 esub++;
650 assert(OP(m->g->strip[esub]) == OOR2);
651 ssub = esub + 1;
652 esub += OPND(m->g->strip[esub]);
653 if (OP(m->g->strip[esub]) == OOR2)
654 esub--;
655 else
656 assert(OP(m->g->strip[esub]) == O_CH);
658 break;
659 case OLPAREN: /* must undo assignment if rest fails */
660 i = OPND(s);
661 assert(0 < i && i <= m->g->nsub);
662 offsave = m->pmatch[i].rm_so;
663 m->pmatch[i].rm_so = sp - m->offp;
664 dp = backref(m, sp, stop, ss+1, stopst, lev);
665 if (dp != NULL)
666 return(dp);
667 m->pmatch[i].rm_so = offsave;
668 return(NULL);
669 break;
670 case ORPAREN: /* must undo assignment if rest fails */
671 i = OPND(s);
672 assert(0 < i && i <= m->g->nsub);
673 offsave = m->pmatch[i].rm_eo;
674 m->pmatch[i].rm_eo = sp - m->offp;
675 dp = backref(m, sp, stop, ss+1, stopst, lev);
676 if (dp != NULL)
677 return(dp);
678 m->pmatch[i].rm_eo = offsave;
679 return(NULL);
680 break;
681 default: /* uh oh */
682 assert(nope);
683 break;
686 /* "can't happen" */
687 assert(nope);
688 /* NOTREACHED */
692 - fast - step through the string at top speed
693 == static RCHAR_T *fast(register struct match *m, RCHAR_T *start, \
694 == RCHAR_T *stop, sopno startst, sopno stopst);
696 static RCHAR_T * /* where tentative match ended, or NULL */
697 fast(m, start, stop, startst, stopst)
698 register struct match *m;
699 RCHAR_T *start;
700 RCHAR_T *stop;
701 sopno startst;
702 sopno stopst;
704 register states st = m->st;
705 register states fresh = m->fresh;
706 register states tmp = m->tmp;
707 register RCHAR_T *p = start;
708 register int c = (start == m->beginp) ? OUT : *(start-1);
709 register int lastc; /* previous c */
710 register int flagch;
711 register int i;
712 register RCHAR_T *coldp; /* last p after which no match was underway */
714 CLEAR(st);
715 SET1(st, startst);
716 st = step(m->g, startst, stopst, st, NOTHING, st);
717 ASSIGN(fresh, st);
718 SP("start", st, *p);
719 coldp = NULL;
720 for (;;) {
721 /* next character */
722 lastc = c;
723 c = (p == m->endp) ? OUT : *p;
724 if (EQ(st, fresh))
725 coldp = p;
727 /* is there an EOL and/or BOL between lastc and c? */
728 flagch = '\0';
729 i = 0;
730 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
731 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
732 flagch = BOL;
733 i = m->g->nbol;
735 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
736 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
737 flagch = (flagch == BOL) ? BOLEOL : EOL;
738 i += m->g->neol;
740 if (i != 0) {
741 for (; i > 0; i--)
742 st = step(m->g, startst, stopst, st, flagch, st);
743 SP("boleol", st, c);
746 /* how about a word boundary? */
747 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
748 (c != OUT && ISWORD(c)) ) {
749 flagch = BOW;
751 if ( (lastc != OUT && ISWORD(lastc)) &&
752 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
753 flagch = EOW;
755 if (flagch == BOW || flagch == EOW) {
756 st = step(m->g, startst, stopst, st, flagch, st);
757 SP("boweow", st, c);
760 /* are we done? */
761 if (ISSET(st, stopst) || p == stop)
762 break; /* NOTE BREAK OUT */
764 /* no, we must deal with this character */
765 ASSIGN(tmp, st);
766 ASSIGN(st, fresh);
767 assert(c != OUT);
768 st = step(m->g, startst, stopst, tmp, c, st);
769 SP("aft", st, c);
770 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
771 p++;
774 assert(coldp != NULL);
775 m->coldp = coldp;
776 if (ISSET(st, stopst))
777 return(p+1);
778 else
779 return(NULL);
783 - slow - step through the string more deliberately
784 == static RCHAR_T *slow(register struct match *m, RCHAR_T *start, \
785 == RCHAR_T *stop, sopno startst, sopno stopst);
787 static RCHAR_T * /* where it ended */
788 slow(m, start, stop, startst, stopst)
789 register struct match *m;
790 RCHAR_T *start;
791 RCHAR_T *stop;
792 sopno startst;
793 sopno stopst;
795 register states st = m->st;
796 register states empty = m->empty;
797 register states tmp = m->tmp;
798 register RCHAR_T *p = start;
799 register int c = (start == m->beginp) ? OUT : *(start-1);
800 register int lastc; /* previous c */
801 register int flagch;
802 register int i;
803 register RCHAR_T *matchp; /* last p at which a match ended */
805 AT("slow", start, stop, startst, stopst);
806 CLEAR(st);
807 SET1(st, startst);
808 SP("sstart", st, *p);
809 st = step(m->g, startst, stopst, st, NOTHING, st);
810 matchp = NULL;
811 for (;;) {
812 /* next character */
813 lastc = c;
814 c = (p == m->endp) ? OUT : *p;
816 /* is there an EOL and/or BOL between lastc and c? */
817 flagch = '\0';
818 i = 0;
819 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
820 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
821 flagch = BOL;
822 i = m->g->nbol;
824 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
825 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
826 flagch = (flagch == BOL) ? BOLEOL : EOL;
827 i += m->g->neol;
829 if (i != 0) {
830 for (; i > 0; i--)
831 st = step(m->g, startst, stopst, st, flagch, st);
832 SP("sboleol", st, c);
835 /* how about a word boundary? */
836 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
837 (c != OUT && ISWORD(c)) ) {
838 flagch = BOW;
840 if ( (lastc != OUT && ISWORD(lastc)) &&
841 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
842 flagch = EOW;
844 if (flagch == BOW || flagch == EOW) {
845 st = step(m->g, startst, stopst, st, flagch, st);
846 SP("sboweow", st, c);
849 /* are we done? */
850 if (ISSET(st, stopst))
851 matchp = p;
852 if (EQ(st, empty) || p == stop)
853 break; /* NOTE BREAK OUT */
855 /* no, we must deal with this character */
856 ASSIGN(tmp, st);
857 ASSIGN(st, empty);
858 assert(c != OUT);
859 st = step(m->g, startst, stopst, tmp, c, st);
860 SP("saft", st, c);
861 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
862 p++;
865 return(matchp);
870 - step - map set of states reachable before char to set reachable after
871 == static states step(register struct re_guts *g, sopno start, sopno stop, \
872 == register states bef, int ch, register states aft);
873 == #define BOL (OUT+1)
874 == #define EOL (BOL+1)
875 == #define BOLEOL (BOL+2)
876 == #define NOTHING (BOL+3)
877 == #define BOW (BOL+4)
878 == #define EOW (BOL+5)
879 == #define CODEMAX (BOL+5) // highest code used
880 == #define NONCHAR(c) ((c) > CHAR_MAX)
881 == #define NNONCHAR (CODEMAX-CHAR_MAX)
883 static states
884 step(g, start, stop, bef, ch, aft)
885 register struct re_guts *g;
886 sopno start; /* start state within strip */
887 sopno stop; /* state after stop state within strip */
888 register states bef; /* states reachable before */
889 int ch; /* character or NONCHAR code */
890 register states aft; /* states already known reachable after */
892 register cset *cs;
893 register sop s;
894 register sopno pc;
895 register onestate here; /* note, macros know this name */
896 register sopno look;
897 register int i;
899 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
900 s = g->strip[pc];
901 switch (OP(s)) {
902 case OEND:
903 assert(pc == stop-1);
904 break;
905 case OCHAR:
906 /* only characters can match */
907 assert(!NONCHAR(ch) || ch != (RCHAR_T)OPND(s));
908 if (ch == (RCHAR_T)OPND(s))
909 FWD(aft, bef, 1);
910 break;
911 case OBOL:
912 if (ch == BOL || ch == BOLEOL)
913 FWD(aft, bef, 1);
914 break;
915 case OEOL:
916 if (ch == EOL || ch == BOLEOL)
917 FWD(aft, bef, 1);
918 break;
919 case OBOW:
920 if (ch == BOW)
921 FWD(aft, bef, 1);
922 break;
923 case OEOW:
924 if (ch == EOW)
925 FWD(aft, bef, 1);
926 break;
927 case OANY:
928 if (!NONCHAR(ch))
929 FWD(aft, bef, 1);
930 break;
931 case OANYOF:
932 cs = &g->sets[OPND(s)];
933 if (!NONCHAR(ch) && CHIN(cs, ch))
934 FWD(aft, bef, 1);
935 break;
936 case OBACK_: /* ignored here */
937 case O_BACK:
938 FWD(aft, aft, 1);
939 break;
940 case OPLUS_: /* forward, this is just an empty */
941 FWD(aft, aft, 1);
942 break;
943 case O_PLUS: /* both forward and back */
944 FWD(aft, aft, 1);
945 i = ISSETBACK(aft, OPND(s));
946 BACK(aft, aft, OPND(s));
947 if (!i && ISSETBACK(aft, OPND(s))) {
948 /* oho, must reconsider loop body */
949 pc -= OPND(s) + 1;
950 INIT(here, pc);
952 break;
953 case OQUEST_: /* two branches, both forward */
954 FWD(aft, aft, 1);
955 FWD(aft, aft, OPND(s));
956 break;
957 case O_QUEST: /* just an empty */
958 FWD(aft, aft, 1);
959 break;
960 case OLPAREN: /* not significant here */
961 case ORPAREN:
962 FWD(aft, aft, 1);
963 break;
964 case OCH_: /* mark the first two branches */
965 FWD(aft, aft, 1);
966 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
967 FWD(aft, aft, OPND(s));
968 break;
969 case OOR1: /* done a branch, find the O_CH */
970 if (ISSTATEIN(aft, here)) {
971 for (look = 1;
972 OP(s = g->strip[pc+look]) != O_CH;
973 look += OPND(s))
974 assert(OP(s) == OOR2);
975 FWD(aft, aft, look);
977 break;
978 case OOR2: /* propagate OCH_'s marking */
979 FWD(aft, aft, 1);
980 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
981 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
982 FWD(aft, aft, OPND(s));
984 break;
985 case O_CH: /* just empty */
986 FWD(aft, aft, 1);
987 break;
988 default: /* ooooops... */
989 assert(nope);
990 break;
994 return(aft);
997 #ifdef REDEBUG
999 - print - print a set of states
1000 == #ifdef REDEBUG
1001 == static void print(struct match *m, char *caption, states st, \
1002 == int ch, FILE *d);
1003 == #endif
1005 static void
1006 print(m, caption, st, ch, d)
1007 struct match *m;
1008 char *caption;
1009 states st;
1010 int ch;
1011 FILE *d;
1013 register struct re_guts *g = m->g;
1014 register int i;
1015 register int first = 1;
1017 if (!(m->eflags&REG_TRACE))
1018 return;
1020 fprintf(d, "%s", caption);
1021 if (ch != '\0')
1022 fprintf(d, " %s", pchar(ch));
1023 for (i = 0; i < g->nstates; i++)
1024 if (ISSET(st, i)) {
1025 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1026 first = 0;
1028 fprintf(d, "\n");
1032 - at - print current situation
1033 == #ifdef REDEBUG
1034 == static void at(struct match *m, char *title, char *start, char *stop, \
1035 == sopno startst, sopno stopst);
1036 == #endif
1038 static void
1039 at(m, title, start, stop, startst, stopst)
1040 struct match *m;
1041 char *title;
1042 char *start;
1043 char *stop;
1044 sopno startst;
1045 sopno stopst;
1047 if (!(m->eflags&REG_TRACE))
1048 return;
1050 printf("%s %s-", title, pchar(*start));
1051 printf("%s ", pchar(*stop));
1052 printf("%ld-%ld\n", (long)startst, (long)stopst);
1055 #ifndef PCHARDONE
1056 #define PCHARDONE /* never again */
1058 - pchar - make a character printable
1059 == #ifdef REDEBUG
1060 == static char *pchar(int ch);
1061 == #endif
1063 * Is this identical to regchar() over in debug.c? Well, yes. But a
1064 * duplicate here avoids having a debugging-capable regexec.o tied to
1065 * a matching debug.o, and this is convenient. It all disappears in
1066 * the non-debug compilation anyway, so it doesn't matter much.
1068 static char * /* -> representation */
1069 pchar(ch)
1070 int ch;
1072 static char pbuf[10];
1074 if (isprint(ch) || ch == ' ')
1075 sprintf(pbuf, "%c", ch);
1076 else
1077 sprintf(pbuf, "\\%o", ch);
1078 return(pbuf);
1080 #endif
1081 #endif
1083 #undef matcher
1084 #undef fast
1085 #undef slow
1086 #undef dissect
1087 #undef backref
1088 #undef step
1089 #undef print
1090 #undef at
1091 #undef match