* When the locale is not set up to UTF-8, alpine might determine the width
[alpine.git] / regex / engine.c
blobe9bb5d4bc25da7819ff265423c5dd1fb1834df5d
1 /*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
37 * @(#)engine.c 8.5 (Berkeley) 3/20/94
41 * The matching engine and friends. This file is #included by regexec.c
42 * after suitable #defines of a variety of macros used herein, so that
43 * different state representations can be used without duplicating masses
44 * of code.
47 #ifdef SNAMES
48 #define matcher smatcher
49 #define fast sfast
50 #define slow sslow
51 #define dissect sdissect
52 #define backref sbackref
53 #define step sstep
54 #define print sprint
55 #define at sat
56 #define match smat
57 #endif
58 #ifdef LNAMES
59 #define matcher lmatcher
60 #define fast lfast
61 #define slow lslow
62 #define dissect ldissect
63 #define backref lbackref
64 #define step lstep
65 #define print lprint
66 #define at lat
67 #define match lmat
68 #endif
70 /* another structure passed up and down to avoid zillions of parameters */
71 struct match {
72 struct re_guts *g;
73 int eflags;
74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
75 char *offp; /* offsets work from here */
76 char *beginp; /* start of string -- virtual NUL precedes */
77 char *endp; /* end of string -- virtual NUL here */
78 char *coldp; /* can be no match starting before here */
79 char **lastpos; /* [nplus+1] */
80 STATEVARS;
81 states st; /* current states */
82 states fresh; /* states for a fresh start */
83 states tmp; /* temporary */
84 states empty; /* empty set of states */
87 /* ========= begin header generated by ./mkh ========= */
88 #ifdef __cplusplus
89 extern "C" {
90 #endif
92 /* === engine.c === */
93 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
94 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
95 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
96 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
97 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
98 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
99 #define BOL (OUT+1)
100 #define EOL (BOL+1)
101 #define BOLEOL (BOL+2)
102 #define NOTHING (BOL+3)
103 #define BOW (BOL+4)
104 #define EOW (BOL+5)
105 #define CODEMAX (BOL+5) /* highest code used */
106 #define NONCHAR(c) ((c) > CHAR_MAX)
107 #define NNONCHAR (CODEMAX-CHAR_MAX)
108 #ifdef REDEBUG
109 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
110 #endif
111 #ifdef REDEBUG
112 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
113 #endif
114 #ifdef REDEBUG
115 static char *pchar __P((int ch));
116 #endif
118 #ifdef __cplusplus
120 #endif
121 /* ========= end header generated by ./mkh ========= */
123 #ifdef REDEBUG
124 #define SP(t, s, c) print(m, t, s, c, stdout)
125 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
126 #define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
127 #else
128 #define SP(t, s, c) /* nothing */
129 #define AT(t, p1, p2, s1, s2) /* nothing */
130 #define NOTE(s) /* nothing */
131 #endif
134 - matcher - the actual matching engine
135 == static int matcher(register struct re_guts *g, char *string, \
136 == size_t nmatch, regmatch_t pmatch[], int eflags);
138 static int /* 0 success, REG_NOMATCH failure */
139 matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
141 register char *endp;
142 register int i;
143 struct match mv;
144 register struct match *m = &mv;
145 register char *dp;
146 const register sopno gf = g->firststate+1; /* +1 for OEND */
147 const register sopno gl = g->laststate;
148 char *start;
149 char *stop;
151 /* simplify the situation where possible */
152 if (g->cflags&REG_NOSUB)
153 nmatch = 0;
154 if (eflags&REG_STARTEND) {
155 start = string + pmatch[0].rm_so;
156 stop = string + pmatch[0].rm_eo;
157 } else {
158 start = string;
159 stop = start + strlen(start);
161 if (stop < start)
162 return(REG_INVARG);
164 /* prescreening; this does wonders for this rather slow code */
165 if (g->must != NULL) {
166 for (dp = start; dp < stop; dp++)
167 if (*dp == g->must[0] && stop - dp >= g->mlen &&
168 memcmp(dp, g->must, (size_t)g->mlen) == 0)
169 break;
170 if (dp == stop) /* we didn't find g->must */
171 return(REG_NOMATCH);
174 /* match struct setup */
175 m->g = g;
176 m->eflags = eflags;
177 m->pmatch = NULL;
178 m->lastpos = NULL;
179 m->offp = string;
180 m->beginp = start;
181 m->endp = stop;
182 STATESETUP(m, 4);
183 SETUP(m->st);
184 SETUP(m->fresh);
185 SETUP(m->tmp);
186 SETUP(m->empty);
187 CLEAR(m->empty);
189 /* this loop does only one repetition except for backrefs */
190 for (;;) {
191 endp = fast(m, start, stop, gf, gl);
192 if (endp == NULL) { /* a miss */
193 STATETEARDOWN(m);
194 return(REG_NOMATCH);
196 if (nmatch == 0 && !g->backrefs)
197 break; /* no further info needed */
199 /* where? */
200 assert(m->coldp != NULL);
201 for (;;) {
202 NOTE("finding start");
203 endp = slow(m, m->coldp, stop, gf, gl);
204 if (endp != NULL)
205 break;
206 assert(m->coldp < m->endp);
207 m->coldp++;
209 if (nmatch == 1 && !g->backrefs)
210 break; /* no further info needed */
212 /* oh my, he wants the subexpressions... */
213 if (m->pmatch == NULL)
214 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
215 sizeof(regmatch_t));
216 if (m->pmatch == NULL) {
217 STATETEARDOWN(m);
218 return(REG_ESPACE);
220 for (i = 1; i <= m->g->nsub; i++)
221 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
222 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
223 NOTE("dissecting");
224 dp = dissect(m, m->coldp, endp, gf, gl);
225 } else {
226 if (g->nplus > 0 && m->lastpos == NULL)
227 m->lastpos = (char **)malloc((g->nplus+1) *
228 sizeof(char *));
229 if (g->nplus > 0 && m->lastpos == NULL) {
230 free(m->pmatch);
231 STATETEARDOWN(m);
232 return(REG_ESPACE);
234 NOTE("backref dissect");
235 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
237 if (dp != NULL)
238 break;
240 /* uh-oh... we couldn't find a subexpression-level match */
241 assert(g->backrefs); /* must be back references doing it */
242 assert(g->nplus == 0 || m->lastpos != NULL);
243 for (;;) {
244 if (dp != NULL || endp <= m->coldp)
245 break; /* defeat */
246 NOTE("backoff");
247 endp = slow(m, m->coldp, endp-1, gf, gl);
248 if (endp == NULL)
249 break; /* defeat */
250 /* try it on a shorter possibility */
251 #ifndef NDEBUG
252 for (i = 1; i <= m->g->nsub; i++) {
253 assert(m->pmatch[i].rm_so == -1);
254 assert(m->pmatch[i].rm_eo == -1);
256 #endif
257 NOTE("backoff dissect");
258 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
260 assert(dp == NULL || dp == endp);
261 if (dp != NULL) /* found a shorter one */
262 break;
264 /* despite initial appearances, there is no match here */
265 NOTE("false alarm");
266 start = m->coldp + 1; /* recycle starting later */
267 assert(start <= stop);
270 /* fill in the details if requested */
271 if (nmatch > 0) {
272 pmatch[0].rm_so = m->coldp - m->offp;
273 pmatch[0].rm_eo = endp - m->offp;
275 if (nmatch > 1) {
276 assert(m->pmatch != NULL);
277 for (i = 1; i < nmatch; i++)
278 if (i <= m->g->nsub)
279 pmatch[i] = m->pmatch[i];
280 else {
281 pmatch[i].rm_so = -1;
282 pmatch[i].rm_eo = -1;
286 if (m->pmatch != NULL)
287 free((char *)m->pmatch);
288 if (m->lastpos != NULL)
289 free((char *)m->lastpos);
290 STATETEARDOWN(m);
291 return(0);
295 - dissect - figure out what matched what, no back references
296 == static char *dissect(register struct match *m, char *start, \
297 == char *stop, sopno startst, sopno stopst);
299 static char * /* == stop (success) always */
300 dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
302 register int i;
303 register sopno ss; /* start sop of current subRE */
304 register sopno es; /* end sop of current subRE */
305 register char *sp; /* start of string matched by it */
306 register char *stp; /* string matched by it cannot pass here */
307 register char *rest; /* start of rest of string */
308 register char *tail; /* string unmatched by rest of RE */
309 register sopno ssub; /* start sop of subsubRE */
310 register sopno esub; /* end sop of subsubRE */
311 register char *ssp; /* start of string matched by subsubRE */
312 register char *sep; /* end of string matched by subsubRE */
313 register char *oldssp; /* previous ssp */
314 register char *dp;
316 AT("diss", start, stop, startst, stopst);
317 sp = start;
318 for (ss = startst; ss < stopst; ss = es) {
319 /* identify end of subRE */
320 es = ss;
321 switch (OP(m->g->strip[es])) {
322 case OPLUS_:
323 case OQUEST_:
324 es += OPND(m->g->strip[es]);
325 break;
326 case OCH_:
327 while (OP(m->g->strip[es]) != O_CH)
328 es += OPND(m->g->strip[es]);
329 break;
331 es++;
333 /* figure out what it matched */
334 switch (OP(m->g->strip[ss])) {
335 case OEND:
336 assert(nope);
337 break;
338 case OCHAR:
339 sp++;
340 break;
341 case OBOL:
342 case OEOL:
343 case OBOW:
344 case OEOW:
345 break;
346 case OANY:
347 case OANYOF:
348 sp++;
349 break;
350 case OBACK_:
351 case O_BACK:
352 assert(nope);
353 break;
354 /* cases where length of match is hard to find */
355 case OQUEST_:
356 stp = stop;
357 for (;;) {
358 /* how long could this one be? */
359 rest = slow(m, sp, stp, ss, es);
360 assert(rest != NULL); /* it did match */
361 /* could the rest match the rest? */
362 tail = slow(m, rest, stop, es, stopst);
363 if (tail == stop)
364 break; /* yes! */
365 /* no -- try a shorter match for this one */
366 stp = rest - 1;
367 assert(stp >= sp); /* it did work */
369 ssub = ss + 1;
370 esub = es - 1;
371 /* did innards match? */
372 if (slow(m, sp, rest, ssub, esub) != NULL) {
373 dp = dissect(m, sp, rest, ssub, esub);
374 assert(dp == rest);
375 } else /* no */
376 assert(sp == rest);
377 sp = rest;
378 break;
379 case OPLUS_:
380 stp = stop;
381 for (;;) {
382 /* how long could this one be? */
383 rest = slow(m, sp, stp, ss, es);
384 assert(rest != NULL); /* it did match */
385 /* could the rest match the rest? */
386 tail = slow(m, rest, stop, es, stopst);
387 if (tail == stop)
388 break; /* yes! */
389 /* no -- try a shorter match for this one */
390 stp = rest - 1;
391 assert(stp >= sp); /* it did work */
393 ssub = ss + 1;
394 esub = es - 1;
395 ssp = sp;
396 oldssp = ssp;
397 for (;;) { /* find last match of innards */
398 sep = slow(m, ssp, rest, ssub, esub);
399 if (sep == NULL || sep == ssp)
400 break; /* failed or matched null */
401 oldssp = ssp; /* on to next try */
402 ssp = sep;
404 if (sep == NULL) {
405 /* last successful match */
406 sep = ssp;
407 ssp = oldssp;
409 assert(sep == rest); /* must exhaust substring */
410 assert(slow(m, ssp, sep, ssub, esub) == rest);
411 dp = dissect(m, ssp, sep, ssub, esub);
412 assert(dp == sep);
413 sp = rest;
414 break;
415 case OCH_:
416 stp = stop;
417 for (;;) {
418 /* how long could this one be? */
419 rest = slow(m, sp, stp, ss, es);
420 assert(rest != NULL); /* it did match */
421 /* could the rest match the rest? */
422 tail = slow(m, rest, stop, es, stopst);
423 if (tail == stop)
424 break; /* yes! */
425 /* no -- try a shorter match for this one */
426 stp = rest - 1;
427 assert(stp >= sp); /* it did work */
429 ssub = ss + 1;
430 esub = ss + OPND(m->g->strip[ss]) - 1;
431 assert(OP(m->g->strip[esub]) == OOR1);
432 for (;;) { /* find first matching branch */
433 if (slow(m, sp, rest, ssub, esub) == rest)
434 break; /* it matched all of it */
435 /* that one missed, try next one */
436 assert(OP(m->g->strip[esub]) == OOR1);
437 esub++;
438 assert(OP(m->g->strip[esub]) == OOR2);
439 ssub = esub + 1;
440 esub += OPND(m->g->strip[esub]);
441 if (OP(m->g->strip[esub]) == OOR2)
442 esub--;
443 else
444 assert(OP(m->g->strip[esub]) == O_CH);
446 dp = dissect(m, sp, rest, ssub, esub);
447 assert(dp == rest);
448 sp = rest;
449 break;
450 case O_PLUS:
451 case O_QUEST:
452 case OOR1:
453 case OOR2:
454 case O_CH:
455 assert(nope);
456 break;
457 case OLPAREN:
458 i = OPND(m->g->strip[ss]);
459 assert(0 < i && i <= m->g->nsub);
460 m->pmatch[i].rm_so = sp - m->offp;
461 break;
462 case ORPAREN:
463 i = OPND(m->g->strip[ss]);
464 assert(0 < i && i <= m->g->nsub);
465 m->pmatch[i].rm_eo = sp - m->offp;
466 break;
467 default: /* uh oh */
468 assert(nope);
469 break;
473 assert(sp == stop);
474 return(sp);
478 - backref - figure out what matched what, figuring in back references
479 == static char *backref(register struct match *m, char *start, \
480 == char *stop, sopno startst, sopno stopst, sopno lev);
482 static char * /* == stop (success) or NULL (failure) */
483 backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)
485 register int i;
486 register sopno ss; /* start sop of current subRE */
487 register char *sp; /* start of string matched by it */
488 register sopno ssub; /* start sop of subsubRE */
489 register sopno esub; /* end sop of subsubRE */
490 register char *ssp; /* start of string matched by subsubRE */
491 register char *dp;
492 register size_t len;
493 register int hard;
494 register sop s;
495 register regoff_t offsave;
496 register cset *cs;
498 AT("back", start, stop, startst, stopst);
499 sp = start;
501 /* get as far as we can with easy stuff */
502 hard = 0;
503 for (ss = startst; !hard && ss < stopst; ss++)
504 switch (OP(s = m->g->strip[ss])) {
505 case OCHAR:
506 if (sp == stop || *sp++ != (char)OPND(s))
507 return(NULL);
508 break;
509 case OANY:
510 if (sp == stop)
511 return(NULL);
512 sp++;
513 break;
514 case OANYOF:
515 cs = &m->g->sets[OPND(s)];
516 if (sp == stop || !CHIN(cs, *sp++))
517 return(NULL);
518 break;
519 case OBOL:
520 if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
521 (sp < m->endp && *(sp-1) == '\n' &&
522 (m->g->cflags&REG_NEWLINE)) )
523 { /* yes */ }
524 else
525 return(NULL);
526 break;
527 case OEOL:
528 if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
529 (sp < m->endp && *sp == '\n' &&
530 (m->g->cflags&REG_NEWLINE)) )
531 { /* yes */ }
532 else
533 return(NULL);
534 break;
535 case OBOW:
536 if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
537 (sp < m->endp && *(sp-1) == '\n' &&
538 (m->g->cflags&REG_NEWLINE)) ||
539 (sp > m->beginp &&
540 !ISWORD(*(sp-1))) ) &&
541 (sp < m->endp && ISWORD(*sp)) )
542 { /* yes */ }
543 else
544 return(NULL);
545 break;
546 case OEOW:
547 if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
548 (sp < m->endp && *sp == '\n' &&
549 (m->g->cflags&REG_NEWLINE)) ||
550 (sp < m->endp && !ISWORD(*sp)) ) &&
551 (sp > m->beginp && ISWORD(*(sp-1))) )
552 { /* yes */ }
553 else
554 return(NULL);
555 break;
556 case O_QUEST:
557 break;
558 case OOR1: /* matches null but needs to skip */
559 ss++;
560 s = m->g->strip[ss];
561 do {
562 assert(OP(s) == OOR2);
563 ss += OPND(s);
564 } while (OP(s = m->g->strip[ss]) != O_CH);
565 /* note that the ss++ gets us past the O_CH */
566 break;
567 default: /* have to make a choice */
568 hard = 1;
569 break;
571 if (!hard) { /* that was it! */
572 if (sp != stop)
573 return(NULL);
574 return(sp);
576 ss--; /* adjust for the for's final increment */
578 /* the hard stuff */
579 AT("hard", sp, stop, ss, stopst);
580 s = m->g->strip[ss];
581 switch (OP(s)) {
582 case OBACK_: /* the vilest depths */
583 i = OPND(s);
584 assert(0 < i && i <= m->g->nsub);
585 if (m->pmatch[i].rm_eo == -1)
586 return(NULL);
587 assert(m->pmatch[i].rm_so != -1);
588 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
589 assert(stop - m->beginp >= len);
590 if (sp > stop - len)
591 return(NULL); /* not enough left to match */
592 ssp = m->offp + m->pmatch[i].rm_so;
593 if (memcmp(sp, ssp, len) != 0)
594 return(NULL);
595 while (m->g->strip[ss] != SOP(O_BACK, i))
596 ss++;
597 return(backref(m, sp+len, stop, ss+1, stopst, lev));
598 break;
599 case OQUEST_: /* to null or not */
600 dp = backref(m, sp, stop, ss+1, stopst, lev);
601 if (dp != NULL)
602 return(dp); /* not */
603 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
604 break;
605 case OPLUS_:
606 assert(m->lastpos != NULL);
607 assert(lev+1 <= m->g->nplus);
608 m->lastpos[lev+1] = sp;
609 return(backref(m, sp, stop, ss+1, stopst, lev+1));
610 break;
611 case O_PLUS:
612 if (sp == m->lastpos[lev]) /* last pass matched null */
613 return(backref(m, sp, stop, ss+1, stopst, lev-1));
614 /* try another pass */
615 m->lastpos[lev] = sp;
616 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
617 if (dp == NULL)
618 return(backref(m, sp, stop, ss+1, stopst, lev-1));
619 else
620 return(dp);
621 break;
622 case OCH_: /* find the right one, if any */
623 ssub = ss + 1;
624 esub = ss + OPND(s) - 1;
625 assert(OP(m->g->strip[esub]) == OOR1);
626 for (;;) { /* find first matching branch */
627 dp = backref(m, sp, stop, ssub, esub, lev);
628 if (dp != NULL)
629 return(dp);
630 /* that one missed, try next one */
631 if (OP(m->g->strip[esub]) == O_CH)
632 return(NULL); /* there is none */
633 esub++;
634 assert(OP(m->g->strip[esub]) == OOR2);
635 ssub = esub + 1;
636 esub += OPND(m->g->strip[esub]);
637 if (OP(m->g->strip[esub]) == OOR2)
638 esub--;
639 else
640 assert(OP(m->g->strip[esub]) == O_CH);
642 break;
643 case OLPAREN: /* must undo assignment if rest fails */
644 i = OPND(s);
645 assert(0 < i && i <= m->g->nsub);
646 offsave = m->pmatch[i].rm_so;
647 m->pmatch[i].rm_so = sp - m->offp;
648 dp = backref(m, sp, stop, ss+1, stopst, lev);
649 if (dp != NULL)
650 return(dp);
651 m->pmatch[i].rm_so = offsave;
652 return(NULL);
653 break;
654 case ORPAREN: /* must undo assignment if rest fails */
655 i = OPND(s);
656 assert(0 < i && i <= m->g->nsub);
657 offsave = m->pmatch[i].rm_eo;
658 m->pmatch[i].rm_eo = sp - m->offp;
659 dp = backref(m, sp, stop, ss+1, stopst, lev);
660 if (dp != NULL)
661 return(dp);
662 m->pmatch[i].rm_eo = offsave;
663 return(NULL);
664 break;
665 default: /* uh oh */
666 assert(nope);
667 break;
670 /* "can't happen" */
671 assert(nope);
672 /* NOTREACHED */
673 return(NULL);
677 - fast - step through the string at top speed
678 == static char *fast(register struct match *m, char *start, \
679 == char *stop, sopno startst, sopno stopst);
681 static char * /* where tentative match ended, or NULL */
682 fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
684 register states st = m->st;
685 register states fresh = m->fresh;
686 register states tmp = m->tmp;
687 register char *p = start;
688 register int c = (start == m->beginp) ? OUT : *(start-1);
689 register int lastc; /* previous c */
690 register int flagch;
691 register int i;
692 register char *coldp; /* last p after which no match was underway */
694 CLEAR(st);
695 SET1(st, startst);
696 st = step(m->g, startst, stopst, st, NOTHING, st);
697 ASSIGN(fresh, st);
698 SP("start", st, *p);
699 coldp = NULL;
700 for (;;) {
701 /* next character */
702 lastc = c;
703 c = (p == m->endp) ? OUT : *p;
704 if (EQ(st, fresh))
705 coldp = p;
707 /* is there an EOL and/or BOL between lastc and c? */
708 flagch = '\0';
709 i = 0;
710 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
711 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
712 flagch = BOL;
713 i = m->g->nbol;
715 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
716 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
717 flagch = (flagch == BOL) ? BOLEOL : EOL;
718 i += m->g->neol;
720 if (i != 0) {
721 for (; i > 0; i--)
722 st = step(m->g, startst, stopst, st, flagch, st);
723 SP("boleol", st, c);
726 /* how about a word boundary? */
727 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
728 (c != OUT && ISWORD(c)) ) {
729 flagch = BOW;
731 if ( (lastc != OUT && ISWORD(lastc)) &&
732 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
733 flagch = EOW;
735 if (flagch == BOW || flagch == EOW) {
736 st = step(m->g, startst, stopst, st, flagch, st);
737 SP("boweow", st, c);
740 /* are we done? */
741 if (ISSET(st, stopst) || p == stop)
742 break; /* NOTE BREAK OUT */
744 /* no, we must deal with this character */
745 ASSIGN(tmp, st);
746 ASSIGN(st, fresh);
747 assert(c != OUT);
748 st = step(m->g, startst, stopst, tmp, c, st);
749 SP("aft", st, c);
750 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
751 p++;
754 assert(coldp != NULL);
755 m->coldp = coldp;
756 if (ISSET(st, stopst))
757 return(p+1);
758 else
759 return(NULL);
763 - slow - step through the string more deliberately
764 == static char *slow(register struct match *m, char *start, \
765 == char *stop, sopno startst, sopno stopst);
767 static char * /* where it ended */
768 slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
770 register states st = m->st;
771 register states empty = m->empty;
772 register states tmp = m->tmp;
773 register char *p = start;
774 register int c = (start == m->beginp) ? OUT : *(start-1);
775 register int lastc; /* previous c */
776 register int flagch;
777 register int i;
778 register char *matchp; /* last p at which a match ended */
780 AT("slow", start, stop, startst, stopst);
781 CLEAR(st);
782 SET1(st, startst);
783 SP("sstart", st, *p);
784 st = step(m->g, startst, stopst, st, NOTHING, st);
785 matchp = NULL;
786 for (;;) {
787 /* next character */
788 lastc = c;
789 c = (p == m->endp) ? OUT : *p;
791 /* is there an EOL and/or BOL between lastc and c? */
792 flagch = '\0';
793 i = 0;
794 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
795 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
796 flagch = BOL;
797 i = m->g->nbol;
799 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
800 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
801 flagch = (flagch == BOL) ? BOLEOL : EOL;
802 i += m->g->neol;
804 if (i != 0) {
805 for (; i > 0; i--)
806 st = step(m->g, startst, stopst, st, flagch, st);
807 SP("sboleol", st, c);
810 /* how about a word boundary? */
811 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
812 (c != OUT && ISWORD(c)) ) {
813 flagch = BOW;
815 if ( (lastc != OUT && ISWORD(lastc)) &&
816 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
817 flagch = EOW;
819 if (flagch == BOW || flagch == EOW) {
820 st = step(m->g, startst, stopst, st, flagch, st);
821 SP("sboweow", st, c);
824 /* are we done? */
825 if (ISSET(st, stopst))
826 matchp = p;
827 if (EQ(st, empty) || p == stop)
828 break; /* NOTE BREAK OUT */
830 /* no, we must deal with this character */
831 ASSIGN(tmp, st);
832 ASSIGN(st, empty);
833 assert(c != OUT);
834 st = step(m->g, startst, stopst, tmp, c, st);
835 SP("saft", st, c);
836 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
837 p++;
840 return(matchp);
845 - step - map set of states reachable before char to set reachable after
846 == static states step(register struct re_guts *g, sopno start, sopno stop, \
847 == register states bef, int ch, register states aft);
848 == #define BOL (OUT+1)
849 == #define EOL (BOL+1)
850 == #define BOLEOL (BOL+2)
851 == #define NOTHING (BOL+3)
852 == #define BOW (BOL+4)
853 == #define EOW (BOL+5)
854 == #define CODEMAX (BOL+5) // highest code used
855 == #define NONCHAR(c) ((c) > CHAR_MAX)
856 == #define NNONCHAR (CODEMAX-CHAR_MAX)
858 static states
859 step(struct re_guts *g,
860 sopno start, /* start state within strip */
861 sopno stop, /* state after stop state within strip */
862 states bef, /* states reachable before */
863 int ch, /* character or NONCHAR code */
864 states aft) /* states already known reachable after */
866 register cset *cs;
867 register sop s;
868 register sopno pc;
869 register onestate here; /* note, macros know this name */
870 register sopno look;
871 register int i;
873 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
874 s = g->strip[pc];
875 switch (OP(s)) {
876 case OEND:
877 assert(pc == stop-1);
878 break;
879 case OCHAR:
880 /* only characters can match */
881 assert(!NONCHAR(ch) || ch != (char)OPND(s));
882 if (ch == (char)OPND(s))
883 FWD(aft, bef, 1);
884 break;
885 case OBOL:
886 if (ch == BOL || ch == BOLEOL)
887 FWD(aft, bef, 1);
888 break;
889 case OEOL:
890 if (ch == EOL || ch == BOLEOL)
891 FWD(aft, bef, 1);
892 break;
893 case OBOW:
894 if (ch == BOW)
895 FWD(aft, bef, 1);
896 break;
897 case OEOW:
898 if (ch == EOW)
899 FWD(aft, bef, 1);
900 break;
901 case OANY:
902 if (!NONCHAR(ch))
903 FWD(aft, bef, 1);
904 break;
905 case OANYOF:
906 cs = &g->sets[OPND(s)];
907 if (!NONCHAR(ch) && CHIN(cs, ch))
908 FWD(aft, bef, 1);
909 break;
910 case OBACK_: /* ignored here */
911 case O_BACK:
912 FWD(aft, aft, 1);
913 break;
914 case OPLUS_: /* forward, this is just an empty */
915 FWD(aft, aft, 1);
916 break;
917 case O_PLUS: /* both forward and back */
918 FWD(aft, aft, 1);
919 i = ISSETBACK(aft, OPND(s));
920 BACK(aft, aft, OPND(s));
921 if (!i && ISSETBACK(aft, OPND(s))) {
922 /* oho, must reconsider loop body */
923 pc -= OPND(s) + 1;
924 INIT(here, pc);
926 break;
927 case OQUEST_: /* two branches, both forward */
928 FWD(aft, aft, 1);
929 FWD(aft, aft, OPND(s));
930 break;
931 case O_QUEST: /* just an empty */
932 FWD(aft, aft, 1);
933 break;
934 case OLPAREN: /* not significant here */
935 case ORPAREN:
936 FWD(aft, aft, 1);
937 break;
938 case OCH_: /* mark the first two branches */
939 FWD(aft, aft, 1);
940 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
941 FWD(aft, aft, OPND(s));
942 break;
943 case OOR1: /* done a branch, find the O_CH */
944 if (ISSTATEIN(aft, here)) {
945 for (look = 1;
946 OP(s = g->strip[pc+look]) != O_CH;
947 look += OPND(s))
948 assert(OP(s) == OOR2);
949 FWD(aft, aft, look);
951 break;
952 case OOR2: /* propagate OCH_'s marking */
953 FWD(aft, aft, 1);
954 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
955 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
956 FWD(aft, aft, OPND(s));
958 break;
959 case O_CH: /* just empty */
960 FWD(aft, aft, 1);
961 break;
962 default: /* ooooops... */
963 assert(nope);
964 break;
968 return(aft);
971 #ifdef REDEBUG
973 - print - print a set of states
974 == #ifdef REDEBUG
975 == static void print(struct match *m, char *caption, states st, \
976 == int ch, FILE *d);
977 == #endif
979 static void
980 print(struct match *m, char *caption, states st, int ch, FILE *d)
982 register struct re_guts *g = m->g;
983 register int i;
984 register int first = 1;
986 if (!(m->eflags&REG_TRACE))
987 return;
989 fprintf(d, "%s", caption);
990 if (ch != '\0')
991 fprintf(d, " %s", pchar(ch));
992 for (i = 0; i < g->nstates; i++)
993 if (ISSET(st, i)) {
994 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
995 first = 0;
997 fprintf(d, "\n");
1001 - at - print current situation
1002 == #ifdef REDEBUG
1003 == static void at(struct match *m, char *title, char *start, char *stop, \
1004 == sopno startst, sopno stopst);
1005 == #endif
1007 static void
1008 at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)
1010 if (!(m->eflags&REG_TRACE))
1011 return;
1013 printf("%s %s-", title, pchar(*start));
1014 printf("%s ", pchar(*stop));
1015 printf("%ld-%ld\n", (long)startst, (long)stopst);
1018 #ifndef PCHARDONE
1019 #define PCHARDONE /* never again */
1021 - pchar - make a character printable
1022 == #ifdef REDEBUG
1023 == static char *pchar(int ch);
1024 == #endif
1026 * Is this identical to regchar() over in debug.c? Well, yes. But a
1027 * duplicate here avoids having a debugging-capable regexec.o tied to
1028 * a matching debug.o, and this is convenient. It all disappears in
1029 * the non-debug compilation anyway, so it doesn't matter much.
1031 static char * /* -> representation */
1032 pchar(int ch)
1034 static char pbuf[10];
1036 if (isprint(ch) || ch == ' ')
1037 sprintf(pbuf, "%c", ch);
1038 else
1039 sprintf(pbuf, "\\%o", ch);
1040 return(pbuf);
1042 #endif
1043 #endif
1045 #undef matcher
1046 #undef fast
1047 #undef slow
1048 #undef dissect
1049 #undef backref
1050 #undef step
1051 #undef print
1052 #undef at
1053 #undef match