Added package with the documentation and the examples
[lwc.git] / regexp.c
blob07720afa9dffaba5d1706af6a769df7e98b573be
1 /******************************************************************************
2 Regular Expression compiler
4 some kind of state machine is definitelly used. A parser too!
5 ******************************************************************************/
7 #include <ctype.h>
8 #include "global.h"
10 static bool regexp_case;
11 static int regexp_partition = 255;
12 static bool regexp_packed;
13 static int regexp_INFTY = 32000;
14 static bool regexp_noextract;
15 static bool regexp_noctbl = 1;
16 static char* regexp_recipe;
17 static bool regexp_anon;
18 static bool regexp_strfuncs = 0;
21 #ifdef DEBUG
22 void regexp_print (int *retons)
24 int i;
26 for (i = 0; retons [i]; i++)
27 if (retons [i] > 255)
28 PRINTF (" [%i] ", retons [i]);
29 else if (retons [i] > 31)
30 PRINTF (" %c ", retons [i]);
31 else
32 PRINTF (" -%i ", retons [i]);
34 PRINTF ("\n");
36 #endif
38 void rerror (char *s) {
39 fprintf (stderr, "regexp error: %s\n", s);
40 fprintf (stderr, "regexp error: In \"%s\"\n", regexp_recipe);
41 exit (1);
45 enum {
46 RE_SPECIAL_SSTART = 256, // ^
47 RE_SPECIAL_SEND, // $
48 RE_SPECIAL_OPARENTH, // (
49 RE_SPECIAL_CPARENTH, // )
50 RE_SPECIAL_OBRAK, // [
51 RE_SPECIAL_ONBRAK, // [^
52 RE_SPECIAL_RANGE, // -
53 RE_SPECIAL_CBRAK, // ]
54 RE_SPECIAL_DOT, // .
55 RE_SPECIAL_QUESTION, // ?
56 RE_SPECIAL_STAR, // *
57 RE_SPECIAL_PLUS, // +
58 RE_SPECIAL_OR, // |
59 RE_SPECIAL_OBRACE, // {
60 RE_SPECIAL_CBRACE, // }
61 RE_SPECIAL_GRANTED, // ?/
63 RE_SPECIAL_UCLASS, // \c
65 RE_SPECIAL_STORP = 512,
66 RE_SPECIAL_CLASS = 2048
69 static inline int isstorp (int i)
70 { return i >= RE_SPECIAL_STORP && i < RE_SPECIAL_CLASS; }
71 static inline int isuclass (int i)
72 { return i >= RE_SPECIAL_UCLASS && i < RE_SPECIAL_STORP; }
75 #define OFFS(x) x-'a'
76 static struct {
77 char *def;
78 int op;
79 } uclass [OFFS ('z')] = {
80 [OFFS ('w')] = { "[a-zA-Z0-9_]", 0 },
81 [OFFS ('s')] = { "[ \t\r\n\f]", 0 },
82 [OFFS ('d')] = { "[0-9]", 0 }
85 typedef int *regstr;
89 static int escape (char *r, int i, regstr R)
91 #define EPARSE(x, y) break; case x: *R = y; return i;
92 #define ESCAPE(x) EPARSE (x, x)
94 switch (r [i]) {
95 default: {
96 char *endptr = endptr;
97 if (r [i] == 'x' && isxdigit (r [i+1]))
98 *R = strtol (r + i + 1, &endptr, 16);
99 else if (r [i] == '0')
100 *R = strtol (r + i, &endptr, 8);
101 else if (r [i] >= '1' && r [i] <= '9')
102 rerror ("Backreferences *not* supported");
103 else {
104 if (isalpha (r [i]))
105 *R = RE_SPECIAL_UCLASS + r [i];
106 else rerror ("undefined escape sequence");
107 return i;
109 return endptr - r - 1;
111 EPARSE ('n', '\n'); EPARSE ('t', '\t'); EPARSE ('f', '\f');
112 EPARSE ('r', '\r'); EPARSE ('a', '\a');
113 ESCAPE ('('); ESCAPE (')'); ESCAPE (']'); ESCAPE ('[');
114 ESCAPE ('{'); ESCAPE ('}'); ESCAPE ('.'); ESCAPE ('?');
115 ESCAPE ('*'); ESCAPE ('+'); ESCAPE ('-'); ESCAPE ('^');
116 ESCAPE ('$'); ESCAPE ('|'); ESCAPE ('\\');
120 static int nextr;
122 static void regexp_descape (char *r, regstr R)
124 #define XPARSE(x,y) break; case x: R [j++] = y
125 int i, j, np, b;
126 for (np = i = j = 0; r [i]; i++)
127 switch (r [i]) {
128 default: R [j++] = r [i];
129 XPARSE ('^', RE_SPECIAL_SSTART);
130 XPARSE ('$', RE_SPECIAL_SEND);
131 XPARSE ('.', RE_SPECIAL_DOT);
132 XPARSE ('?', RE_SPECIAL_QUESTION);
133 XPARSE ('*', RE_SPECIAL_STAR);
134 XPARSE ('+', RE_SPECIAL_PLUS);
135 XPARSE ('|', RE_SPECIAL_OR);
136 ncase ')': R [j++] = np ? (np--, RE_SPECIAL_CPARENTH) : ')';
137 XPARSE ('(', RE_SPECIAL_OPARENTH);
138 np++;
139 b = regexp_noextract;
140 if (r [i + 1] == '?')
141 switch (r [i += 2]) {
142 default: rerror ("embedded modifier not supported");
143 ncase '/': R [j++] = RE_SPECIAL_GRANTED;
144 case ':': b = 1;
146 if (!b) R [j++] = RE_SPECIAL_STORP + nextr++;
147 ncase '[':
148 if (r [++i] == '^') {
149 R [j++] = RE_SPECIAL_ONBRAK;
150 ++i;
151 } else R [j++] = RE_SPECIAL_OBRAK;
152 if (r [i] == '-')
153 R [j++] = r [i++];
154 for (; r [i]; i++) switch (r [i]) {
155 default: R [j++] = r [i];
156 ncase ']': R [j++] = RE_SPECIAL_CBRAK; goto close1;
157 case '-': R [j++] = RE_SPECIAL_RANGE;
158 ncase '\\': i = escape (r, i + 1, &R[j++]);
160 close1:
161 if (R [j - 2] == RE_SPECIAL_RANGE)
162 R [j - 2] = '-';
163 ncase '{': {
164 char *endptr = endptr;
165 R [j++] = RE_SPECIAL_OBRACE;
166 ++i;
167 if (r [i] < '0' || r [i] > '9')
168 rerror ("no number after '{'");
169 R [j++] = strtol (r + i, &endptr, 10);
170 if (r [i = endptr - r] == ',') {
171 ++i;
172 R [j++] = ',';
173 if (r [i] >= '0' && r [i] <= '9') {
174 R [j++] = strtol (r + i, &endptr, 10);
175 i = endptr - r;
178 if (r [i] != '}') rerror ("Unclosed '{'");
179 R [j++] = RE_SPECIAL_CBRACE;
181 ncase '\\': i = escape (r, i + 1, &R [j++]);
183 R [j] = 0;
184 if (np) rerror ("unclosed parenthesis");
187 typedef char *char_class;
189 static char_class *CClass;
190 static int nclasses;
192 static int add_class (char_class C)
194 int i;
195 for (i = 0; i < nclasses; i++)
196 if (!strcmp (CClass [i], C))
197 return i + RE_SPECIAL_CLASS;
198 CClass [nclasses] = strdup (C);
199 return RE_SPECIAL_CLASS + nclasses++;
202 static inline void OR_classes (char_class C1, char_class C2)
204 int i;
205 for (i = 0; i < 258; i++)
206 if (C1 [i] == '0' && C2 [i] == '1')
207 C1 [i] = '1';
210 static int mk_class (regstr, int*);
212 static int mk_uclass (int v)
214 int cb = tolower (v - RE_SPECIAL_UCLASS);
215 int iv = cb != v - RE_SPECIAL_UCLASS, i;
216 int R [64];
218 if (!uclass [OFFS (cb)].def)
219 rerror ("undefined user class");
221 if (uclass [OFFS (cb)].op)
222 rerror ("recusrive character class");
223 uclass [OFFS (cb)].op = 1;
224 regexp_descape (uclass [OFFS (cb)].def, R);
225 mk_class (R, &v);
226 v -= RE_SPECIAL_CLASS;
227 if (iv)
228 for (i = 1; i < 258; i++)
229 CClass [v][i] = CClass [v][i] == '1' ? '0' : '1';
230 uclass [OFFS (cb)].op = 0;
231 return v + RE_SPECIAL_CLASS;
234 static int mk_class (regstr R, int *p)
236 int i = 0, j1, j2, c;
237 int not = R [i++] == RE_SPECIAL_ONBRAK;
238 char base_class [258];
240 if (!not && R [i + 1] == RE_SPECIAL_CBRAK && R [i] < 256) {
241 *p = R [i];
242 return i + 2;
245 memset (base_class, '0', 258);
246 base_class [258] = 0;
248 while (R [i] != RE_SPECIAL_CBRAK)
249 if (R [i] < 256) {
250 if (R [i + 1] != RE_SPECIAL_RANGE) {
251 base_class [R [i++]] = '1';
252 continue;
254 if (R [i + 2] >= 256) rerror ("range to special");
255 if (R [i + 2] < R [i])
256 j1 = R [i + 2], j2 = R [i];
257 else j1 = R [i], j2 = R [i + 2];
258 while (j1 <= j2)
259 base_class [j1++] = '1';
260 i += 3;
261 } else if (isuclass (R [i])) {
262 j1 = mk_uclass (R [i++]) - RE_SPECIAL_CLASS;
263 for (c = 0; c < 256; c++)
264 if (CClass [j1][c] == '1')
265 base_class [c] = '1';
266 } else rerror ("was not expecting that in here");
268 if (not)
269 for (j1 = 1; j1 < 256; j1++)
270 base_class [j1] = base_class [j1] == '0' ? '1' : '0';
272 *p = add_class (base_class);
273 return i;
276 static void build_ctbl ();
277 static void compile_classes (regstr R)
279 int i, j, R2 [256];
280 char_class tmp [256];
281 CClass = tmp;
283 for (i = 0; R [i]; i++)
284 R2 [i] = R [i];
285 R2 [i] = 0;
287 for (i = j = 0; R2 [i]; i++)
288 if (R2 [i] == RE_SPECIAL_OBRAK || R2 [i] == RE_SPECIAL_ONBRAK)
289 i += mk_class (&R2 [i], &R [j++]);
290 else if (isuclass (R2 [i]))
291 R [j++] = mk_uclass (R2 [i]);
292 else R [j++] = R2 [i];
293 R [j] = 0;
294 build_ctbl ();
298 static unsigned int *ctbl;
300 static void build_ctbl ()
302 int i, j, v;
303 char_class C;
305 if (nclasses > 31) rerror ("not supported > 32 character classes");
307 for (i = 0; i < 258; i++)
308 ctbl [i] = 0;
310 for (i = 0; i < nclasses; i++)
311 for (C = CClass [i], v = 1 << i, j = 0; j < 257; j++)
312 if (C [j] == '1')
313 ctbl [j] |= v;
316 static int count1s (int b)
318 int i, s;
319 b = 1 << b;
320 for (i = 1, s = 0; i < 256; i++)
321 if (ctbl [i] & b) ++s;
322 return s;
324 //. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
325 static inline int re_skip_parenthesis (regstr R, int i)
327 int p;
329 for (p = 1; p; i++)
330 switch (R [i]) {
331 case RE_SPECIAL_OPARENTH: ++p;
332 ncase RE_SPECIAL_CPARENTH: --p;
333 ncase 0: rerror ("Unclosed parenthesis");
335 return i;
338 static int find_or (regstr R, int o[])
340 int i, n;
342 for (i = n = 0; R [i]; i++)
343 if (R [i] == RE_SPECIAL_OR)
344 o [n++] = i;
345 else if (R [i] == RE_SPECIAL_OPARENTH)
346 i = re_skip_parenthesis (R, i + 1) - 1;
347 if (n) o [n++] = i;
349 return n;
352 static int find_and (regstr R, int o[])
354 int i, n;
356 i = n = 0;
357 while (R [i]) {
358 o [n++] = i;
359 i = R [i] == RE_SPECIAL_OPARENTH ? re_skip_parenthesis (R, i + 1) : i + 1;
360 if (R [i] != RE_SPECIAL_STAR && R [i] != RE_SPECIAL_PLUS
361 && R [i] != RE_SPECIAL_QUESTION && R [i] != RE_SPECIAL_OBRACE)
362 continue;
363 if (R [i] == RE_SPECIAL_OBRACE)
364 while (R [i] != RE_SPECIAL_CBRACE) i++;
365 i++;
366 if (R [i] == RE_SPECIAL_QUESTION) i++;
368 o [n] = i;
370 return n;
373 // * * * * * * * * * * A.n.a.l.y.s.i.s * * * * * * * * * * * * *
374 // generate and optimize the ``subregular'' array which
375 // is more convenient. We can do analysis and put all
376 // the info for the code generator in the sr_pool[]
378 enum { SB_NIL, SB_TERM, SB_OR, SB_AND, SB_REPG, SB_REPN };
380 typedef int subReg;
382 typedef struct {
383 subReg *expr, expr1, referrer, this;
384 subReg rstate;
385 int n;
386 int store_at;
387 int type;
388 int min, max;
389 int from, to;
390 bool granted;
391 // optimizations
392 bool fsckd;
393 int backtrack;
394 long long *swtbl;
395 int save_fail, use_fail;
396 char *strn;
397 // for the code generator
398 int proton, save_success, gst, usemin;
399 } subregular;
401 static subregular **sr_pool;
402 static int nsr_pool;
403 static subReg root;
405 static void free_subregs ()
407 int i;
408 for (i = 0; i < nsr_pool; i++) {
409 if (sr_pool [i]->swtbl) free (sr_pool [i]->swtbl);
410 if (sr_pool [i]->strn) free (sr_pool [i]->strn);
411 free (sr_pool [i]);
415 static void set_repet (subregular *m, regstr R)
417 int j = 1;
418 switch (R [0]) {
419 case RE_SPECIAL_STAR: m->min = 0; m->max = regexp_INFTY;
420 ncase RE_SPECIAL_PLUS: m->min = 1; m->max = regexp_INFTY;
421 ncase RE_SPECIAL_QUESTION: m->min = 0; m->max = 1;
422 ncase RE_SPECIAL_OBRACE: m->min = R [j++];
423 if (R [j] == ',') {
424 if (R [j + 1] != RE_SPECIAL_CBRACE) {
425 m->max = R [j + 1];
426 j = 5;
427 } else {
428 m->max = regexp_INFTY;
429 j = 4;
431 } else {
432 m->max = m->min;
433 j = 3;
436 m->type = R [j] == RE_SPECIAL_QUESTION ? SB_REPN : SB_REPG;
439 static subregular *mallocsubr (int type, int store_at, int from, int to, subReg referrer)
441 subregular *m = (subregular*) malloc (sizeof *m);
442 m->proton = 0;
443 m->type = type;
444 m->store_at = store_at;
445 m->save_success = 0;
446 m->from = from;
447 m->to = to;
448 m->referrer = referrer;
449 m->backtrack = -1;
450 m->fsckd = 1;
451 m->swtbl = 0;
452 m->save_fail = m->use_fail = 0;
453 m->rstate = -1;
454 m->gst = 0;
455 m->usemin = 0;
456 m->granted = 0;
457 m->strn = 0;
458 return sr_pool [m->this = nsr_pool++] = m;
461 static subReg make_subregulars (regstr R, subReg referrer)
463 int n, ors [128], tmp [256], i, j, k, orst [64];
464 subregular *m, *mr;
466 if (!R [0])
467 return mallocsubr (SB_NIL, -1, 0, 0, referrer)->this;
469 if (n = find_or (R, ors)) {
470 int from, to;
471 from = regexp_INFTY, to = 0;
472 for (i = 0; i < n; i++) {
473 for (j = 0, k = i ? ors [i-1]+1 : 0; k < ors [i];)
474 tmp [j++] = R [k++];
475 tmp [j] = 0;
476 j = orst [i] = make_subregulars (tmp, -1);
477 if (sr_pool [j]->from < from) from = sr_pool [j]->from;
478 if (sr_pool [j]->to > to) to = sr_pool [j]->to;
480 orst [i] = -1;
481 m = mallocsubr (SB_OR, -1, from, to, referrer);
482 for (i = 0; i < n; i++)
483 sr_pool [orst [i]]->referrer = m->this;
484 m->n = i;
485 m->expr = intdup (orst);
486 return m->this;
489 n = find_and (R, ors);
491 if (n > 1) {
492 int from, to;
493 for (from = to = i = 0; i < n; i++) {
494 for (j = 0, k = ors [i]; k < ors [i+1];)
495 tmp [j++] = R [k++];
496 tmp [j] = 0;
497 j = orst [i] = make_subregulars (tmp, -1);
498 from += sr_pool [j]->from;
499 to += sr_pool [j]->to;
501 orst [i] = -1;
502 m = mallocsubr (SB_AND, -1, from, min (regexp_INFTY, to), referrer);
503 for (i = 0; i < n; i++)
504 sr_pool [orst [i]]->referrer = m->this;
505 m->n = i;
506 m->expr = intdup (orst);
507 return m->this;
510 if (R [0] == RE_SPECIAL_OPARENTH) {
511 int have_stor = isstorp (R [1]);
512 int skip1 = have_stor || R [1] == RE_SPECIAL_GRANTED;
513 int have_repet = R [j = re_skip_parenthesis (R, 1)] != 0;
515 for (i = 1 + skip1, k = 0; i < j - 1;)
516 tmp [k++] = R [i++];
517 tmp [k] = 0;
518 k = make_subregulars (tmp, -1);
519 if (have_stor) sr_pool [k]->store_at = R [1] - RE_SPECIAL_STORP;
520 sr_pool [k]->granted = skip1 && !have_stor;
521 if (!have_repet) {
522 sr_pool [k]->referrer = referrer;
523 return k;
526 m = mallocsubr (0, -1, 0, 0, referrer);
527 m->expr1 = k;
528 set_repet (m, &R [j]);
529 m->from = m->min * sr_pool [k]->from;
530 m->to = min (regexp_INFTY, m->max * sr_pool [k]->to);
531 sr_pool [k]->referrer = m->this;
532 return m->this;
535 m = mallocsubr (SB_TERM, -1, 1, 1, referrer);
536 m->expr1 = R [0];
537 if (!R [1])
538 return m->this;
540 mr = mallocsubr (0, -1, 0, 0, referrer);
541 m->referrer = mr->this;
542 mr->expr1 = m->this;
543 mr->store_at = -1;
544 set_repet (mr, &R [1]);
545 mr->from = mr->min; mr->to = mr->max;
546 return mr->this;
550 static void set_r_granted (subregular *m)
552 int i;
553 m->granted = 1;
554 switch (m->type) {
555 case SB_REPG:
556 case SB_REPN: set_r_granted (sr_pool [m->expr1]);
557 ncase SB_OR:
558 case SB_AND:
559 for (i = 0; i < m->n; i++)
560 set_r_granted (sr_pool [m->expr [i]]);
564 static void set_granted ()
566 int i;
567 subregular *m;
569 for (i = 2; i < nsr_pool; i++)
570 if ((m = sr_pool [i])->granted) {
571 if (m->from != m->to)
572 rerror ("Granted strings must be fixed size!");
573 set_r_granted (m);
576 #ifdef DEBUG
578 // debugging. print the sr_pool[] through the stages
580 static void print_subregularz ()
582 int j, n, i;
583 subregular *m;
585 PRINTF ("root=%i\n", root);
586 for (i = 0; i < nsr_pool; i++) {
587 m = sr_pool [i];
588 PRINTF ("%s%i(%i)[%i-%i]: ", m->store_at >= 0 ? COLS"S"COLE : "S", i, m->this, m->from, m->to);
590 switch (m->type) {
591 case SB_NIL: PRINTF (COLS"-"COLE"NIL-\n");
592 ncase -1: PRINTF (COLS"-"COLE"1 (off)\n");
593 ncase SB_TERM: if (!m->strn) {
594 int c = m->expr1;
595 if (c <= '~')
596 if (c >= ' ') PRINTF ("%c\n", c);
597 else PRINTF (" ctrl[%i]\n", c);
598 else if (c == RE_SPECIAL_DOT) PRINTF (COLS"."COLE"\n");
599 else if (c == RE_SPECIAL_SSTART) PRINTF (COLS"^"COLE"\n");
600 else if (c == RE_SPECIAL_SEND) PRINTF (COLS"$"COLE"\n");
601 else PRINTF ("class[%i]\n", c - RE_SPECIAL_CLASS);
602 } else {
603 PRINTF ("strncmp (\"%s\")\n", m->strn);
605 ncase SB_OR:
606 n = m->n;
607 for (j = 0; j < n; j++)
608 PRINTF ("S%i %s", m->expr [j], j == n-1 ? "" : COLS"| "COLE);
609 PRINTF (m->swtbl ? COLS" *switched*"COLE"\n" : "\n");
610 ncase SB_AND:
611 n = m->n;
612 for (j = 0; j < n; j++)
613 PRINTF ("S%i %s", m->expr [j], j == n-1 ? "\n" : COLS"+ "COLE);
614 ncase SB_REPG:
615 case SB_REPN:
616 PRINTF ("S%i "COLS"^"COLE" (%i %s %i) ", m->expr1, m->min,
617 m->type == SB_REPN ? "->" : "<-", m->max);
618 if (m->use_fail) PRINTF (COLS"*use fail%i* "COLE, m->use_fail);
619 if (m->fsckd) PRINTF (COLS"*fsckd*"COLE"\n");
620 else if (m->backtrack >= 0)
621 PRINTF (COLS"*backtrack %i!*"COLE"\n", m->backtrack);
622 else PRINTF (COLS"*unroll*"COLE"\n");
624 if (m->rstate != -1)
625 PRINTF (" this=%i rstate ---> S%i\n", m->this, m->rstate);
628 #endif
630 // silly optimizations
633 /*********************************************************
634 [1] FSCKD repetitioners - unroll recursion
636 Imagine this bad case /(fo|foo)*s/ against the string
637 "foofos". We can't unroll the recusrion: the fact that
638 it matches "fo" does not mean we can advance with a
639 positive match: "foo" can be matched if we don't advance
640 /(foo|fo)*s/ is OK on the other hand.
642 Anyway, if the repetitioner applies on fixed size we
643 are definitelly OK. If it is not fixed size and the
644 subexpression contains ORs or NON-GREEDY repetitioners,
645 suppose FSCKD. We could do futher analysis, but let's
646 suppose that repetitioners on non-fixed size *are* rare.
648 /(foo??)*s/ is FSCKD.
649 /(foo?)*s/ is OK.
651 it has to do with subsets...
652 *********************************************************/
653 static int has_ors_or_temperate (subregular *m)
655 switch (m->type) {
656 case SB_NIL:
657 case SB_TERM: return 0;
658 case SB_REPN:
659 case SB_OR: return 1;
660 case SB_REPG: return has_ors_or_temperate (sr_pool [m->expr1]);
661 default:
662 case SB_AND: {
663 int i;
664 for (i = 0; i < m->n; i++)
665 if (has_ors_or_temperate (sr_pool [m->expr [i]]))
666 return 1;
667 return 0;
671 static void optimize_fsckd_repetitioners ()
673 int i;
674 subregular *m, *m2;
676 for (i = 2; i < nsr_pool; i++) {
677 m = sr_pool [i];
678 if (m->type != SB_REPG && m->type != SB_REPN)
679 continue;
680 m->fsckd = 0;
681 m2 = sr_pool [m->expr1];
682 if (m2->from == m2->to)
683 continue;
684 m->fsckd = has_ors_or_temperate (m2);
685 if (m->type == SB_REPN
686 && sr_pool [m->expr1]->to != sr_pool [m->expr1]->from)
687 m->fsckd = 1;
690 /*********************************************************
691 [2] NO BACKTRACK on greedy repetitioners
693 This is a very important optimization. Suppose regexp
694 /a*@/ and the string "aaaaaaab". Once the search for
695 'a's stops at 'b', there's no need to backtrack hoping
696 for a less greedy match: the fact that so far we matched
697 [a] means that there's no way we can match [@]. The opt
698 is important because such constructs are very frequent.
700 Another interesting case is this: /\w+\d\d\d@/
701 where we do not backtrack more than 3. This is checked
702 only if the base of the repetitioner is one character and
703 as long as it is followed by fixed subexpressions.
704 Otherwise it gets very complicated.
706 In the case C1* C2 C3 C4, where C4 and C1 have no common
707 characters, we also check C2-C4 and C3-C4. Normally
708 we backtrack no more than 2. If C2 and C4 have no common
709 and C3 and C4 have no common we backtrack no more than 0!
710 *********************************************************/
711 static subReg next_subreg_norep (subReg s)
713 int i;
714 subReg p;
715 subregular *m;
716 do {
717 s = sr_pool [p = s]->referrer;
718 if (s == -1) return -1;
719 m = sr_pool [s];
720 if (m->type == SB_REPG || m->type == SB_REPN)
721 return -1;
722 if (m->type == SB_OR) continue;
723 if (m->type != SB_AND) rerror ("bugs");
724 for (i = 0; m->expr [i] != p; i++);
725 if (m->expr [i + 1] != -1)
726 return m->expr [i + 1];
727 } while (1);
730 static void set_class_from_term (char_class C, int c)
732 int i;
733 if (c < 258)
734 if (!regexp_case && c < 127 && isalpha (c))
735 C [toupper (c)] = C [tolower (c)] = '1';
736 else C [c] = '1';
737 else if (c == RE_SPECIAL_DOT)
738 for (i = 1; i < 256; i++)
739 C [i] = '1';
740 else {
741 c = 1 << (c - RE_SPECIAL_CLASS);
742 for (i = 0; i < 258; i++)
743 if (ctbl [i] & c)
744 C [i] = '1';
748 static void get_first_class (subReg s, char_class C)
750 if (s == -1) {
751 memset (C, '1', 258);
752 return;
754 int i;
755 subregular *m = sr_pool [s];
757 switch (m->type) {
758 case SB_REPG:
759 case SB_REPN: return get_first_class (m->expr1, C);
760 case SB_AND: return get_first_class (m->expr [0], C);
761 case SB_NIL: for (i = 0; i < 258; i++) C [i] = '1';
762 ncase SB_TERM: set_class_from_term (C, m->expr1);
763 ncase SB_OR:
764 for (i = 0; i < m->n; i++)
765 get_first_class (m->expr [i], C);
769 static int no_common (char_class C1, char_class C2)
771 int i;
772 for (i = 0; i < 258; i++)
773 if (C2 [i] == '1' && C1 [i] == '1')
774 return 0;
775 return 1;
778 static int test_repetitioner1 (subregular *m)
780 char C1 [259], C2 [259];
781 memset (C1, '0', 258); C1 [258] = 0;
782 memset (C2, '0', 258); C2 [258] = 0;
784 get_first_class (m->expr1, C1);
785 get_first_class (next_subreg_norep (m->this), C2);
786 return no_common (C1, C2);
789 static void get_nth_class_from_fixed (subregular *m, char_class C, int n)
791 int i, j;
792 switch (m->type) {
793 case SB_AND:
794 j = sr_pool [m->expr [i = 0]]->to;
795 while (n >= j)
796 j += sr_pool [m->expr [++i]]->to;
797 j -= sr_pool [m->expr [i]]->to;
798 get_nth_class_from_fixed (sr_pool [m->expr [i]], C, n - j);
799 ncase SB_OR:
800 for (i = 0; i < m->n; i++)
801 get_nth_class_from_fixed (sr_pool [m->expr [i]], C, n);
802 ncase SB_TERM:
803 set_class_from_term (C, m->expr1);
804 ncase SB_NIL: for (i = 0; i < 258; i++) C [i] = '1';
808 static void test_repetitioner2 (subregular *m)
810 int i, j, k, p;
811 subReg this = m->this, s = m->referrer;
812 subregular *m2;
814 if (s == -1) return;
815 m = sr_pool [s];
816 if (m->type != SB_AND) return;
817 for (i = 0; m->expr [i] != this; i++);
818 for (j = 0, k = ++i; i < m->n; i++) {
819 m2 = sr_pool [m->expr [i]];
820 if (m2->from != m2->to) break;
821 j += m2->from;
823 if (j < 2) return;
825 char C1 [259], C2 [259];
826 memset (C1, '0', 258); C1 [258] = 0;
827 get_first_class (sr_pool [this]->expr1, C1);
829 for (j = 0, i = p = k; i < m->n; i++) {
830 m2 = sr_pool [m->expr [i]];
831 for (k = 0; k < m2->to; k++, j++) {
832 memset (C2, '0', 258); C2 [258] = 0;
833 get_nth_class_from_fixed (m2, C2, k);
834 if (no_common (C1, C2))
835 goto ok;
838 return;
839 ok:;
840 //PRINTF ("Backtrack distance = %i\n", j);
841 int i2, k2, mto;
842 for (i2 = p; i2 <= i; i2++) {
843 m2 = sr_pool [m->expr [i2]];
844 mto = (i2 == i) ? k : m2->to - 1;
845 for (k2 = 0; k2 <= mto; k2++) {
846 memset (C1, '0', 258); C1 [258] = 0;
847 get_nth_class_from_fixed (m2, C1, k2);
848 if (no_common (C1, C2)) --j;
849 else goto dbreak;
852 dbreak:
853 //PRINTF ("Reduced to %i...\n", j);
854 sr_pool [this]->backtrack = j;
857 static void optimize_backtracking_repetitioners ()
859 int i;
860 subregular *m;
862 for (i = 2; i < nsr_pool; i++) {
863 m = sr_pool [i];
864 if (m->type != SB_REPG)
865 continue;
866 m->backtrack = -1;
867 if (test_repetitioner1 (m))
868 m->backtrack = 0;
869 else if (sr_pool [m->expr1]->type == SB_TERM)
870 test_repetitioner2 (m);
873 /*********************************************************
874 [3] NON-GREEDY jump
876 If a non-greedy repetitioner is followed by another
877 repetitioner, drop complexity in failure. For example
878 /.*?\w*?@/ and the text "abcdef1234,". Once the second
879 repetitioner (\w*?) stops at ',', the first one will
880 jump at it the next time: there is no need to go through
881 "bcdef1234" because the second repetitioner will always
882 halt at the same point.
884 This sounds rare but we make all regular expressions
885 fixed at the start by adding /^.*?/ in front of them.
886 So it is quite common and very useful so we won't shift
887 more than we have to. The optimization takes place
888 only when the repetitioner is on a character class and as
889 long as it is followed by character classes, up to the
890 first repetitioner. For example /^.*?abcd\w+/
891 will do it. It is interesting if the second repetitioner
892 has a big 'max' and possibly many match characters:
893 if it will walk far. An unfortunate case is this
894 /.*?ab?c\w+/ the (b?) repetitioner is too small for
895 us to activate the optimization and yet stops us
896 from proceeding up to the (\w+) which is interesting.
898 We can do more analysis and even pass it to OR's in
899 the future. For example: /.*?(bat|bee|fly)\w+/ ....
900 *********************************************************/
901 static void make_fixed ()
903 // make the regular expression fixed to start if
904 // possible by adding /^.*?/ in front of it.
905 // we can't do that if '^' already exists.
906 // a pathological case is (^a|bc)
907 char C [259];
908 memset (C, '0', 258); C [258] = 0;
909 get_first_class (root, C);
910 if (C [RE_SPECIAL_SSTART] == '1')
911 return;
913 subregular *m = sr_pool [root];
914 subregular *st = mallocsubr (SB_TERM, -1, 1, 1, 00);
915 subregular *dt = mallocsubr (SB_TERM, -1, 1, 1, 00);
916 subregular *ng = mallocsubr (SB_REPN, -1, 0, regexp_INFTY, 00);
917 subregular *nr = mallocsubr (SB_AND, -1, m->from, m->to, -1);
919 st->expr1 = RE_SPECIAL_SSTART;
920 st->referrer = nr->this;
921 dt->expr1 = RE_SPECIAL_DOT;
922 dt->referrer = ng->this;
923 ng->expr1 = dt->this;
924 ng->referrer = nr->this;
925 ng->fsckd = 0;
926 ng->min = 0;
927 ng->max = regexp_INFTY;
928 m->referrer = nr->this;
929 nr->n = 3;
930 subReg tmp [] = { st->this, ng->this, m->this, -1 };
931 nr->expr = intdup (tmp);
932 root = nr->this;
935 static subReg descend_to_repterm (subReg s)
937 subregular *m = sr_pool [s];
938 switch (m->type) {
939 case SB_OR:
940 case SB_TERM:
941 case SB_NIL: return -1;
942 case SB_AND: return descend_to_repterm (m->expr [0]);
943 default: return sr_pool [m->expr1]->type == SB_TERM ?
944 // do if repetitioner max > 4 walk far
945 m->max >= 4 ? m->this : -1 : -1;
949 static int get_term_sequence (subregular *m, int st, subReg seq[])
951 int i, n = 0;
952 subregular *r;
953 for (i = st; i < m->n; i++) {
954 r = sr_pool [m->expr [i]];
955 switch (r->type) {
956 default: if (sr_pool [r->expr1]->type == SB_TERM)
957 goto keep;
958 case SB_OR:
959 case SB_NIL: seq [n] = -1; return 0;
960 keep:
961 case SB_TERM: seq [n++] = r->this;
962 ncase SB_AND: if (!get_term_sequence (r, 0, seq + n))
963 return 0;
964 while (seq [n] != -1) n++;
967 seq [n] = -1;
968 return 1;
971 static void distant_jmp (subregular *m)
973 int j, i;
974 subReg sq [64];
975 subregular *a = sr_pool [m->referrer];
976 if (a->type != SB_AND) return;
977 for (j = 0; a->expr [j] != m->this; j++);
978 get_term_sequence (a, j + 1, sq);
980 for (i = 0; sq [i] != -1; i++)
981 if (in2 (sr_pool [sq [i]]->type, SB_REPG, SB_REPN))
982 break;
983 if (sq [i] == -1) return;
984 /* if too small / compared to how far, abort */
985 if (sr_pool [sq [i]]->max - i < 4) return;
986 for (j = 0; j < i + 1; j++)
987 sr_pool [sq [j]]->save_fail = sq [i];
988 m->use_fail = sq [i];
991 static void nongreedy2 (subregular *m, subReg s)
993 int i, c;
994 char C1 [259], C2 [259];
995 memset (C1, '0', 258); C1 [258] = 0;
996 memset (C2, '0', 258); C2 [258] = 0;
998 get_first_class (m->expr1, C1);
999 get_first_class (sr_pool [s]->expr1, C2);
1000 for (i = c = 0; i < 258; i++)
1001 if (C2 [i] == '1')
1002 ++c;
1003 /* threshold */
1004 if (c < 5) return;
1005 m->use_fail = s;
1006 sr_pool [s]->save_fail = s;
1009 static void optimize_nongreedy_jump ()
1011 int i;
1012 subregular *m;
1013 subReg s;
1015 make_fixed ();
1016 for (i = 2; i < nsr_pool; i++) {
1017 m = sr_pool [i];
1018 if (m->type != SB_REPN || m->fsckd)
1019 continue;
1020 if (sr_pool [m->expr1]->type != SB_TERM)
1021 continue;
1022 s = next_subreg_norep (m->this);
1023 if (s == -1 || (s = descend_to_repterm (s)) == -1)
1024 distant_jmp (m);
1025 else nongreedy2 (m, s);
1028 /*********************************************************
1029 [4] zero min
1031 Convert repetitioners with non-zero minimum times to
1032 repetitioners with zero minimum by expanding *min times
1033 the subexpression. For example /(f+)/ becomes /(ff*)/.
1035 We'll only do this for '+' and if we have 0 backtrack
1036 from optimization [2]. This is excellent because we don't
1037 count times.
1038 *********************************************************/
1039 static subregular *clone_copy (subregular *m)
1041 subregular *n = (subregular*) malloc (sizeof *n);
1042 *n = *m;
1043 return sr_pool [n->this = nsr_pool++] = n;
1046 static subReg clone_subReg (subReg s, subReg referrer)
1048 int i;
1049 subregular *m = sr_pool [s];
1050 subregular *n = clone_copy (m);
1052 n->referrer = referrer;
1053 switch (m->type) {
1054 case SB_REPN:
1055 case SB_REPG: n->expr1 = clone_subReg (m->expr1, n->this);
1056 ncase SB_AND:
1057 case SB_OR: for (i = 0; i < n->n; i++)
1058 n->expr [i] = clone_subReg (n->expr [i], n->this);
1060 return n->this;
1063 static void optimize_zeromin ()
1065 int i, all = nsr_pool;
1066 subregular *m;
1068 for (i = 2; i < all; i++) {
1069 if (!in2 ((m = sr_pool [i])->type, SB_REPG, SB_REPN))
1070 continue;
1071 if (m->min == 1 && m->max == regexp_INFTY) {
1072 subregular *r = clone_copy (m);
1073 r->from = r->min = 0;
1074 r->store_at = -1;
1075 r->referrer = m->this;
1076 subReg tmp [] = { clone_subReg (r->expr1, m->this), r->this, -1 };
1077 m->type = SB_AND;
1078 m->n = 2;
1079 m->expr = intdup (tmp);
1080 // -opt 3 adjust-
1081 sr_pool [tmp [0]]->save_fail = r->save_fail;
1085 /*********************************************************
1086 [5] SWITCH
1088 For OR expressions with more than 3 operands use a
1089 switch in C. For example /(Mon|Tue|Web|Thu|Fri|Sat|Sun)/
1090 will greatly benefit.
1092 TODO: non-fixed with no repetitioners is OK /cat|bird|penguin/
1093 *********************************************************/
1094 #define SWITCH_THRESHOLD 3
1096 static void rmv_first_class (subReg s)
1098 int i;
1099 subregular *m = sr_pool [s];
1101 switch (m->type) {
1102 case SB_AND:
1103 if (sr_pool [m->expr [0]]->type == SB_TERM) {
1104 sr_pool [m->expr [0]]->type = -1;
1105 for (i = 0; i < m->n; i++) {
1106 m->expr [i] = m->expr [i + 1];
1108 --m->n;
1109 } else rmv_first_class (m->expr [0]);
1110 --m->from; --m->to;
1111 ncase SB_NIL:
1112 ncase SB_TERM: m->type = SB_NIL; m->from = m->to = 0;
1113 ncase SB_OR:
1114 for (i = 0; i < m->n; i++)
1115 rmv_first_class (m->expr [i]);
1116 --m->from; --m->to;
1120 static void optimize_switch (subregular *m)
1122 int i, j;
1123 char C [259];
1124 long long SW [258], v;
1126 if (m->n > 64) return; // not impl
1128 memset (SW, 0, sizeof SW);
1129 for (i = 0; i < m->n; i++) {
1130 memset (C, '0', 258); C [258] = 0;
1131 get_first_class (m->expr [i], C);
1132 rmv_first_class (m->expr [i]);
1133 for (v = 1 << i, j = 0; j < 258; j++)
1134 if (C [j] == '1')
1135 SW [j] |= v;
1137 memcpy (m->swtbl = (long long*) malloc (sizeof SW), SW, sizeof SW);
1140 static void optimize_switches ()
1142 int i, j;
1143 subregular *m;
1145 for (i = 2; i < nsr_pool; i++) {
1146 m = sr_pool [i];
1147 if (m->type != SB_OR || m->n <= SWITCH_THRESHOLD || m->from != m->to)
1148 continue;
1149 for (j = 0; j < m->n; j++)
1150 if (!sr_pool [m->expr [j]]->to)
1151 goto nope;
1152 optimize_switch (m);
1153 nope:;
1156 /*********************************************************
1157 [6] minimum length
1159 For the case our regular expression starts with the ^.*?
1160 and the minimum length of the string in order to have
1161 a positive match *is* accountable (non neligible), we
1162 set a threshold to avoid studying strings which are
1163 too small to produce a match anyway.
1164 *********************************************************/
1165 static int minlen;
1167 static Token R_minlen;
1168 static void optimize_minlen ()
1170 int i, j;
1171 subregular *m = sr_pool [root], *n;
1172 minlen = 0;
1173 if (regexp_packed) return;
1174 if (m->type != SB_AND) return;
1175 n = sr_pool [m->expr [0]];
1176 if (n->type != SB_TERM || n->expr1 != RE_SPECIAL_SSTART) return;
1177 n = sr_pool [m->expr [1]];
1178 if (n->type != SB_REPN) return;
1179 n = sr_pool [n->expr1];
1180 if (n->type != SB_TERM) return;
1181 for (j = 0, i = 2; i < m->n; i++)
1182 j += sr_pool [m->expr [i]]->from;
1183 if (j < 5) return;
1184 minlen = j;
1185 sr_pool [m->expr [1]]->usemin = 1;
1188 /*********************************************************
1189 [7] use strncmp/strncasecmp
1191 For long fixed strings, the code gets smaller and
1192 this is good for cache.
1193 *********************************************************/
1194 static int isplain (subReg s, int i)
1196 subregular *m = sr_pool [sr_pool [s]->expr [i]];
1197 return m->type == SB_TERM && m->expr1 < 255 && !m->granted;
1200 static void dostrfunc (subReg s, int f, int t)
1202 int i;
1203 subregular *m = sr_pool [s], *m2;
1204 char tmp [255];
1206 for (i = 0; f + i <= t; i++)
1207 tmp [i] = sr_pool [m->expr [f + i]]->expr1;
1208 tmp [i] = 0;
1209 m2 = sr_pool [m->expr [f]];
1210 m2->strn = strdup (tmp);
1211 m2->from = m2->to = strlen (tmp);
1212 for (i = 1; t + i <= m->n; i++)
1213 m->expr [f + i] = m->expr [t + i];
1214 m->n -= t - f;
1217 #define DOSTR_ABOVE 2
1219 static void optimize_strfuncs ()
1221 int i, j, k;
1222 subregular *m;
1224 for (i = 2; i < nsr_pool; i++)
1225 if ((m = sr_pool [i])->type == SB_AND)
1226 for (j = 0; j < m->n;) {
1227 while (j < m->n && !isplain (i, j)) j++;
1228 if (j == m->n)
1229 break;
1230 for (k = j; k < m->n && isplain (i, k); k++);
1231 if (k - j > DOSTR_ABOVE)
1232 dostrfunc (i, j, k - 1);
1233 ++j;
1236 /*++++++++++++++++++++++++++++++++++++++++++++++++++
1237 ++++++++++++++++++++++++++++++++++++++++++++++++++*/
1238 /*********************************************************
1239 Do all optimizations in the correct order
1240 *********************************************************/
1241 void print_subregularz ();
1242 static void optimize_regexp ()
1244 #ifdef DEBUG
1245 #define STATUS(x) \
1246 if (debugflag.REGEXP_DEBUG) {\
1247 print_subregularz ();\
1248 PRINTF (x);\
1250 #else
1251 #define STATUS(x)
1252 #endif
1254 // must be first. we don't do the others on fsckd
1256 STATUS ("\nOptimize: set fsckd repetitioners:\n")
1257 optimize_fsckd_repetitioners ();
1260 // most important optimization
1262 STATUS ("\nOptimize. Backtracks:\n")
1263 optimize_backtracking_repetitioners ();
1266 // make it fixed ^ if possible. root changes
1268 STATUS ("\nOptimize. Non-greedy jumps:\n")
1269 optimize_nongreedy_jump ();
1270 optimize_minlen ();
1273 // must be after above. others are not prepared
1274 // for an SB_OR subregular with a swtbl
1276 STATUS ("\nOptimize. Switch:\n")
1277 optimize_switches ();
1280 // put it here to avoid zeromin on
1281 // subexpressions implemented with switch ()
1283 STATUS ("\nOptimize. zeromin:\n")
1284 optimize_zeromin ();
1287 // use strncmp() and strncasecmp() for fixed sequences
1288 if (regexp_strfuncs) {
1289 STATUS ("\nOptimize strfuncs():\n");
1290 optimize_strfuncs ();
1292 STATUS (" ");
1298 /*********************************************************
1299 finalize all the info in the sr_pool[]
1300 *********************************************************/
1301 #define OBV 10000
1305 // disable extractors inside repetitioners
1307 static subReg *e_off;
1308 static int n_e_off;
1310 static void disable_extractors (subregular *m)
1312 int i;
1313 if (m->store_at >= 0) {
1314 e_off [n_e_off++] = m->store_at;
1315 m->store_at = -1;
1317 switch (m->type) {
1318 case SB_REPG:
1319 case SB_REPN: disable_extractors (sr_pool [m->expr1]);
1320 ncase SB_OR:
1321 case SB_AND:
1322 for (i = 0; i < m->n; i++)
1323 disable_extractors (sr_pool [m->expr [i]]);
1327 static void disable_repet_extracts ()
1329 int i, j, k, l;
1331 e_off = (subReg*) alloca (nsr_pool * sizeof *e_off);
1332 n_e_off = 0;
1333 for (i = 0; i < nsr_pool; i++)
1334 if (in2 (sr_pool [i]->type, SB_REPG, SB_REPN))
1335 disable_extractors (sr_pool [sr_pool [i]->expr1]);
1336 if (!n_e_off) return;
1337 for (i = 0; i < nsr_pool; i++)
1338 if ((l = sr_pool [i]->store_at) >= 0) {
1339 for (j = k = 0; j < n_e_off; j++)
1340 if (l > e_off [j]) ++k;
1341 sr_pool [i]->store_at -= k;
1343 nextr -= n_e_off;
1347 // possible badly set matchers: (cat)|(dog)|(baboon)
1350 static struct {
1351 subReg ss, se;
1352 int fixedlen;
1353 bool ambig;
1354 } extract_point [64];
1356 static bool ambiguous_extracts;
1358 static void ambig_store (subregular *m)
1360 if (m->store_at >= 0)
1361 ambiguous_extracts = extract_point [m->store_at].ambig = 1;
1362 if (in2 (m->type, SB_OR, SB_AND)) {
1363 int i;
1364 for (i = 0; i < m->n; i++)
1365 ambig_store (sr_pool [m->expr [i]]);
1369 static void check_match_in_or ()
1371 int i, j;
1372 subregular *m;
1373 ambiguous_extracts = 0;
1374 for (i = 0; i < nsr_pool; i++)
1375 if ((m = sr_pool [i])->type == SB_OR)
1376 for (j = 0; j < m->n; j++)
1377 ambig_store (sr_pool [m->expr [j]]);
1380 // connect return states and mark extractor points
1383 static void rstate_connect (subReg s, subReg rstate)
1385 int i;
1386 subregular *m = sr_pool [s], *m1;
1387 if (m->store_at >= 0) {
1388 int fixedlen = m->from == m->to ? m->to : -1;
1389 extract_point [m->store_at].ss = m->this;
1390 extract_point [m->store_at].ambig = 0;
1391 m->save_success = 1;
1392 if ((extract_point [m->store_at].fixedlen = fixedlen) == -1) {
1393 if (!rstate) rstate = 1;
1394 extract_point [m->store_at].se = rstate;
1395 sr_pool [rstate]->save_success = 1;
1398 m->rstate = rstate;
1399 switch (m->type) {
1400 case SB_TERM:
1401 case SB_NIL:
1402 ncase SB_AND:
1403 for (i = 0; i < m->n - 1; i++)
1404 rstate_connect (m->expr [i], m->expr [i + 1]);
1405 rstate_connect (m->expr [i], rstate);
1406 m->rstate = m->expr [0];
1407 ncase SB_OR:
1408 for (i = 0; i < m->n; i++)
1409 rstate_connect (m->expr [i], rstate);
1410 ncase SB_REPN:
1411 case SB_REPG:
1412 m1 = sr_pool [m->expr1];
1413 if (m->fsckd) {
1414 rstate_connect (m->expr1, m->store_at >= 0 ? s + OBV : s);
1415 if (!m1->from)
1416 rerror ("can match infinite times the empty string anywhere!");
1417 } else rstate_connect (m->expr1, m->store_at >= 0 || m1->from != m1->to);
1421 static bool fixed_start;
1422 static void do_fixed_start ()
1424 subregular *m1 = sr_pool [root], *m2;
1425 fixed_start = 0;
1426 if (m1->type == SB_AND) {
1427 m2 = sr_pool [m1->expr [0]];
1428 if (m2->type == SB_TERM && m2->expr1 == RE_SPECIAL_SSTART) {
1429 m2->type = -1;
1430 intcpy (m1->expr, m1->expr + 1);
1431 --m1->n;
1432 fixed_start = 1;
1438 // do everything we need for the code generator
1440 static void prepare_regexp (char *re)
1442 int r [256];
1444 nextr = 0;
1445 nclasses = 0;
1446 nsr_pool = 0;
1448 // escape and set RE_SPECIAL values
1449 regexp_descape (re, r);
1451 // compile classes and install class numbers
1452 compile_classes (r);
1454 // make sr_pool []
1455 mallocsubr (SB_NIL, -1, 0, 0, -1);
1456 mallocsubr (SB_NIL, -1, 0, 0, -1);
1457 root = make_subregulars (r, -1);
1458 set_granted ();
1460 // optimize sr_pool []
1461 optimize_regexp ();
1463 // connect rstates for the code generator
1464 do_fixed_start ();
1465 disable_repet_extracts ();
1466 rstate_connect (root, 0);
1467 check_match_in_or ();
1468 #ifdef DEBUG
1469 if (debugflag.REGEXP_DEBUG) {
1470 PRINTF ("\nrstates:\n");
1471 print_subregularz ();
1473 #endif
1475 //######################################################################
1477 //######################################################################
1478 // Code Generation
1480 // everything we need is in the subregulars sr_pool[]
1481 //######################################################################
1482 static Token catRE (char*);
1483 static Token state_var(char*,int),state_func(int),new_value_char(int),new_value_hex(int), new_value_eint(int);
1484 static Token R_ctbl, R_name;
1485 static Token RGType = RESERVED_char;
1486 #define BRING(x) Token x = RESERVED_ ## x;
1487 #define XCODE(...) { Token xcd [] = { __VA_ARGS__, -1 }; outprintf (REGEXP, ISTR (xcd), -1); }
1488 #define IF(...) if (__VA_ARGS__) {
1489 #define ELIF(...) } else if (__VA_ARGS__) {
1490 #define ELSE } else {
1491 #define ENDIF }
1493 #define PROTO(x) R (static), R (inline), R (int), state_func (x), '(', R (const), \
1494 R (unsigned), RGType, '*', a, ')'
1495 #define RETURN0 R (return), R (0)
1496 #define RETURN1 R (return), R (1)
1497 #define JMPVAR(n) state_var ("j", n)
1498 #define STORVAR(n) state_var ("s", n)
1499 #define DECLAREJMP(x,n) outprintf (REGEXPD, R (static), R (const), R (unsigned), RGType, '*', x = JMPVAR (n), ';', -1);
1500 #define DECLARESTOR(x,n) outprintf (REGEXPD, R (static), R (const), R (unsigned), RGType, '*', x = STORVAR(n), ';', -1);
1501 #define STATEF(x) state_func (x), '(', a, ')'
1502 #define STATEFA1(x) state_func (x), '(', a, '+', RESERVED_1, ')'
1503 #define PA '*', a
1504 #define COMPOUND(...) '{', __VA_ARGS__, '}'
1506 static OUTSTREAM REGEXP, REGEXPD;
1507 static bool ctbl_used;
1509 static void gen_end_state (subregular*);
1510 static void gen_action_state (subregular*);
1511 static void gen_or_state (subregular*);
1512 static void gen_switch_state (subregular*);
1513 static void gen_greedy_state (subregular*);
1514 static void gen_nongreedy_state (subregular*);
1515 static void gen_nongreedy_loop_state (subregular*);
1516 static void gen_greedy_loop_state (subregular*);
1517 static void gen_save_interstate (subregular*);
1518 static void proto (subReg);
1520 static void gen_state (subReg s)
1522 if (s <= 1 || s >= OBV) return;
1523 subregular *m = sr_pool [s];
1525 if (m->gst == 2) return;
1526 if (m->gst == 1) { proto (m->this); return; }
1527 m->gst = 1;
1528 if (m->save_success)
1529 gen_save_interstate (m);
1530 switch (m->type) {
1531 case SB_AND:
1532 case SB_NIL: gen_end_state (m);
1533 ncase SB_TERM: gen_action_state (m);
1534 ncase SB_OR: m->swtbl ? gen_switch_state (m) : gen_or_state (m);
1535 ncase SB_REPG: m->fsckd ? gen_greedy_state (m) : gen_greedy_loop_state (m);
1536 ncase SB_REPN: m->fsckd ? gen_nongreedy_state (m) : gen_nongreedy_loop_state (m);
1538 m->proton = 1;
1539 m->gst = 2;
1542 static void gen_regexp_code ()
1544 int i;
1546 if (minlen) {
1547 R_minlen = catRE ("mb");
1548 outprintf (REGEXPD, R (static), R (const), R (unsigned), RGType, '*',
1549 R_minlen, ';', -1);
1551 BRING (a);
1552 XCODE (PROTO(0), '{', RETURN1, ';', '}')
1553 if (nextr) {
1554 XCODE (PROTO(1), '{', STORVAR (1), '=', a, ';', RETURN1, ';', '}')
1555 DECLARESTOR (i, 1);
1557 ctbl_used = false;
1558 gen_state (root);
1561 static void gen_save_interstate (subregular *m)
1563 BRING (a)
1564 int ist = m->this + OBV;
1565 int s;
1567 XCODE (PROTO (ist), ';')
1568 DECLARESTOR(s, m->this);
1569 XCODE (PROTO (m->this), '{', R (if), '(', STATEF (ist), ')', '{', s, '=', a, ';')
1570 XCODE (RETURN1, ';', '}', RETURN0, ';', '}')
1571 m->this = ist;
1574 static void proto (subReg s)
1576 if (s > 0 && (s >= OBV || !sr_pool [s]->proton)) {
1577 BRING (a)
1578 outprintf (REGEXPD, PROTO (s), ';', -1);
1579 sr_pool [s]->proton = 1;
1583 //////////////////////////////////////////////////////////
1584 // each of these 7 functions generates a state function //
1585 //////////////////////////////////////////////////////////
1587 //--------------------------------------------
1588 // greedy repetitioner -- unroll
1589 //--------------------------------------------
1590 static void gen_greedy_loop_state (subregular *m)
1592 BRING (a) BRING (x) BRING (i) BRING (X)
1593 int min = m->min, max = m->max;
1594 bool havemin = min != 0, havemax = max != regexp_INFTY, havetimes = havemin || havemax;
1595 bool fixed = sr_pool [m->expr1]->from == sr_pool [m->expr1]->to;
1596 Token Nmin = new_value_int (min), Nmax = new_value_int (max);
1597 Token Nfix = new_value_int (sr_pool [m->expr1]->from);
1598 Token j=j;
1599 bool save_displ = m->backtrack != 0 && !fixed;
1600 bool unlimited = regexp_INFTY > regexp_partition && max == regexp_INFTY && save_displ;
1601 bool xused = havetimes || unlimited || m->backtrack != 0;
1603 havetimes |= unlimited;
1605 gen_state (m->expr1); gen_state (m->rstate);
1607 IF (m->save_fail)
1608 DECLAREJMP (j, m->save_fail)
1609 ENDIF
1610 XCODE (PROTO (m->this), '{')
1611 IF (xused) // if count times
1612 XCODE (R (int), x, '=', R (0), ';')
1613 ENDIF
1614 IF (save_displ) // if saving displacement for non-fixed with backtracking
1615 int loopmax = havemax ? max : regexp_partition;
1616 XCODE (RGType, X, '[', new_value_int (loopmax), ']', ';', X, '[', R (0), ']', '=', a, ';')
1617 ENDIF
1618 XCODE (R (for), '(', ';', ';', ')', '{', R (if), '(', STATEF (m->expr1), ')', '{')
1620 /////// if macth subexpr
1621 IF (fixed) // increase a by fixed displacement
1622 XCODE (a, ASSIGNA, Nfix, ';')
1623 ELSE // or by state1 success
1624 XCODE (a, '=', STORVAR (1), ';')
1625 ENDIF
1626 IF (xused) // if counting times, times++
1627 XCODE (PLUSPLUS, x, ';')
1628 ENDIF
1629 IF (save_displ)
1630 XCODE (X, '[', x, ']', '=', a, ';')
1631 ENDIF
1633 IF (max > 1) // if the repetitioner is '?' don't go for more
1634 IF (havemax) // no more than maximum
1635 XCODE (R (if), '(', x, '<', Nmax, ')')
1636 ELIF (unlimited) // and no more than the sizeof the displacement tbl
1637 XCODE (R (if), '(', x, '<', new_value_int (regexp_partition), ')')
1638 ENDIF
1639 XCODE (R (continue), ';')
1640 ENDIF
1642 IF (unlimited) // recursion because sizeof displacement tbl exceeded
1643 XCODE (R (if), '(', STATEF (m->this), ')', RETURN1, ';')
1644 ENDIF
1645 XCODE ('}', R (break), ';', '}')
1647 IF (m->save_fail) // save maximum match if have to
1648 XCODE (j, '=', a, ';')
1649 ENDIF
1651 IF (m->backtrack < 0) // if not backtracking loop down to minimum
1652 XCODE (R (while), '(', x, MINUSMINUS, GEQCMP, Nmin, ')')
1653 ELIF (m->backtrack > 0) // or down to minimum but no more than 'backtrack' value (>0)
1654 XCODE (R (int), i, '=', new_value_int (m->backtrack + 1), ';')
1655 XCODE (R (while), '(', x, MINUSMINUS, GEQCMP, Nmin, ANDAND, i, MINUSMINUS, ')')
1656 ENDIF
1657 IF (!m->backtrack && havemin) // if just once (no loop) ensure >= min
1658 XCODE (R (if), '(', x, GEQCMP, Nmin, ANDAND, state_func (m->rstate), '(')
1659 ELSE // ... otherwise, it is definitelly >= min
1660 XCODE (R (if), '(', state_func (m->rstate), '(')
1661 ENDIF
1662 IF (fixed) // if fixed pass 'a' to next-state
1663 XCODE (a, ')', ')', RETURN1, ';')
1664 IF (m->backtrack) // if looping, decrease 'a' by fixed displacement
1665 XCODE (R (else), a, ASSIGNS, Nfix, ';')
1666 ENDIF
1667 ELSE // otherwise, if non fixed use the values from the displacement tbl
1668 XCODE (X, '[', x, '+', R (1), ']', ')', ')', RETURN1, ';')
1669 ENDIF
1670 XCODE (RETURN0, ';', '}')
1673 //--------------------------------------------
1674 // non greedy repetitioner -- unroll
1675 //--------------------------------------------
1676 static void gen_nongreedy_loop_state (subregular *m)
1678 BRING (a) BRING (x)
1679 int min = m->min, max = m->max;
1680 bool havemin = min != 0, havemax = max != regexp_INFTY, havetimes = havemin || havemax;
1681 bool fixed = sr_pool [m->expr1]->from == sr_pool [m->expr1]->to;
1682 Token Nmin = new_value_int (min), Nmax = new_value_int (max);
1683 Token Nfix = new_value_int (sr_pool [m->expr1]->to);
1684 Token j;
1686 gen_state (m->expr1); gen_state (m->rstate);
1688 XCODE (PROTO (m->this), '{')
1689 IF (havetimes) // we are counting times
1690 XCODE (R (int), x, '=', R (0), ';')
1691 ENDIF
1692 XCODE (R (for), '(', ';', ';', ')', '{')
1693 IF (minlen)
1694 XCODE (R (if), '(', a, '>', R_minlen, ')', RETURN0, ';')
1695 ENDIF
1696 ////////// early match
1697 IF (havemin) // ok if more than minimum
1698 XCODE (R (if), '(', x, GEQCMP, Nmin, ')', '{')
1699 ENDIF
1700 XCODE (R (if), '(', STATEF (m->rstate), ')', RETURN1, ';')
1701 IF (m->use_fail) // jump after failuer of next state
1702 IF (havetimes)
1703 XCODE (x, ASSIGNA, JMPVAR (m->use_fail), '-', a, ';')
1704 ENDIF
1705 XCODE (a, '=', JMPVAR (m->use_fail), ';')
1706 ENDIF
1707 IF (havemin)
1708 XCODE ('}')
1709 ENDIF
1710 ////////// repeat
1711 XCODE (R (if), '(', '!', '(')
1712 IF (havemax) // if beyond maximum, fail
1713 XCODE (x, '<', Nmax, ANDAND)
1714 ENDIF
1715 XCODE (STATEF (m->expr1), ')', ')')
1716 IF (m->save_fail) // in failure case save it as maximum walk
1717 DECLAREJMP (j, m->save_fail)
1718 XCODE (COMPOUND (j, '=', a, ';', RETURN0, ';'))
1719 ELSE // ... or just return 0
1720 XCODE (RETURN0, ';')
1721 ENDIF
1722 IF (fixed) // increase 'a' by fixed displacement
1723 XCODE (a, ASSIGNA, Nfix, ';')
1724 ELSE
1725 /* XXXXXX non fixed */
1726 /* not implemented, so we mark non-greedy on non-fixed fsckd. rare anyway */
1727 ENDIF
1728 IF (havetimes) // if counting times at all, increment
1729 XCODE (PLUSPLUS, x, ';')
1730 ENDIF
1731 XCODE ('}', '}')
1733 //--------------------------------------------
1734 // non greedy repetitioner -- recursive
1735 //--------------------------------------------
1736 static void gen_nongreedy_state (subregular *m)
1738 BRING (a) BRING (x)
1739 int min = m->min, max = m->max;
1740 bool havemin = min != 0, havemax = max != regexp_INFTY, havetimes = havemin || havemax;
1741 Token Nmin = new_value_int (min), Nmax = new_value_int (max);
1743 gen_state (m->expr1); gen_state (m->rstate);
1745 XCODE (PROTO (m->this), '{')
1746 IF (havetimes)
1747 XCODE (R (static), R (int), x, ';')
1748 ENDIF
1749 IF (havemin)
1750 XCODE (R (if), '(', x, GEQCMP, Nmin, ')')
1751 ENDIF
1752 XCODE (R (if), '(', STATEF (m->rstate), ')', '{')
1753 IF (havetimes)
1754 XCODE (x, '=', R (0), ';')
1755 ENDIF
1756 XCODE (RETURN1, ';', '}')
1757 IF (havemax)
1758 XCODE (R (if), '(', x, '>', Nmax, ')', COMPOUND (x, '=', R (0), ';', RETURN0, ';'))
1759 ENDIF
1760 IF (havetimes)
1761 XCODE (PLUSPLUS, x, ';')
1762 ENDIF
1763 XCODE (R (return), STATEF (m->expr1), ';', '}')
1766 //--------------------------------------------
1767 // greedy repetitioner -- recursive
1768 //--------------------------------------------
1769 static void gen_greedy_state (subregular *m)
1771 BRING (a) BRING (x)
1772 int min = m->min, max = m->max;
1773 bool havemin = min != 0, havemax = max != regexp_INFTY, havetimes = havemin || havemax;
1774 Token Nmin = new_value_int (min), Nmax = new_value_int (max);
1776 gen_state (m->expr1); gen_state (m->rstate);
1778 XCODE (PROTO (m->this), '{')
1779 IF (havetimes)
1780 XCODE (R (static), R (int), x, ';')
1781 ENDIF
1782 IF (havemax)
1783 XCODE (R (if), '(', x, '<', Nmax, ')', '{')
1784 ENDIF
1785 IF (havetimes)
1786 XCODE (PLUSPLUS, x, ';')
1787 ENDIF
1788 XCODE (R (if), '(', STATEF (m->expr1), ')', '{')
1789 IF (havetimes)
1790 XCODE (MINUSMINUS, x, ';')
1791 ENDIF
1792 XCODE (RETURN1, ';', '}')
1793 IF (havetimes)
1794 XCODE (R (else), MINUSMINUS, x, ';')
1795 ENDIF
1796 IF (havemax)
1797 XCODE ('}')
1798 ENDIF
1799 IF (havemin)
1800 XCODE (R (if), '(', x, '<', Nmin, ')', RETURN0, ';')
1801 ENDIF
1802 XCODE (R (return), STATEF (m->rstate), ';', '}')
1805 //--------------------------------------------
1806 // "this or that" with a switch()
1807 //--------------------------------------------
1808 static void gen_switch_state (subregular *m)
1810 int i, j, h;
1811 long long utbl [258], v;
1812 BRING (a)
1814 memcpy (utbl, m->swtbl, sizeof utbl);
1815 for (i = 0; i < 258; i++) {
1816 if (!(v = utbl [i])) continue;
1817 for (j = i; j < 258; j++)
1818 if (utbl [j] == v) utbl [j] &= ~v;
1819 for (h = j = 0; j < 64; j++)
1820 if (v & (1LL << j))
1821 gen_state (m->expr [j]);
1824 memcpy (utbl, m->swtbl, sizeof utbl);
1825 XCODE (PROTO (m->this), '{', R (switch), '(', '*', a, ')', '{')
1826 for (i = 0; i < 258; i++) {
1827 if (!(v = utbl [i])) continue;
1828 for (j = i; j < 258; j++)
1829 if (utbl [j] == v) {
1830 XCODE (R (case), new_value_int (j), ':')
1831 utbl [j] = 0;
1833 XCODE (R (return))
1834 for (h = j = 0; j < 64; j++)
1835 if (v & (1LL << j)) {
1836 if (h) XCODE (OROR)
1837 h = 1;
1838 XCODE (STATEFA1 (m->expr [j]))
1840 XCODE (';')
1842 XCODE (R (default), ':', RETURN0, ';', '}', '}')
1845 //--------------------------------------------
1846 // return "this or that"
1847 //--------------------------------------------
1848 static void gen_or_state (subregular *m)
1850 int i;
1851 BRING (a)
1853 for (i = 0; i < m->n; i++)
1854 gen_state (m->expr [i]);
1856 XCODE (PROTO (m->this), '{', R(return))
1857 for (i = 0; i < m->n; i++)
1858 XCODE (STATEF (m->expr [i]), i == m->n - 1 ? ';' : OROR)
1859 XCODE ('}')
1862 //--------------------------------------------
1863 // do actual check for a character match
1864 //--------------------------------------------
1865 static void gen_action_state (subregular *m)
1867 int atom = m->expr1, b, s, i, adv = RESERVED_1;
1868 BRING (a) BRING (x)
1870 gen_state (m->rstate);
1872 XCODE (PROTO (m->this), '{')
1874 IF (!m->granted)
1875 IF (m->strn)
1876 adv = new_value_int (strlen (m->strn));
1877 XCODE (R (if), '(', '!', regexp_case ? INTERN_strncmp : INTERN_strncasecmp)
1878 XCODE ('(', a, ',', new_value_string (m->strn), ',', adv, ')')
1879 ELIF (atom >= RE_SPECIAL_CLASS && (b = count1s (atom - RE_SPECIAL_CLASS)) > 2 && b < 253 && regexp_noctbl)
1880 bool ones = b < 127;
1881 XCODE (R (int), x, '=', ones ? R (0) : R (1), ';', R (switch), '(', PA, ')')
1882 b = 1 << (atom - RE_SPECIAL_CLASS);
1883 for (i = 1; i < 256; i++)
1884 if (nz (ctbl [i] & b) == ones) {
1885 XCODE (R (case))
1886 #ifdef CASE_RANGERS // the heroic squad who fights the nasty alien outlaws
1887 int j, c;
1888 for (j = i + 1, c = 0; j < 256; j++)
1889 if (nz (ctbl [j] & b) == ones) ++c;
1890 else break;
1891 if (c >= 2) {
1892 XCODE (new_value_eint (i), ELLIPSIS, new_value_eint (--j), ':')
1893 i = j;
1894 } else
1895 #endif
1896 XCODE (new_value_eint (i), ':')
1898 XCODE (x, '=', ones ? R (1) : R (0), ';', R (if), '(', x);
1899 ELSE
1900 XCODE (R(if), '(')
1901 IF (atom == RE_SPECIAL_DOT)
1902 XCODE (PA)
1903 ELIF (atom == RE_SPECIAL_SEND)
1904 XCODE ('!', PA)
1905 ELIF (atom >= RE_SPECIAL_CLASS)
1906 s = count1s (b = atom - RE_SPECIAL_CLASS);
1907 IF (s <= 2)
1908 for (i = 1; !(ctbl [i] & (1 << b)); i++);
1909 XCODE (PA, EQCMP, new_value_int (i))
1910 IF (s == 2)
1911 for (++i; !(ctbl [i] & (1 << b)); i++);
1912 XCODE (OROR, PA, EQCMP, new_value_int (i))
1913 ENDIF
1914 ELIF (s == 255)
1915 rerror ("character class whith matches nothing!");
1916 ELIF (s >= 253)
1917 for (i = 1; ctbl [i] & ~(1 << b); i++);
1918 XCODE (PA, NEQCMP, new_value_int (i))
1919 IF (s == 253)
1920 for (++i; ctbl [i] & ~(1 << b); i++);
1921 XCODE (ANDAND, PA, NEQCMP, new_value_int (i))
1922 ENDIF
1923 ELSE
1924 Token bit = new_value_hex (1 << (atom - RE_SPECIAL_CLASS));
1925 ctbl_used = true;
1926 XCODE (R_ctbl, '[', PA, ']', '&', bit)
1927 ENDIF /* class implemented with ctbl or eq/neq */
1928 ELIF (atom < ' ' || atom > '~')
1929 XCODE (PA, EQCMP, new_value_int (atom))
1930 ELSE
1931 XCODE (PA, EQCMP, new_value_char (atom))
1932 IF (!regexp_case && isalpha (atom))
1933 XCODE (OROR, PA, EQCMP, new_value_char (atom < 'a' ? tolower (atom) : toupper (atom)))
1934 ENDIF /* do lower/upper case */
1935 ENDIF /* end if */
1936 ENDIF /* switch or if */
1937 XCODE (')')
1938 ENDIF /* !granted */
1941 IF (m->save_fail) // on the way to a jump saving repetitioner....
1942 XCODE (m->granted ? BLANKT : '{', R (if),'(', STATEFA1 (m->rstate), ')', RETURN1, ';')
1943 XCODE (MINUSMINUS, JMPVAR (m->save_fail), ';', RETURN0, ';', m->granted ? BLANKT : '}')
1944 IF (!m->granted)
1945 XCODE (JMPVAR (m->save_fail), '=', a, ';')
1946 ENDIF
1947 ELSE
1948 XCODE (R (return), state_func (m->rstate), '(', a, '+', adv, ')', ';')
1949 ENDIF
1951 IF (!m->granted)
1952 XCODE (RETURN0, ';')
1953 ENDIF
1954 XCODE ('}')
1957 //--------------------------------------------
1958 // redirect to other state
1959 //--------------------------------------------
1960 static void gen_end_state (subregular *m)
1962 BRING (a)
1963 gen_state (m->rstate);
1964 XCODE (PROTO (m->this), '{', R (return), STATEF (m->rstate), ';', '}')
1966 //######################################################################
1968 //######################################################################
1970 static Token catREsym (char *s)
1972 char tmp [200];
1973 sprintf (tmp, "%s%s", expand (R_name), s);
1974 return enter_symbol (strdup (tmp));
1976 static Token catRE (char *s)
1978 char tmp [200];
1979 sprintf (tmp, "%s%s", expand (R_name), s);
1980 return new_symbol (strdup (tmp));
1982 static Token state_var (char *s, int i)
1984 char tmp [200];
1985 sprintf (tmp, "%s%s%i", expand (R_name), s, i);
1986 return new_symbol (strdup (tmp));
1988 static Token state_func (int i)
1990 char tmp [200];
1991 sprintf (tmp, "%s_state%i", expand (R_name), i);
1992 return new_symbol (strdup (tmp));
1994 static Token new_value_char (int i)
1996 char tmp [20];
1997 sprintf (tmp, "'%c'", i);
1998 return new_symbol (strdup (tmp));
2000 static Token new_value_eint (int i)
2002 char tmp [20];
2003 if (i < ' ' || i > '~')
2004 sprintf (tmp, "'\\x%x'", i);
2005 else
2006 sprintf (tmp, "'%c'", i);
2007 return new_symbol (strdup (tmp));
2010 static Token new_value_hex (int i)
2012 char tmp [20];
2013 sprintf (tmp, "0x%x", i);
2014 return new_symbol (strdup (tmp));
2017 //######################################################################
2019 //######################################################################
2020 // Entries to this module, inits and globals
2021 //######################################################################
2022 static struct {
2023 bool istatic, iextern, iinline;
2024 } regexp_dclql;
2025 #define STATIC regexp_dclql.istatic ? R (static) : BLANKT
2026 #define EXTERN regexp_dclql.iextern ? R (extern) : BLANKT
2027 #define INLINE regexp_dclql.iinline ? R (inline) : BLANKT
2029 static void export_ctbl ()
2031 char tmp [20];
2032 int i;
2034 if (!nclasses || !ctbl_used) return;
2035 if (nclasses > 31)
2036 rerror ("Too many character classes. Can do only 32");
2038 outprintf (REGEXPD, R (const), R (static), !regexp_packed ? R (int)
2039 : nclasses < 8 ? R (char) : nclasses < 16 ? R (short)
2040 : R (int), R_ctbl, '[', ']', '=', '{', -1);
2041 for (i = 0; i < 258; i++) {
2042 sprintf (tmp, "0x%x,", ctbl [i]);
2043 outprintf (REGEXPD, new_symbol (strdup (tmp)), -1);
2045 outprintf (REGEXPD, '}', ';', -1);
2048 static typeID typeID_matchf;
2049 static void export_once ()
2051 static int have;
2052 if (have) return;
2053 have = 1;
2056 // declare and make known the charp_len structure which holds extracts
2058 BRING (p) BRING (i)
2059 outprintf (GLOBAL, R (struct), R (charp_len), '{', R (unsigned), R (char), '*',
2060 p, ';', R (int), i, ';', '}', ';', -1);
2061 recID rr = enter_struct (RESERVED_charp_len, true, 0, 0, 0, 0);
2062 add_variable_member (rr, p, typeID_charP, 0, false, false);
2063 add_variable_member (rr, i, typeID_int, 0, false, false);
2064 complete_structure (0, rr);
2065 int p0 [] = { rr, '*', -1 };
2066 int p1 [] = { B_SINT, '(', typeID_charP, enter_type (p0), INTERNAL_ARGEND, -1 };
2067 typeID_matchf = enter_type (p1);
2070 static void make_match_function (char *re)
2072 BRING (a) BRING (i) BRING (X) BRING (p)
2073 int j;
2074 Token f = catRE ("_match"), n;
2075 Token recipe = catRE ("_recipe");
2077 IF (!regexp_anon)
2078 XCODE (STATIC, R (const), R (char), recipe, '[', ']', '=',
2079 new_symbol (escape_c_string (re, strlen (re))), ';')
2080 ENDIF
2081 XCODE (STATIC, INLINE, R (int), f, '(', R (const), R (char), '*', a, ',', R (struct))
2082 XCODE (R (charp_len), X, '[', ']', ')', '{')
2083 IF (!fixed_start)
2084 rerror ("Not prepared to handle this kind of regexp. Please move '^' out");
2085 ENDIF
2086 IF (minlen)
2087 XCODE (R_minlen, '=', a, '+', '(', R (strlen), '(', a, ')', '-')
2088 XCODE (new_value_int (minlen), ')', ';')
2089 ENDIF
2090 IF (nextr)
2091 IF (ambiguous_extracts)
2092 for (j = 0; j < nextr; j++)
2093 if (extract_point [j].ambig)
2094 XCODE (STORVAR (extract_point [j].ss), '=', R (0), ';')
2095 ENDIF
2096 XCODE (R (if), '(', STATEF (root), ')', '{', R (if), '(', X, ')', '{')
2097 for (j = 0; j < nextr; j++) {
2098 #define UNCONST '(', R (unsigned), RGType, '*', ')'
2099 n = new_value_int (j);
2100 IF (extract_point [j].fixedlen == -1)
2101 XCODE (X, '[', n, ']', '.', i, '=', STORVAR (extract_point [j].se), '-', '(')
2102 XCODE (X, '[', n, ']', '.', p, '=', UNCONST, STORVAR (extract_point [j].ss),')',';')
2103 ELSE
2104 XCODE (X, '[', n, ']', '.', p, '=', UNCONST, STORVAR (extract_point [j].ss), ';')
2105 XCODE (X, '[', n, ']', '.', i, '=', new_value_int(extract_point [j].fixedlen),';')
2106 ENDIF
2108 XCODE ('}', RETURN1, ';', '}', RETURN0, ';', '}')
2109 ELSE
2110 XCODE (R (return), STATEF (root), ';', '}')
2111 ENDIF
2114 static Token regexp_declarations (bool def)
2116 export_once ();
2117 BRING (X) BRING (a);
2118 Token f = catREsym ("_match");
2119 Token fproto [] = {
2120 STATIC, EXTERN, INLINE,
2121 R (int), f, '(', R (const), R (char), '*', a, ',', R (struct),
2122 R (charp_len), X, '[', ']', ')', -1
2124 Token recipe = catREsym ("_recipe");
2125 Token xargs [] = { a, X, -1 };
2126 Token *dflt1 = mallocint (2);
2127 dflt1 [0] = R (0);
2128 dflt1 [1] = -1;
2129 Token **dflt = (Token**) malloc (2 * sizeof *dflt);
2130 dflt [0] = dflt1;
2131 dflt [1] = 0;
2132 xdeclare_function (&Global, f, f, typeID_matchf, fproto, xargs, 0, dflt, 0);
2133 enter_global_object (recipe, typeID_charP);
2134 if (!def) {
2135 regexp_dclql.iextern = !regexp_dclql.istatic;
2136 outprintf (GVARS, EXTERN, STATIC, R (const), R (char), recipe, '[', ']', ';', -1);
2138 return f;
2141 static Token regexp_compiler (char *re)
2143 unsigned int sctbl [258];
2144 ctbl = sctbl;
2145 subregular *ssr_pool [512];
2146 sr_pool = ssr_pool;
2148 export_once ();
2149 R_ctbl = catRE ("_ctbl");
2150 char tmp [200];
2151 sprintf (tmp, "/* REGEXP: %s */\n", re);
2152 outprintf (REGEXPD, new_symbol (strdup (tmp)), -1);
2153 regexp_recipe = re;
2154 #ifdef DEBUG
2155 if (debugflag.REGEXP_DEBUG)
2156 PRINTF ("Regexp: %s\n", re);
2157 #endif
2158 prepare_regexp (re);
2159 gen_regexp_code ();
2160 free_subregs ();
2161 export_ctbl ();
2162 make_match_function (re);
2163 return regexp_declarations (1);
2166 static NormPtr parse_regexp_dcl (NormPtr p)
2168 if (!issymbol (R_name = CODE [p++]))
2169 parse_error (p, "No name for RegExp");
2170 if (CODE [p] != '(') {
2171 regexp_declarations (0);
2172 return p;
2175 if (CODE [p++] != '(')
2176 parse_error (p, "Syntax error in RegExp");
2177 if (!ISVALUE (CODE [p]) || type_of_const (CODE [p]) != typeID_charP)
2178 parse_error (p, "No regular expression string!");
2179 char *re = expand (CODE [p++]);
2180 char *req = (char*) alloca (strlen (re));
2181 strcpy (req, re + 1);
2182 req [strlen (req) - 1] = 0;
2184 regexp_strfuncs = regexp_case = 1;
2185 regexp_anon = regexp_packed = regexp_noextract = regexp_noctbl = 0;
2186 if (CODE [p] == ',')
2187 for (++p; ISSYMBOL (CODE [p]); p++)
2188 if (!tokcmp (CODE [p], "NOCASE")) regexp_case = 0;
2189 else if (!tokcmp (CODE [p], "PACKED")) regexp_packed = 1;
2190 else if (!tokcmp (CODE [p], "NOEX")) regexp_noextract = 1;
2191 else if (!tokcmp (CODE [p], "NOCTBL")) regexp_noctbl = 1;
2192 else if (!tokcmp (CODE [p], "NOSTRFUNC")) regexp_strfuncs = 0;
2193 else parse_error (p, "Unrecognized regexp option");
2194 if (CODE [p++] != ')')
2195 parse_error (p, "RegExp definition");
2197 REGEXP = new_stream ();
2198 REGEXPD = new_stream ();
2199 regexp_compiler (req);
2200 concate_streams (concate_streams (REGEXP_CODE, REGEXPD), REGEXP);
2202 return p;
2205 static NormPtr parse_regexp_uclass (NormPtr p)
2207 if (CODE [++p] != '(') goto aerr;
2208 if (!isvalue (CODE [++p])) goto aerr;
2209 char *cc = expand (CODE [p++]);
2210 if (CODE [p++] != ',') goto aerr;
2211 if (!ISVALUE (CODE [p])) goto aerr;
2212 char *cd = expand (CODE [p++]), ce [64];
2213 if (CODE [p++] != ')') goto aerr;
2214 if (cc [0] != '\'' || !islower (cc [1])
2215 || cd [0] != '"' || cd [1] != '[') goto aerr;
2216 strcpy (ce, cd + 1);
2217 ce [strlen (ce) - 1] = 0;
2218 /* where are regexps when you need them? */
2219 if (cc [1] == 'r' || cc [1] == 'a' || cc [1] == 't'
2220 || cc [1] == 'n' || cc [1] == 'f' || cc [1] == 'x')
2221 parse_error (p, "Abbreviation of reserved character");
2222 uclass [OFFS (cc [1])].def = strdup (ce);
2223 return p;
2224 aerr:
2225 parse_error (p, "abbrev syntax error: abbrev ('lowercase char', \"[string]\") ");
2226 return 0;
2229 /******************************
2230 Perl style =~ in expressions
2231 ******************************/
2232 Token perlop_regexp (Token recipe)
2234 regexp_dclql.istatic = 1;
2235 regexp_dclql.iextern = 0;
2236 regexp_dclql.iinline = 1;
2237 regexp_noctbl = 0;
2238 regexp_strfuncs = regexp_anon = regexp_case = regexp_packed = regexp_noextract = 1;
2239 char *re = expand (recipe);
2240 char *req = (char*) alloca (strlen (re));
2241 strcpy (req, re + 1);
2242 req [strlen (req) - 1] = 0;
2243 REGEXP = new_stream ();
2244 REGEXPD = new_stream ();
2245 R_name = name_anon_regexp ();
2246 recipe = regexp_compiler (req);
2247 concate_streams (concate_streams (REGEXP_CODE, REGEXPD), REGEXP);
2248 return recipe;
2251 /******************************
2252 Parse a RegExp, declaration
2253 ******************************/
2254 NormPtr parse_RegExp (NormPtr p, int ql)
2256 regexp_dclql.istatic = ql & 1;
2257 regexp_dclql.iextern = ql & 2;
2258 regexp_dclql.iinline = ql & 4;
2259 do {
2260 p = CODE [p] == RESERVED_abbrev ? parse_regexp_uclass (p) : parse_regexp_dcl (p);
2261 if (CODE [p] != ',' && CODE [p] != ';')
2262 parse_error (p, "RegExp declaration separator");
2263 } while (CODE [p++] != ';');
2264 return p;