regex: updates from neatvi
[neatlibc.git] / regex.c
blobe76abe469bcfad2f7df6f03e0bb8a79a5654aa1c
1 #include <ctype.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "regex.h"
7 #define NGRPS 64 /* maximum number of groups */
8 #define NREPS 128 /* maximum repetitions */
9 #define NDEPT 256 /* re_rec() recursion depth limit */
11 #define MAX(a, b) ((a) < (b) ? (b) : (a))
12 #define LEN(a) (sizeof(a) / sizeof((a)[0]))
14 /* regular expressions atoms */
15 #define RA_CHR '\0' /* character literal */
16 #define RA_BEG '^' /* string start */
17 #define RA_END '$' /* string end */
18 #define RA_ANY '.' /* any character */
19 #define RA_BRK '[' /* bracket expression */
20 #define RA_WBEG '<' /* word start */
21 #define RA_WEND '>' /* word end */
23 /* regular expression node types */
24 #define RN_ATOM '\0' /* regular expression */
25 #define RN_CAT 'c' /* concatenation */
26 #define RN_ALT '|' /* alternate expressions */
27 #define RN_GRP '(' /* pattern group */
29 /* regular expression program instructions */
30 #define RI_ATOM '\0' /* regular expression */
31 #define RI_FORK 'f' /* fork the execution */
32 #define RI_JUMP 'j' /* jump to the given instruction */
33 #define RI_MARK 'm' /* mark the current position */
34 #define RI_MATCH 'q' /* the pattern is matched */
36 /* regular expression atom */
37 struct ratom {
38 int ra; /* atom type (RA_*) */
39 char *s; /* atom argument */
42 /* regular expression instruction */
43 struct rinst {
44 struct ratom ra; /* regular expression atom (RI_ATOM) */
45 int ri; /* instruction type (RI_*) */
46 int a1, a2; /* destination of RI_FORK and RI_JUMP */
47 int mark; /* mark (RI_MARK) */
50 /* regular expression program */
51 struct regex {
52 struct rinst *p; /* the program */
53 int n; /* number of instructions */
54 int flg; /* regcomp() flags */
57 /* regular expression matching state */
58 struct rstate {
59 char *s; /* the current position in the string */
60 char *o; /* the beginning of the string */
61 int mark[NGRPS * 2]; /* marks for RI_MARK */
62 int pc; /* program counter */
63 int flg; /* flags passed to regcomp() and regexec() */
64 int dep; /* re_rec() depth */
67 /* regular expression tree; used for parsing */
68 struct rnode {
69 struct ratom ra; /* regular expression atom (RN_ATOM) */
70 struct rnode *c1, *c2; /* children */
71 int mincnt, maxcnt; /* number of repetitions */
72 int grp; /* group number */
73 int rn; /* node type (RN_*) */
76 static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
78 struct rnode *rnode = malloc(sizeof(*rnode));
79 memset(rnode, 0, sizeof(*rnode));
80 rnode->rn = rn;
81 rnode->c1 = c1;
82 rnode->c2 = c2;
83 rnode->mincnt = 1;
84 rnode->maxcnt = 1;
85 return rnode;
88 static void rnode_free(struct rnode *rnode)
90 if (rnode->c1)
91 rnode_free(rnode->c1);
92 if (rnode->c2)
93 rnode_free(rnode->c2);
94 free(rnode->ra.s);
95 free(rnode);
98 static int uc_len(char *s)
100 int c = (unsigned char) s[0];
101 if (~c & 0xc0) /* ASCII or invalid */
102 return c > 0;
103 if (~c & 0x20)
104 return 2;
105 if (~c & 0x10)
106 return 3;
107 if (~c & 0x08)
108 return 4;
109 return 1;
112 static int uc_dec(char *s)
114 int c = (unsigned char) s[0];
115 if (~c & 0xc0) /* ASCII or invalid */
116 return c;
117 if (~c & 0x20)
118 return ((c & 0x1f) << 6) | (s[1] & 0x3f);
119 if (~c & 0x10)
120 return ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
121 if (~c & 0x08)
122 return ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
123 return c;
126 static void ratom_copy(struct ratom *dst, struct ratom *src)
128 dst->ra = src->ra;
129 dst->s = NULL;
130 if (src->s) {
131 int len = strlen(src->s);
132 dst->s = malloc(len + 1);
133 memcpy(dst->s, src->s, len + 1);
137 static int brk_len(char *s)
139 int n = 1;
140 if (s[n] == '^') /* exclusion mark */
141 n++;
142 if (s[n] == ']') /* handling []a] */
143 n++;
144 while (s[n] && s[n] != ']') {
145 if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
146 while (s[n] && s[n] != ']')
147 n++;
148 if (s[n])
149 n++;
151 return s[n] == ']' ? n + 1 : n;
154 static void ratom_readbrk(struct ratom *ra, char **pat)
156 int len = brk_len(*pat);
157 ra->ra = RA_BRK;
158 ra->s = malloc(len + 1);
159 memcpy(ra->s, *pat, len);
160 ra->s[len] = '\0';
161 *pat += len;
164 static void ratom_read(struct ratom *ra, char **pat)
166 char *s;
167 int len;
168 switch ((unsigned char) **pat) {
169 case '.':
170 ra->ra = RA_ANY;
171 (*pat)++;
172 break;
173 case '^':
174 ra->ra = RA_BEG;
175 (*pat)++;
176 break;
177 case '$':
178 ra->ra = RA_END;
179 (*pat)++;
180 break;
181 case '[':
182 ratom_readbrk(ra, pat);
183 break;
184 case '\\':
185 if ((*pat)[1] == '<' || (*pat)[1] == '>') {
186 ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
187 *pat += 2;
188 break;
190 (*pat)++;
191 default:
192 ra->ra = RA_CHR;
193 s = *pat;
194 while ((s == *pat) || !strchr(".^$[(|)*?+{\\", (unsigned char) s[0])) {
195 int l = uc_len(s);
196 if (s != *pat && s[l] != '\0' && strchr("*?+{", (unsigned char) s[l]))
197 break;
198 s += uc_len(s);
200 len = s - *pat;
201 ra->s = malloc(len + 1);
202 memcpy(ra->s, *pat, len);
203 ra->s[len] = '\0';
204 *pat += len;
208 static char *uc_beg(char *beg, char *s)
210 while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
211 s--;
212 return s;
215 static int isword(char *s)
217 int c = (unsigned char) s[0];
218 return isalnum(c) || c == '_' || c > 127;
221 static char *brk_classes[][2] = {
222 {":alnum:", "a-zA-Z0-9"},
223 {":alpha:", "a-zA-Z"},
224 {":blank:", " \t"},
225 {":digit:", "0-9"},
226 {":lower:", "a-z"},
227 {":print:", "\x20-\x7e"},
228 {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
229 {":space:", " \t\r\n\v\f"},
230 {":upper:", "A-Z"},
231 {":word:", "a-zA-Z0-9_"},
232 {":xdigit:", "a-fA-F0-9"},
235 static int brk_match(char *brk, int c, int flg)
237 int beg, end;
238 int i;
239 int not = brk[0] == '^';
240 char *p = not ? brk + 1 : brk;
241 char *p0 = p;
242 if (flg & REG_ICASE && c < 128 && isupper(c))
243 c = tolower(c);
244 while (*p && (p == p0 || *p != ']')) {
245 if (p[0] == '[' && p[1] == ':') {
246 for (i = 0; i < LEN(brk_classes); i++) {
247 char *cc = brk_classes[i][0];
248 char *cp = brk_classes[i][1];
249 if (!strncmp(cc, p + 1, strlen(cc)))
250 if (!brk_match(cp, c, flg))
251 return not;
253 p += brk_len(p);
254 continue;
256 beg = uc_dec(p);
257 p += uc_len(p);
258 end = beg;
259 if (p[0] == '-' && p[1] && p[1] != ']') {
260 p++;
261 end = uc_dec(p);
262 p += uc_len(p);
264 if (flg & REG_ICASE && beg < 128 && isupper(beg))
265 beg = tolower(beg);
266 if (flg & REG_ICASE && end < 128 && isupper(end))
267 end = tolower(end);
268 if (c >= beg && c <= end)
269 return not;
271 return !not;
274 static int ratom_match(struct ratom *ra, struct rstate *rs)
276 if (ra->ra == RA_CHR && !(rs->flg & REG_ICASE)) {
277 char *s = ra->s;
278 char *r = rs->s;
279 while (*s && *s == *r)
280 s++, r++;
281 if (*s)
282 return 1;
283 rs->s = r;
284 return 0;
286 if (ra->ra == RA_CHR) {
287 int pos = 0;
288 while (ra->s[pos]) {
289 int c1 = uc_dec(ra->s + pos);
290 int c2 = uc_dec(rs->s + pos);
291 if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
292 c1 = tolower(c1);
293 if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
294 c2 = tolower(c2);
295 if (c1 != c2)
296 return 1;
297 pos += uc_len(ra->s + pos);
299 rs->s += pos;
300 return 0;
302 if (ra->ra == RA_ANY) {
303 if (!rs->s[0] || (rs->s[0] == '\n' && !!(rs->flg & REG_NEWLINE)))
304 return 1;
305 rs->s += uc_len(rs->s);
306 return 0;
308 if (ra->ra == RA_BRK) {
309 int c = uc_dec(rs->s);
310 if (!c || (c == '\n' && !!(rs->flg & REG_NEWLINE) && ra->s[1] == '^'))
311 return 1;
312 rs->s += uc_len(rs->s);
313 return brk_match(ra->s + 1, c, rs->flg);
315 if (ra->ra == RA_BEG && rs->s == rs->o)
316 return !!(rs->flg & REG_NOTBOL);
317 if (ra->ra == RA_BEG && rs->s > rs->o && rs->s[-1] == '\n')
318 return !(rs->flg & REG_NEWLINE);
319 if (ra->ra == RA_END && rs->s[0] == '\0')
320 return !!(rs->flg & REG_NOTEOL);
321 if (ra->ra == RA_END && rs->s[0] == '\n')
322 return !(rs->flg & REG_NEWLINE);
323 if (ra->ra == RA_WBEG)
324 return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
325 isword(rs->s));
326 if (ra->ra == RA_WEND)
327 return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
328 (!rs->s[0] || !isword(rs->s)));
329 return 1;
332 static struct rnode *rnode_parse(char **pat);
334 static struct rnode *rnode_grp(char **pat)
336 struct rnode *rnode = NULL;
337 if ((*pat)[0] != '(')
338 return NULL;
339 *pat += 1;
340 if ((*pat)[0] != ')') {
341 rnode = rnode_parse(pat);
342 if (!rnode)
343 return NULL;
345 if ((*pat)[0] != ')') {
346 rnode_free(rnode);
347 return NULL;
349 *pat += 1;
350 return rnode_make(RN_GRP, rnode, NULL);
353 static struct rnode *rnode_atom(char **pat)
355 struct rnode *rnode;
356 if (!**pat)
357 return NULL;
358 if ((*pat)[0] == '|' || (*pat)[0] == ')')
359 return NULL;
360 if ((*pat)[0] == '(') {
361 rnode = rnode_grp(pat);
362 } else {
363 rnode = rnode_make(RN_ATOM, NULL, NULL);
364 ratom_read(&rnode->ra, pat);
366 if (!rnode)
367 return NULL;
368 if ((*pat)[0] == '*' || (*pat)[0] == '?') {
369 rnode->mincnt = 0;
370 rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
371 *pat += 1;
373 if ((*pat)[0] == '+') {
374 rnode->mincnt = 1;
375 rnode->maxcnt = -1;
376 *pat += 1;
378 if ((*pat)[0] == '{') {
379 rnode->mincnt = 0;
380 rnode->maxcnt = 0;
381 *pat += 1;
382 while (isdigit((unsigned char) **pat))
383 rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
384 if (**pat == ',') {
385 (*pat)++;
386 if ((*pat)[0] == '}')
387 rnode->maxcnt = -1;
388 while (isdigit((unsigned char) **pat))
389 rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
390 } else {
391 rnode->maxcnt = rnode->mincnt;
393 *pat += 1;
394 if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
395 rnode_free(rnode);
396 return NULL;
399 return rnode;
402 static struct rnode *rnode_seq(char **pat)
404 struct rnode *c1 = rnode_atom(pat);
405 struct rnode *c2;
406 if (!c1)
407 return NULL;
408 c2 = rnode_seq(pat);
409 return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
412 static struct rnode *rnode_parse(char **pat)
414 struct rnode *c1 = rnode_seq(pat);
415 struct rnode *c2;
416 if ((*pat)[0] != '|')
417 return c1;
418 *pat += 1;
419 c2 = rnode_parse(pat);
420 return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
423 static int rnode_count(struct rnode *rnode)
425 int n = 1;
426 if (!rnode)
427 return 0;
428 if (rnode->rn == RN_CAT)
429 n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
430 if (rnode->rn == RN_ALT)
431 n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
432 if (rnode->rn == RN_GRP)
433 n = rnode_count(rnode->c1) + 2;
434 if (rnode->mincnt == 0 && rnode->maxcnt == 0)
435 return 0;
436 if (rnode->mincnt == 1 && rnode->maxcnt == 1)
437 return n;
438 if (rnode->maxcnt < 0) {
439 n = (rnode->mincnt + 1) * n + 1;
440 } else {
441 n = (rnode->mincnt + rnode->maxcnt) * n +
442 rnode->maxcnt - rnode->mincnt;
444 if (!rnode->mincnt)
445 n++;
446 return n;
449 static int rnode_grpnum(struct rnode *rnode, int num)
451 int cur = 0;
452 if (!rnode)
453 return 0;
454 if (rnode->rn == RN_GRP)
455 rnode->grp = num + cur++;
456 cur += rnode_grpnum(rnode->c1, num + cur);
457 cur += rnode_grpnum(rnode->c2, num + cur);
458 return cur;
461 static int re_insert(struct regex *p, int ri)
463 p->p[p->n++].ri = ri;
464 return p->n - 1;
467 static void rnode_emit(struct rnode *n, struct regex *p);
469 static void rnode_emitnorep(struct rnode *n, struct regex *p)
471 int fork, done, mark;
472 if (n->rn == RN_ALT) {
473 fork = re_insert(p, RI_FORK);
474 p->p[fork].a1 = p->n;
475 rnode_emit(n->c1, p);
476 done = re_insert(p, RI_JUMP);
477 p->p[fork].a2 = p->n;
478 rnode_emit(n->c2, p);
479 p->p[done].a1 = p->n;
481 if (n->rn == RN_CAT) {
482 rnode_emit(n->c1, p);
483 rnode_emit(n->c2, p);
485 if (n->rn == RN_GRP) {
486 mark = re_insert(p, RI_MARK);
487 p->p[mark].mark = 2 * n->grp;
488 rnode_emit(n->c1, p);
489 mark = re_insert(p, RI_MARK);
490 p->p[mark].mark = 2 * n->grp + 1;
492 if (n->rn == RN_ATOM) {
493 int atom = re_insert(p, RI_ATOM);
494 ratom_copy(&p->p[atom].ra, &n->ra);
498 static void rnode_emit(struct rnode *n, struct regex *p)
500 int last;
501 int jmpend[NREPS];
502 int jmpend_cnt = 0;
503 int i;
504 if (!n)
505 return;
506 if (n->mincnt == 0 && n->maxcnt == 0)
507 return;
508 if (n->mincnt == 1 && n->maxcnt == 1) {
509 rnode_emitnorep(n, p);
510 return;
512 if (n->mincnt == 0) {
513 int fork = re_insert(p, RI_FORK);
514 p->p[fork].a1 = p->n;
515 jmpend[jmpend_cnt++] = fork;
517 for (i = 0; i < MAX(1, n->mincnt); i++) {
518 last = p->n;
519 rnode_emitnorep(n, p);
521 if (n->maxcnt < 0) {
522 int fork;
523 fork = re_insert(p, RI_FORK);
524 p->p[fork].a1 = last;
525 p->p[fork].a2 = p->n;
527 for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
528 int fork = re_insert(p, RI_FORK);
529 p->p[fork].a1 = p->n;
530 jmpend[jmpend_cnt++] = fork;
531 rnode_emitnorep(n, p);
533 for (i = 0; i < jmpend_cnt; i++)
534 p->p[jmpend[i]].a2 = p->n;
537 int regcomp(regex_t *preg, char *pat, int flg)
539 struct rnode *rnode = rnode_parse(&pat);
540 struct regex *re;
541 int n = rnode_count(rnode) + 3;
542 int mark;
543 if (!rnode)
544 return 1;
545 rnode_grpnum(rnode, 1);
546 re = malloc(sizeof(*re));
547 memset(re, 0, sizeof(*re));
548 re->p = malloc(n * sizeof(re->p[0]));
549 memset(re->p, 0, n * sizeof(re->p[0]));
550 mark = re_insert(re, RI_MARK);
551 re->p[mark].mark = 0;
552 rnode_emit(rnode, re);
553 mark = re_insert(re, RI_MARK);
554 re->p[mark].mark = 1;
555 mark = re_insert(re, RI_MATCH);
556 rnode_free(rnode);
557 re->flg = flg;
558 *preg = re;
559 return 0;
562 void regfree(regex_t *preg)
564 struct regex *re = *preg;
565 int i;
566 for (i = 0; i < re->n; i++)
567 if (re->p[i].ri == RI_ATOM)
568 free(re->p[i].ra.s);
569 free(re->p);
570 free(re);
573 static int re_rec(struct regex *re, struct rstate *rs)
575 struct rinst *ri = NULL;
576 if (rs->dep >= NDEPT)
577 return 1;
578 rs->dep++;
579 while (1) {
580 ri = &re->p[rs->pc];
581 if (ri->ri == RI_ATOM) {
582 if (ratom_match(&ri->ra, rs))
583 return 1;
584 rs->pc++;
585 continue;
587 if (ri->ri == RI_MARK) {
588 if (ri->mark < NGRPS)
589 rs->mark[ri->mark] = rs->s - rs->o;
590 rs->pc++;
591 continue;
593 if (ri->ri == RI_JUMP) {
594 rs->pc = ri->a1;
595 continue;
597 if (ri->ri == RI_FORK) {
598 struct rstate base = *rs;
599 rs->pc = ri->a1;
600 if (!re_rec(re, rs))
601 return 0;
602 *rs = base;
603 rs->pc = ri->a2;
604 continue;
606 break;
608 rs->dep--;
609 return ri->ri != RI_MATCH;
612 static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
614 int i;
615 rs->pc = 0;
616 rs->dep = 0;
617 for (i = 0; i < LEN(rs->mark) && i < nsub * 2; i++)
618 rs->mark[i] = -1;
619 if (!re_rec(re, rs)) {
620 for (i = 0; i < nsub; i++) {
621 psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
622 psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
624 return 0;
626 return 1;
629 int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
631 struct regex *re = *preg;
632 struct rstate rs;
633 memset(&rs, 0, sizeof(rs));
634 rs.flg = re->flg | flg;
635 rs.o = s;
636 while (*s) {
637 rs.s = s;
638 s += uc_len(s);
639 if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
640 return 0;
642 return 1;
645 int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
647 return 0;