regex: assume unknown characters to be word characters
[neatlibc.git] / regex.c
blob8de9234080a83e0d5337334c6c4659ffd71ccbd4
1 #include <ctype.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "regex.h"
7 #define NGRPS 64
8 #define NREPS 128
10 #define MAX(a, b) ((a) < (b) ? (b) : (a))
11 #define LEN(a) (sizeof(a) / sizeof((a)[0]))
13 /* regular expressions atoms */
14 #define RA_CHR '\0' /* character literal */
15 #define RA_BEG '^' /* string start */
16 #define RA_END '$' /* string end */
17 #define RA_ANY '.' /* any character */
18 #define RA_BRK '[' /* bracket expression */
19 #define RA_WBEG '<' /* word start */
20 #define RA_WEND '>' /* word end */
22 /* regular expression node types */
23 #define RN_ATOM '\0' /* regular expression */
24 #define RN_CAT 'c' /* concatenation */
25 #define RN_ALT '|' /* alternate expressions */
26 #define RN_GRP '(' /* pattern group */
28 /* regular expression program instructions */
29 #define RI_ATOM '\0' /* regular expression */
30 #define RI_FORK 'f' /* fork the execution */
31 #define RI_JUMP 'j' /* jump to the given instruction */
32 #define RI_MARK 'm' /* mark the current position */
33 #define RI_MATCH 'q' /* the pattern is matched */
35 /* regular expression atom */
36 struct ratom {
37 int ra; /* atom type (RA_*) */
38 char *s; /* atom argument */
41 /* regular expression instruction */
42 struct rinst {
43 int ri; /* instruction type (RI_*) */
44 struct ratom ra; /* regular expression atom (RI_ATOM) */
45 int a1, a2; /* destination of RE_FORK and RI_JUMP */
46 int mark; /* mark (RI_MARK) */
49 /* regular expression program */
50 struct regex {
51 struct rinst *p; /* the program */
52 int n; /* number of instructions */
53 int grpcnt; /* number of groups */
54 int flg; /* regcomp() flags */
57 /* regular expression matching state */
58 struct rstate {
59 int mark[NGRPS * 2]; /* marks for RI_MARK */
60 int pc; /* program counter */
61 char *s; /* the current position in the string */
62 char *o; /* the beginning of the string */
63 int flg; /* flags passed to regcomp() and regexec() */
66 /* regular expression tree; used for parsing */
67 struct rnode {
68 int rn; /* node type (RN_*) */
69 struct ratom ra; /* regular expression atom (RN_ATOM) */
70 int mincnt, maxcnt; /* number of repetitions */
71 struct rnode *c1, *c2; /* children */
74 static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
76 struct rnode *rnode = malloc(sizeof(*rnode));
77 memset(rnode, 0, sizeof(*rnode));
78 rnode->rn = rn;
79 rnode->c1 = c1;
80 rnode->c2 = c2;
81 rnode->mincnt = 1;
82 rnode->maxcnt = 1;
83 return rnode;
86 static void rnode_free(struct rnode *rnode)
88 if (rnode->c1)
89 rnode_free(rnode->c1);
90 if (rnode->c2)
91 rnode_free(rnode->c2);
92 free(rnode->ra.s);
93 free(rnode);
96 static int uc_len(char *s)
98 int c = (unsigned char) s[0];
99 if (c > 0 && c <= 0x7f)
100 return 1;
101 if (c >= 0xfc)
102 return 6;
103 if (c >= 0xf8)
104 return 5;
105 if (c >= 0xf0)
106 return 4;
107 if (c >= 0xe0)
108 return 3;
109 if (c >= 0xc0)
110 return 2;
111 return c != 0;
114 static int uc_dec(char *s)
116 int result;
117 int l = uc_len(s);
118 if (l <= 1)
119 return (unsigned char) *s;
120 result = (0x3f >> --l) & (unsigned char) *s++;
121 while (l--)
122 result = (result << 6) | ((unsigned char) *s++ & 0x3f);
123 return result;
126 static void ratom_copy(struct ratom *dst, struct ratom *src)
128 dst->ra = src->ra;
129 dst->s = NULL;
130 if (src->s) {
131 int len = strlen(src->s);
132 dst->s = malloc(len + 1);
133 memcpy(dst->s, src->s, len + 1);
137 static void ratom_readbrk(struct ratom *ra, char **pat)
139 char *beg = ++*pat;
140 int len;
141 ra->ra = RA_BRK;
142 if (**pat == '^') /* exclusion mark */
143 (*pat)++;
144 if (**pat) /* handling []a] */
145 (*pat)++;
146 while (**pat && *(*pat)++ != ']')
148 len = *pat - beg - 1;
149 if (len >= 0) {
150 ra->s = malloc(len + 1);
151 memcpy(ra->s, beg, len);
152 ra->s[len] = '\0';
156 static void ratom_read(struct ratom *ra, char **pat)
158 int len;
159 switch ((unsigned char) **pat) {
160 case '.':
161 ra->ra = RA_ANY;
162 (*pat)++;
163 break;
164 case '^':
165 ra->ra = RA_BEG;
166 (*pat)++;
167 break;
168 case '$':
169 ra->ra = RA_END;
170 (*pat)++;
171 break;
172 case '[':
173 ratom_readbrk(ra, pat);
174 break;
175 case '\\':
176 if ((*pat)[1] == '<' || (*pat)[1] == '>') {
177 ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
178 *pat += 2;
179 break;
181 (*pat)++;
182 default:
183 ra->ra = RA_CHR;
184 len = uc_len(*pat);
185 ra->s = malloc(8);
186 memcpy(ra->s, *pat, len);
187 ra->s[len] = '\0';
188 *pat += len;
192 static char *uc_beg(char *beg, char *s)
194 while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
195 s--;
196 return s;
199 static int isword(char *s)
201 int c = (unsigned char) s[0];
202 return isalnum(c) || c == '_' || c > 127;
205 static char *charclasses[][2] = {
206 {":alnum:", "a-zA-Z0-9"},
207 {":alpha:", "a-zA-Z"},
208 {":blank:", " \t"},
209 {":digit:", "0-9"},
210 {":lower:", "a-z"},
211 {":print:", "\x20-\x7e"},
212 {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
213 {":space:", " \t\r\n\v\f"},
214 {":upper:", "A-Z"},
215 {":word:", "a-zA-Z0-9_"},
216 {":xdigit:", "a-fA-F0-9"},
219 static int ratom_match(struct ratom *ra, struct rstate *rs)
221 int i;
222 if (ra->ra == RA_CHR) {
223 int c1 = uc_dec(ra->s);
224 int c2 = uc_dec(rs->s);
225 if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
226 c1 = tolower(c1);
227 if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
228 c2 = tolower(c2);
229 if (c1 != c2)
230 return 1;
231 rs->s += uc_len(ra->s);
232 return 0;
234 if (ra->ra == RA_BRK) {
235 int not = ra->s[0] == '^';
236 char *p = not ? ra->s + 1 : ra->s;
237 int c, clen;
238 int beg, end;
239 if (!rs->s[0])
240 return 1;
241 if (p[0] == ':')
242 for (i = 0; i < LEN(charclasses); i++)
243 if (!strcmp(charclasses[i][0], p))
244 p = charclasses[i][1];
245 c = uc_dec(rs->s);
246 rs->s += uc_len(rs->s);
247 if (rs->flg & REG_ICASE && c < 128 && isupper(c))
248 c = tolower(c);
249 while (*p) {
250 beg = uc_dec(p);
251 p += uc_len(p);
252 end = beg;
253 if (p[0] == '-' && p[1]) {
254 p++;
255 end = uc_dec(p);
256 p += uc_len(p);
258 if (rs->flg & REG_ICASE && beg < 128 && isupper(beg))
259 beg = tolower(beg);
260 if (rs->flg & REG_ICASE && end < 128 && isupper(end))
261 end = tolower(end);
262 if (c >= beg && c <= end)
263 return not;
265 return !not;
267 if (ra->ra == RA_ANY) {
268 if (!rs->s[0])
269 return 1;
270 rs->s += uc_len(rs->s);
271 return 0;
273 if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
274 return rs->s != rs->o;
275 if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
276 return rs->s[0] != '\0';
277 if (ra->ra == RA_WBEG)
278 return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
279 isword(rs->s));
280 if (ra->ra == RA_WEND)
281 return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
282 (!rs->s[0] || !isword(rs->s)));
283 return 1;
286 static struct rnode *rnode_parse(char **pat);
288 static struct rnode *rnode_grp(char **pat)
290 struct rnode *rnode;
291 if ((*pat)[0] != '(')
292 return NULL;
293 *pat += 1;
294 rnode = rnode_parse(pat);
295 if ((*pat)[0] != ')') {
296 rnode_free(rnode);
297 return NULL;
299 *pat += 1;
300 return rnode_make(RN_GRP, rnode, NULL);
303 static struct rnode *rnode_atom(char **pat)
305 struct rnode *rnode;
306 if (!**pat)
307 return NULL;
308 if ((*pat)[0] == '|' || (*pat)[0] == ')')
309 return NULL;
310 if ((*pat)[0] == '(') {
311 rnode = rnode_grp(pat);
312 } else {
313 rnode = rnode_make(RN_ATOM, NULL, NULL);
314 ratom_read(&rnode->ra, pat);
316 if ((*pat)[0] == '*' || (*pat)[0] == '?') {
317 rnode->mincnt = 0;
318 rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
319 (*pat)++;
321 if ((*pat)[0] == '+') {
322 rnode->mincnt = 1;
323 rnode->maxcnt = -1;
324 *pat += 1;
326 if ((*pat)[0] == '{') {
327 rnode->mincnt = 0;
328 rnode->maxcnt = 0;
329 *pat += 1;
330 while (isdigit((unsigned char) **pat))
331 rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
332 if (**pat == ',') {
333 (*pat)++;
334 if ((*pat)[0] == '}')
335 rnode->maxcnt = -1;
336 while (isdigit((unsigned char) **pat))
337 rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
338 } else {
339 rnode->maxcnt = rnode->mincnt;
341 *pat += 1;
342 if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
343 rnode_free(rnode);
344 return NULL;
347 return rnode;
350 static struct rnode *rnode_seq(char **pat)
352 struct rnode *c1 = rnode_atom(pat);
353 struct rnode *c2;
354 if (!c1)
355 return NULL;
356 c2 = rnode_seq(pat);
357 return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
360 static struct rnode *rnode_parse(char **pat)
362 struct rnode *c1 = rnode_seq(pat);
363 struct rnode *c2;
364 if ((*pat)[0] != '|')
365 return c1;
366 *pat += 1;
367 c2 = rnode_parse(pat);
368 return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
371 static int rnode_count(struct rnode *rnode)
373 int n = 1;
374 if (!rnode)
375 return 0;
376 if (rnode->rn == RN_CAT)
377 n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
378 if (rnode->rn == RN_ALT)
379 n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
380 if (rnode->rn == RN_GRP)
381 n = rnode_count(rnode->c1) + 2;
382 if (rnode->mincnt == 0 && rnode->maxcnt == 0)
383 return 0;
384 if (rnode->mincnt == 1 && rnode->maxcnt == 1)
385 return n;
386 if (rnode->maxcnt < 0) {
387 n = (rnode->mincnt + 1) * n + 1;
388 } else {
389 n = (rnode->mincnt + rnode->maxcnt) * n +
390 rnode->maxcnt - rnode->mincnt;
392 if (!rnode->mincnt)
393 n++;
394 return n;
397 static int re_insert(struct regex *p, int ri)
399 p->p[p->n++].ri = ri;
400 return p->n - 1;
403 static void rnode_emit(struct rnode *n, struct regex *p);
405 static void rnode_emitnorep(struct rnode *n, struct regex *p)
407 int fork, done, mark;
408 if (n->rn == RN_ALT) {
409 fork = re_insert(p, RI_FORK);
410 p->p[fork].a1 = p->n;
411 rnode_emit(n->c1, p);
412 done = re_insert(p, RI_JUMP);
413 p->p[fork].a2 = p->n;
414 rnode_emit(n->c2, p);
415 p->p[done].a1 = p->n;
417 if (n->rn == RN_CAT) {
418 rnode_emit(n->c1, p);
419 rnode_emit(n->c2, p);
421 if (n->rn == RN_GRP) {
422 int grp = p->grpcnt++ + 1;
423 mark = re_insert(p, RI_MARK);
424 p->p[mark].mark = 2 * grp;
425 rnode_emit(n->c1, p);
426 mark = re_insert(p, RI_MARK);
427 p->p[mark].mark = 2 * grp + 1;
429 if (n->rn == RN_ATOM) {
430 int atom = re_insert(p, RI_ATOM);
431 ratom_copy(&p->p[atom].ra, &n->ra);
435 static void rnode_emit(struct rnode *n, struct regex *p)
437 int last;
438 int jmpend[NREPS];
439 int jmpend_cnt = 0;
440 int i;
441 if (!n)
442 return;
443 if (n->mincnt == 0 && n->maxcnt == 0)
444 return;
445 if (n->mincnt == 1 && n->maxcnt == 1) {
446 rnode_emitnorep(n, p);
447 return;
449 if (n->mincnt == 0) {
450 int fork = re_insert(p, RI_FORK);
451 p->p[fork].a1 = p->n;
452 jmpend[jmpend_cnt++] = fork;
454 for (i = 0; i < MAX(1, n->mincnt); i++) {
455 last = p->n;
456 rnode_emitnorep(n, p);
458 if (n->maxcnt < 0) {
459 int fork;
460 fork = re_insert(p, RI_FORK);
461 p->p[fork].a1 = last;
462 p->p[fork].a2 = p->n;
464 for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
465 int fork = re_insert(p, RI_FORK);
466 p->p[fork].a1 = p->n;
467 jmpend[jmpend_cnt++] = fork;
468 rnode_emitnorep(n, p);
470 for (i = 0; i < jmpend_cnt; i++)
471 p->p[jmpend[i]].a2 = p->n;
474 int regcomp(regex_t *preg, char *pat, int flg)
476 struct rnode *rnode = rnode_parse(&pat);
477 struct regex *re = malloc(sizeof(*re));
478 int n = rnode_count(rnode) + 3;
479 int mark;
480 memset(re, 0, sizeof(*re));
481 re->p = malloc(n * sizeof(re->p[0]));
482 memset(re->p, 0, n * sizeof(re->p[0]));
483 mark = re_insert(re, RI_MARK);
484 re->p[mark].mark = 0;
485 rnode_emit(rnode, re);
486 mark = re_insert(re, RI_MARK);
487 re->p[mark].mark = 1;
488 mark = re_insert(re, RI_MATCH);
489 rnode_free(rnode);
490 re->flg = flg;
491 *preg = re;
492 return 0;
495 void regfree(regex_t *preg)
497 struct regex *re = *preg;
498 int i;
499 for (i = 0; i < re->n; i++)
500 if (re->p[i].ri == RI_ATOM)
501 free(re->p[i].ra.s);
502 free(re->p);
503 free(re);
506 static int re_rec(struct regex *re, struct rstate *rs)
508 struct rinst *ri = &re->p[rs->pc];
509 if (ri->ri == RI_ATOM) {
510 if (ratom_match(&ri->ra, rs))
511 return 1;
512 rs->pc++;
513 return re_rec(re, rs);
515 if (ri->ri == RI_MARK) {
516 if (ri->mark < NGRPS)
517 rs->mark[ri->mark] = rs->s - rs->o;
518 rs->pc++;
519 return re_rec(re, rs);
521 if (ri->ri == RI_JUMP) {
522 rs->pc = ri->a1;
523 return re_rec(re, rs);
525 if (ri->ri == RI_FORK) {
526 struct rstate base = *rs;
527 rs->pc = ri->a1;
528 if (!re_rec(re, rs))
529 return 0;
530 *rs = base;
531 rs->pc = ri->a2;
532 return re_rec(re, rs);
534 if (ri->ri == RI_MATCH)
535 return 0;
536 return 1;
539 static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
541 int i;
542 rs->pc = 0;
543 for (i = 0; i < LEN(rs->mark); i++)
544 rs->mark[i] = -1;
545 if (!re_rec(re, rs)) {
546 for (i = 0; i < nsub; i++) {
547 psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
548 psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
550 return 0;
552 return 1;
555 int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
557 struct regex *re = *preg;
558 struct rstate rs;
559 memset(&rs, 0, sizeof(rs));
560 rs.flg = re->flg | flg;
561 rs.o = s;
562 while (*s) {
563 rs.s = s++;
564 if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
565 return 0;
567 return 1;
570 int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
572 return 0;