uc: make uc_dec() more compact by removing its loop
[neatvi.git] / regex.c
bloba2a6653240fbd884f0a5733922c2fb94fa9d8ea9
1 #include <ctype.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "regex.h"
7 #define NGRPS 64 /* maximum number of groups */
8 #define NREPS 128 /* maximum repetitions */
9 #define NDEPT 256 /* re_rec() recursion depth limit */
11 #define MAX(a, b) ((a) < (b) ? (b) : (a))
12 #define LEN(a) (sizeof(a) / sizeof((a)[0]))
14 /* regular expressions atoms */
15 #define RA_CHR '\0' /* character literal */
16 #define RA_BEG '^' /* string start */
17 #define RA_END '$' /* string end */
18 #define RA_ANY '.' /* any character */
19 #define RA_BRK '[' /* bracket expression */
20 #define RA_WBEG '<' /* word start */
21 #define RA_WEND '>' /* word end */
23 /* regular expression node types */
24 #define RN_ATOM '\0' /* regular expression */
25 #define RN_CAT 'c' /* concatenation */
26 #define RN_ALT '|' /* alternate expressions */
27 #define RN_GRP '(' /* pattern group */
29 /* regular expression program instructions */
30 #define RI_ATOM '\0' /* regular expression */
31 #define RI_FORK 'f' /* fork the execution */
32 #define RI_JUMP 'j' /* jump to the given instruction */
33 #define RI_MARK 'm' /* mark the current position */
34 #define RI_MATCH 'q' /* the pattern is matched */
36 /* regular expression atom */
37 struct ratom {
38 int ra; /* atom type (RA_*) */
39 char *s; /* atom argument */
42 /* regular expression instruction */
43 struct rinst {
44 int ri; /* instruction type (RI_*) */
45 struct ratom ra; /* regular expression atom (RI_ATOM) */
46 int a1, a2; /* destination of RI_FORK and RI_JUMP */
47 int mark; /* mark (RI_MARK) */
50 /* regular expression program */
51 struct regex {
52 struct rinst *p; /* the program */
53 int n; /* number of instructions */
54 int grpcnt; /* number of groups */
55 int flg; /* regcomp() flags */
58 /* regular expression matching state */
59 struct rstate {
60 int mark[NGRPS * 2]; /* marks for RI_MARK */
61 int pc; /* program counter */
62 char *s; /* the current position in the string */
63 char *o; /* the beginning of the string */
64 int flg; /* flags passed to regcomp() and regexec() */
65 int dep; /* re_rec() depth */
68 /* regular expression tree; used for parsing */
69 struct rnode {
70 int rn; /* node type (RN_*) */
71 struct ratom ra; /* regular expression atom (RN_ATOM) */
72 int mincnt, maxcnt; /* number of repetitions */
73 struct rnode *c1, *c2; /* children */
76 static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
78 struct rnode *rnode = malloc(sizeof(*rnode));
79 memset(rnode, 0, sizeof(*rnode));
80 rnode->rn = rn;
81 rnode->c1 = c1;
82 rnode->c2 = c2;
83 rnode->mincnt = 1;
84 rnode->maxcnt = 1;
85 return rnode;
88 static void rnode_free(struct rnode *rnode)
90 if (rnode->c1)
91 rnode_free(rnode->c1);
92 if (rnode->c2)
93 rnode_free(rnode->c2);
94 free(rnode->ra.s);
95 free(rnode);
98 static int uc_len(char *s)
100 int c = (unsigned char) s[0];
101 if (~c & 0x80) /* ASCII */
102 return c > 0;
103 if (~c & 0x40) /* invalid UTF-8 */
104 return 1;
105 if (~c & 0x20)
106 return 2;
107 if (~c & 0x10)
108 return 3;
109 if (~c & 0x08)
110 return 4;
111 return 1;
114 static int uc_dec(char *s)
116 int c = (unsigned char) s[0];
117 if (!(c & 0x80))
118 return c;
119 if (!(c & 0x20))
120 return ((c & 0x1f) << 6) | (s[1] & 0x3f);
121 if (!(c & 0x10))
122 return ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
123 if (!(c & 0x08))
124 return ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
125 return c;
128 static void ratom_copy(struct ratom *dst, struct ratom *src)
130 dst->ra = src->ra;
131 dst->s = NULL;
132 if (src->s) {
133 int len = strlen(src->s);
134 dst->s = malloc(len + 1);
135 memcpy(dst->s, src->s, len + 1);
139 static int brk_len(char *s)
141 int n = 1;
142 if (s[n] == '^') /* exclusion mark */
143 n++;
144 if (s[n] == ']') /* handling []a] */
145 n++;
146 while (s[n] && s[n] != ']') {
147 if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
148 while (s[n] && s[n] != ']')
149 n++;
150 if (s[n])
151 n++;
153 return s[n] == ']' ? n + 1 : n;
156 static void ratom_readbrk(struct ratom *ra, char **pat)
158 int len = brk_len(*pat);
159 ra->ra = RA_BRK;
160 ra->s = malloc(len + 1);
161 memcpy(ra->s, *pat, len);
162 ra->s[len] = '\0';
163 *pat += len;
166 static void ratom_read(struct ratom *ra, char **pat)
168 int len;
169 switch ((unsigned char) **pat) {
170 case '.':
171 ra->ra = RA_ANY;
172 (*pat)++;
173 break;
174 case '^':
175 ra->ra = RA_BEG;
176 (*pat)++;
177 break;
178 case '$':
179 ra->ra = RA_END;
180 (*pat)++;
181 break;
182 case '[':
183 ratom_readbrk(ra, pat);
184 break;
185 case '\\':
186 if ((*pat)[1] == '<' || (*pat)[1] == '>') {
187 ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
188 *pat += 2;
189 break;
191 (*pat)++;
192 default:
193 ra->ra = RA_CHR;
194 len = uc_len(*pat);
195 ra->s = malloc(8);
196 memcpy(ra->s, *pat, len);
197 ra->s[len] = '\0';
198 *pat += len;
202 static char *uc_beg(char *beg, char *s)
204 while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
205 s--;
206 return s;
209 static int isword(char *s)
211 int c = (unsigned char) s[0];
212 return isalnum(c) || c == '_' || c > 127;
215 static char *brk_classes[][2] = {
216 {":alnum:", "a-zA-Z0-9"},
217 {":alpha:", "a-zA-Z"},
218 {":blank:", " \t"},
219 {":digit:", "0-9"},
220 {":lower:", "a-z"},
221 {":print:", "\x20-\x7e"},
222 {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
223 {":space:", " \t\r\n\v\f"},
224 {":upper:", "A-Z"},
225 {":word:", "a-zA-Z0-9_"},
226 {":xdigit:", "a-fA-F0-9"},
229 static int brk_match(char *brk, int c, int flg)
231 int beg, end;
232 int i;
233 int not = brk[0] == '^';
234 char *p = not ? brk + 1 : brk;
235 char *p0 = p;
236 if (flg & REG_ICASE && c < 128 && isupper(c))
237 c = tolower(c);
238 while (*p && (p == p0 || *p != ']')) {
239 if (p[0] == '[' && p[1] == ':') {
240 for (i = 0; i < LEN(brk_classes); i++) {
241 char *cc = brk_classes[i][0];
242 char *cp = brk_classes[i][1];
243 if (!strncmp(cc, p + 1, strlen(cc)))
244 if (!brk_match(cp, c, flg))
245 return not;
247 p += brk_len(p);
248 continue;
250 beg = uc_dec(p);
251 p += uc_len(p);
252 end = beg;
253 if (p[0] == '-' && p[1] && p[1] != ']') {
254 p++;
255 end = uc_dec(p);
256 p += uc_len(p);
258 if (flg & REG_ICASE && beg < 128 && isupper(beg))
259 beg = tolower(beg);
260 if (flg & REG_ICASE && end < 128 && isupper(end))
261 end = tolower(end);
262 if (c >= beg && c <= end)
263 return not;
265 return !not;
268 static int ratom_match(struct ratom *ra, struct rstate *rs)
270 if (ra->ra == RA_CHR) {
271 int c1 = uc_dec(ra->s);
272 int c2 = uc_dec(rs->s);
273 if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
274 c1 = tolower(c1);
275 if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
276 c2 = tolower(c2);
277 if (c1 != c2)
278 return 1;
279 rs->s += uc_len(ra->s);
280 return 0;
282 if (ra->ra == RA_ANY) {
283 if (!rs->s[0] || (rs->s[0] == '\n' && !(rs->flg & REG_NOTEOL)))
284 return 1;
285 rs->s += uc_len(rs->s);
286 return 0;
288 if (ra->ra == RA_BRK) {
289 int c = uc_dec(rs->s);
290 if (!c || (c == '\n' && !(rs->flg & REG_NOTEOL)))
291 return 1;
292 rs->s += uc_len(rs->s);
293 return brk_match(ra->s + 1, c, rs->flg);
295 if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
296 return !(rs->s == rs->o || rs->s[-1] == '\n');
297 if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
298 return rs->s[0] != '\0' && rs->s[0] != '\n';
299 if (ra->ra == RA_WBEG)
300 return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
301 isword(rs->s));
302 if (ra->ra == RA_WEND)
303 return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
304 (!rs->s[0] || !isword(rs->s)));
305 return 1;
308 static struct rnode *rnode_parse(char **pat);
310 static struct rnode *rnode_grp(char **pat)
312 struct rnode *rnode;
313 if ((*pat)[0] != '(')
314 return NULL;
315 *pat += 1;
316 rnode = rnode_parse(pat);
317 if (!rnode)
318 return NULL;
319 if ((*pat)[0] != ')') {
320 rnode_free(rnode);
321 return NULL;
323 *pat += 1;
324 return rnode_make(RN_GRP, rnode, NULL);
327 static struct rnode *rnode_atom(char **pat)
329 struct rnode *rnode;
330 if (!**pat)
331 return NULL;
332 if ((*pat)[0] == '|' || (*pat)[0] == ')')
333 return NULL;
334 if ((*pat)[0] == '(') {
335 rnode = rnode_grp(pat);
336 } else {
337 rnode = rnode_make(RN_ATOM, NULL, NULL);
338 ratom_read(&rnode->ra, pat);
340 if (!rnode)
341 return NULL;
342 if ((*pat)[0] == '*' || (*pat)[0] == '?') {
343 rnode->mincnt = 0;
344 rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
345 *pat += 1;
347 if ((*pat)[0] == '+') {
348 rnode->mincnt = 1;
349 rnode->maxcnt = -1;
350 *pat += 1;
352 if ((*pat)[0] == '{') {
353 rnode->mincnt = 0;
354 rnode->maxcnt = 0;
355 *pat += 1;
356 while (isdigit((unsigned char) **pat))
357 rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
358 if (**pat == ',') {
359 (*pat)++;
360 if ((*pat)[0] == '}')
361 rnode->maxcnt = -1;
362 while (isdigit((unsigned char) **pat))
363 rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
364 } else {
365 rnode->maxcnt = rnode->mincnt;
367 *pat += 1;
368 if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
369 rnode_free(rnode);
370 return NULL;
373 return rnode;
376 static struct rnode *rnode_seq(char **pat)
378 struct rnode *c1 = rnode_atom(pat);
379 struct rnode *c2;
380 if (!c1)
381 return NULL;
382 c2 = rnode_seq(pat);
383 return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
386 static struct rnode *rnode_parse(char **pat)
388 struct rnode *c1 = rnode_seq(pat);
389 struct rnode *c2;
390 if ((*pat)[0] != '|')
391 return c1;
392 *pat += 1;
393 c2 = rnode_parse(pat);
394 return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
397 static int rnode_count(struct rnode *rnode)
399 int n = 1;
400 if (!rnode)
401 return 0;
402 if (rnode->rn == RN_CAT)
403 n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
404 if (rnode->rn == RN_ALT)
405 n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
406 if (rnode->rn == RN_GRP)
407 n = rnode_count(rnode->c1) + 2;
408 if (rnode->mincnt == 0 && rnode->maxcnt == 0)
409 return 0;
410 if (rnode->mincnt == 1 && rnode->maxcnt == 1)
411 return n;
412 if (rnode->maxcnt < 0) {
413 n = (rnode->mincnt + 1) * n + 1;
414 } else {
415 n = (rnode->mincnt + rnode->maxcnt) * n +
416 rnode->maxcnt - rnode->mincnt;
418 if (!rnode->mincnt)
419 n++;
420 return n;
423 static int re_insert(struct regex *p, int ri)
425 p->p[p->n++].ri = ri;
426 return p->n - 1;
429 static void rnode_emit(struct rnode *n, struct regex *p);
431 static void rnode_emitnorep(struct rnode *n, struct regex *p)
433 int fork, done, mark;
434 if (n->rn == RN_ALT) {
435 fork = re_insert(p, RI_FORK);
436 p->p[fork].a1 = p->n;
437 rnode_emit(n->c1, p);
438 done = re_insert(p, RI_JUMP);
439 p->p[fork].a2 = p->n;
440 rnode_emit(n->c2, p);
441 p->p[done].a1 = p->n;
443 if (n->rn == RN_CAT) {
444 rnode_emit(n->c1, p);
445 rnode_emit(n->c2, p);
447 if (n->rn == RN_GRP) {
448 int grp = p->grpcnt++ + 1;
449 mark = re_insert(p, RI_MARK);
450 p->p[mark].mark = 2 * grp;
451 rnode_emit(n->c1, p);
452 mark = re_insert(p, RI_MARK);
453 p->p[mark].mark = 2 * grp + 1;
455 if (n->rn == RN_ATOM) {
456 int atom = re_insert(p, RI_ATOM);
457 ratom_copy(&p->p[atom].ra, &n->ra);
461 static void rnode_emit(struct rnode *n, struct regex *p)
463 int last;
464 int jmpend[NREPS];
465 int jmpend_cnt = 0;
466 int i;
467 if (!n)
468 return;
469 if (n->mincnt == 0 && n->maxcnt == 0)
470 return;
471 if (n->mincnt == 1 && n->maxcnt == 1) {
472 rnode_emitnorep(n, p);
473 return;
475 if (n->mincnt == 0) {
476 int fork = re_insert(p, RI_FORK);
477 p->p[fork].a1 = p->n;
478 jmpend[jmpend_cnt++] = fork;
480 for (i = 0; i < MAX(1, n->mincnt); i++) {
481 last = p->n;
482 rnode_emitnorep(n, p);
484 if (n->maxcnt < 0) {
485 int fork;
486 fork = re_insert(p, RI_FORK);
487 p->p[fork].a1 = last;
488 p->p[fork].a2 = p->n;
490 for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
491 int fork = re_insert(p, RI_FORK);
492 p->p[fork].a1 = p->n;
493 jmpend[jmpend_cnt++] = fork;
494 rnode_emitnorep(n, p);
496 for (i = 0; i < jmpend_cnt; i++)
497 p->p[jmpend[i]].a2 = p->n;
500 int regcomp(regex_t *preg, char *pat, int flg)
502 struct rnode *rnode = rnode_parse(&pat);
503 struct regex *re;
504 int n = rnode_count(rnode) + 3;
505 int mark;
506 if (!rnode)
507 return 1;
508 re = malloc(sizeof(*re));
509 memset(re, 0, sizeof(*re));
510 re->p = malloc(n * sizeof(re->p[0]));
511 memset(re->p, 0, n * sizeof(re->p[0]));
512 mark = re_insert(re, RI_MARK);
513 re->p[mark].mark = 0;
514 rnode_emit(rnode, re);
515 mark = re_insert(re, RI_MARK);
516 re->p[mark].mark = 1;
517 mark = re_insert(re, RI_MATCH);
518 rnode_free(rnode);
519 re->flg = flg;
520 *preg = re;
521 return 0;
524 void regfree(regex_t *preg)
526 struct regex *re = *preg;
527 int i;
528 for (i = 0; i < re->n; i++)
529 if (re->p[i].ri == RI_ATOM)
530 free(re->p[i].ra.s);
531 free(re->p);
532 free(re);
535 static int re_rec(struct regex *re, struct rstate *rs)
537 struct rinst *ri = NULL;
538 if (++(rs->dep) >= NDEPT)
539 return 1;
540 while (1) {
541 ri = &re->p[rs->pc];
542 if (ri->ri == RI_ATOM) {
543 if (ratom_match(&ri->ra, rs))
544 return 1;
545 rs->pc++;
546 continue;
548 if (ri->ri == RI_MARK) {
549 if (ri->mark < NGRPS)
550 rs->mark[ri->mark] = rs->s - rs->o;
551 rs->pc++;
552 continue;
554 if (ri->ri == RI_JUMP) {
555 rs->pc = ri->a1;
556 continue;
558 if (ri->ri == RI_FORK) {
559 struct rstate base = *rs;
560 rs->pc = ri->a1;
561 if (!re_rec(re, rs))
562 return 0;
563 *rs = base;
564 rs->pc = ri->a2;
565 continue;
567 break;
569 return ri->ri != RI_MATCH;
572 static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
574 int i;
575 rs->pc = 0;
576 for (i = 0; i < LEN(rs->mark); i++)
577 rs->mark[i] = -1;
578 rs->dep = 0;
579 if (!re_rec(re, rs)) {
580 for (i = 0; i < nsub; i++) {
581 psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
582 psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
584 return 0;
586 return 1;
589 int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
591 struct regex *re = *preg;
592 struct rstate rs;
593 memset(&rs, 0, sizeof(rs));
594 rs.flg = re->flg | flg;
595 rs.o = s;
596 while (*s) {
597 rs.s = s;
598 s += uc_len(s);
599 if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
600 return 0;
602 return 1;
605 int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
607 return 0;