ex: preserve cursor position in :g
[neatvi.git] / regex.c
blob3c13b3d66c0a99384f0b4372838a520d7e675703
1 #include <ctype.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "regex.h"
7 #define NGRPS 64
8 #define NREPS 128
10 #define MAX(a, b) ((a) < (b) ? (b) : (a))
11 #define LEN(a) (sizeof(a) / sizeof((a)[0]))
13 /* regular expressions atoms */
14 #define RA_CHR '\0' /* character literal */
15 #define RA_BEG '^' /* string start */
16 #define RA_END '$' /* string end */
17 #define RA_ANY '.' /* any character */
18 #define RA_BRK '[' /* bracket expression */
19 #define RA_WBEG '<' /* word start */
20 #define RA_WEND '>' /* word end */
22 /* regular expression node types */
23 #define RN_ATOM '\0' /* regular expression */
24 #define RN_CAT 'c' /* concatenation */
25 #define RN_ALT '|' /* alternate expressions */
26 #define RN_GRP '(' /* pattern group */
28 /* regular expression program instructions */
29 #define RI_ATOM '\0' /* regular expression */
30 #define RI_FORK 'f' /* fork the execution */
31 #define RI_JUMP 'j' /* jump to the given instruction */
32 #define RI_MARK 'm' /* mark the current position */
33 #define RI_MATCH 'q' /* the pattern is matched */
35 /* regular expression atom */
36 struct ratom {
37 int ra; /* atom type (RA_*) */
38 char *s; /* atom argument */
41 /* regular expression instruction */
42 struct rinst {
43 int ri; /* instruction type (RI_*) */
44 struct ratom ra; /* regular expression atom (RI_ATOM) */
45 int a1, a2; /* destination of RE_FORK and RI_JUMP */
46 int mark; /* mark (RI_MARK) */
49 /* regular expression program */
50 struct regex {
51 struct rinst *p; /* the program */
52 int n; /* number of instructions */
53 int grpcnt; /* number of groups */
54 int flg; /* regcomp() flags */
57 /* regular expression matching state */
58 struct rstate {
59 int mark[NGRPS * 2]; /* marks for RI_MARK */
60 int pc; /* program counter */
61 char *s; /* the current position in the string */
62 char *o; /* the beginning of the string */
63 int flg; /* flags passed to regcomp() and regexec() */
66 /* regular expression tree; used for parsing */
67 struct rnode {
68 int rn; /* node type (RN_*) */
69 struct ratom ra; /* regular expression atom (RN_ATOM) */
70 int mincnt, maxcnt; /* number of repetitions */
71 struct rnode *c1, *c2; /* children */
74 static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
76 struct rnode *rnode = malloc(sizeof(*rnode));
77 memset(rnode, 0, sizeof(*rnode));
78 rnode->rn = rn;
79 rnode->c1 = c1;
80 rnode->c2 = c2;
81 rnode->mincnt = 1;
82 rnode->maxcnt = 1;
83 return rnode;
86 static void rnode_free(struct rnode *rnode)
88 if (rnode->c1)
89 rnode_free(rnode->c1);
90 if (rnode->c2)
91 rnode_free(rnode->c2);
92 free(rnode->ra.s);
93 free(rnode);
96 static int uc_len(char *s)
98 int c = (unsigned char) s[0];
99 if (~c & 0x80) /* ASCII */
100 return c > 0;
101 if (~c & 0x40) /* invalid UTF-8 */
102 return 1;
103 if (~c & 0x20)
104 return 2;
105 if (~c & 0x10)
106 return 3;
107 if (~c & 0x08)
108 return 4;
109 return 1;
112 static int uc_dec(char *s)
114 int result;
115 int l = uc_len(s);
116 if (l <= 1)
117 return (unsigned char) *s;
118 result = (0x3f >> --l) & (unsigned char) *s++;
119 while (l--)
120 result = (result << 6) | ((unsigned char) *s++ & 0x3f);
121 return result;
124 static void ratom_copy(struct ratom *dst, struct ratom *src)
126 dst->ra = src->ra;
127 dst->s = NULL;
128 if (src->s) {
129 int len = strlen(src->s);
130 dst->s = malloc(len + 1);
131 memcpy(dst->s, src->s, len + 1);
135 static int brk_len(char *s)
137 int n = 1;
138 if (s[n] == '^') /* exclusion mark */
139 n++;
140 if (s[n] == ']') /* handling []a] */
141 n++;
142 while (s[n] && s[n] != ']') {
143 if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
144 while (s[n] && s[n] != ']')
145 n++;
146 if (s[n])
147 n++;
149 return s[n] == ']' ? n + 1 : n;
152 static void ratom_readbrk(struct ratom *ra, char **pat)
154 int len = brk_len(*pat);
155 ra->ra = RA_BRK;
156 ra->s = malloc(len + 1);
157 memcpy(ra->s, *pat, len);
158 ra->s[len] = '\0';
159 *pat += len;
162 static void ratom_read(struct ratom *ra, char **pat)
164 int len;
165 switch ((unsigned char) **pat) {
166 case '.':
167 ra->ra = RA_ANY;
168 (*pat)++;
169 break;
170 case '^':
171 ra->ra = RA_BEG;
172 (*pat)++;
173 break;
174 case '$':
175 ra->ra = RA_END;
176 (*pat)++;
177 break;
178 case '[':
179 ratom_readbrk(ra, pat);
180 break;
181 case '\\':
182 if ((*pat)[1] == '<' || (*pat)[1] == '>') {
183 ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
184 *pat += 2;
185 break;
187 (*pat)++;
188 default:
189 ra->ra = RA_CHR;
190 len = uc_len(*pat);
191 ra->s = malloc(8);
192 memcpy(ra->s, *pat, len);
193 ra->s[len] = '\0';
194 *pat += len;
198 static char *uc_beg(char *beg, char *s)
200 while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
201 s--;
202 return s;
205 static int isword(char *s)
207 int c = (unsigned char) s[0];
208 return isalnum(c) || c == '_' || c > 127;
211 static char *brk_classes[][2] = {
212 {":alnum:", "a-zA-Z0-9"},
213 {":alpha:", "a-zA-Z"},
214 {":blank:", " \t"},
215 {":digit:", "0-9"},
216 {":lower:", "a-z"},
217 {":print:", "\x20-\x7e"},
218 {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
219 {":space:", " \t\r\n\v\f"},
220 {":upper:", "A-Z"},
221 {":word:", "a-zA-Z0-9_"},
222 {":xdigit:", "a-fA-F0-9"},
225 static int brk_match(char *brk, int c, int flg)
227 int beg, end;
228 int i;
229 int not = brk[0] == '^';
230 char *p = not ? brk + 1 : brk;
231 char *p0 = p;
232 if (flg & REG_ICASE && c < 128 && isupper(c))
233 c = tolower(c);
234 while (*p && (p == p0 || *p != ']')) {
235 if (p[0] == '[' && p[1] == ':') {
236 for (i = 0; i < LEN(brk_classes); i++) {
237 char *cc = brk_classes[i][0];
238 char *cp = brk_classes[i][1];
239 if (!strncmp(cc, p + 1, strlen(cc)))
240 if (!brk_match(cp, c, flg))
241 return not;
243 p += brk_len(p);
244 continue;
246 beg = uc_dec(p);
247 p += uc_len(p);
248 end = beg;
249 if (p[0] == '-' && p[1] && p[1] != ']') {
250 p++;
251 end = uc_dec(p);
252 p += uc_len(p);
254 if (flg & REG_ICASE && beg < 128 && isupper(beg))
255 beg = tolower(beg);
256 if (flg & REG_ICASE && end < 128 && isupper(end))
257 end = tolower(end);
258 if (c >= beg && c <= end)
259 return not;
261 return !not;
264 static int ratom_match(struct ratom *ra, struct rstate *rs)
266 if (ra->ra == RA_CHR) {
267 int c1 = uc_dec(ra->s);
268 int c2 = uc_dec(rs->s);
269 if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
270 c1 = tolower(c1);
271 if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
272 c2 = tolower(c2);
273 if (c1 != c2)
274 return 1;
275 rs->s += uc_len(ra->s);
276 return 0;
278 if (ra->ra == RA_ANY) {
279 if (!rs->s[0])
280 return 1;
281 rs->s += uc_len(rs->s);
282 return 0;
284 if (ra->ra == RA_BRK) {
285 int c = uc_dec(rs->s);
286 if (!c)
287 return 1;
288 rs->s += uc_len(rs->s);
289 return brk_match(ra->s + 1, c, rs->flg);
291 if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
292 return !(rs->s == rs->o || rs->s[-1] == '\n');
293 if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
294 return rs->s[0] != '\0' && rs->s[0] != '\n';
295 if (ra->ra == RA_WBEG)
296 return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
297 isword(rs->s));
298 if (ra->ra == RA_WEND)
299 return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
300 (!rs->s[0] || !isword(rs->s)));
301 return 1;
304 static struct rnode *rnode_parse(char **pat);
306 static struct rnode *rnode_grp(char **pat)
308 struct rnode *rnode;
309 if ((*pat)[0] != '(')
310 return NULL;
311 *pat += 1;
312 rnode = rnode_parse(pat);
313 if (!rnode)
314 return NULL;
315 if ((*pat)[0] != ')') {
316 rnode_free(rnode);
317 return NULL;
319 *pat += 1;
320 return rnode_make(RN_GRP, rnode, NULL);
323 static struct rnode *rnode_atom(char **pat)
325 struct rnode *rnode;
326 if (!**pat)
327 return NULL;
328 if ((*pat)[0] == '|' || (*pat)[0] == ')')
329 return NULL;
330 if ((*pat)[0] == '(') {
331 rnode = rnode_grp(pat);
332 } else {
333 rnode = rnode_make(RN_ATOM, NULL, NULL);
334 ratom_read(&rnode->ra, pat);
336 if (!rnode)
337 return NULL;
338 if ((*pat)[0] == '*' || (*pat)[0] == '?') {
339 rnode->mincnt = 0;
340 rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
341 (*pat)++;
343 if ((*pat)[0] == '+') {
344 rnode->mincnt = 1;
345 rnode->maxcnt = NREPS;
346 *pat += 1;
348 if ((*pat)[0] == '{') {
349 rnode->mincnt = 0;
350 rnode->maxcnt = 0;
351 *pat += 1;
352 while (isdigit((unsigned char) **pat))
353 rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
354 if (**pat == ',') {
355 (*pat)++;
356 if ((*pat)[0] == '}')
357 rnode->maxcnt = NREPS;
358 while (isdigit((unsigned char) **pat))
359 rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
360 } else {
361 rnode->maxcnt = rnode->mincnt;
363 *pat += 1;
364 if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
365 rnode_free(rnode);
366 return NULL;
369 return rnode;
372 static struct rnode *rnode_seq(char **pat)
374 struct rnode *c1 = rnode_atom(pat);
375 struct rnode *c2;
376 if (!c1)
377 return NULL;
378 c2 = rnode_seq(pat);
379 return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
382 static struct rnode *rnode_parse(char **pat)
384 struct rnode *c1 = rnode_seq(pat);
385 struct rnode *c2;
386 if ((*pat)[0] != '|')
387 return c1;
388 *pat += 1;
389 c2 = rnode_parse(pat);
390 return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
393 static int rnode_count(struct rnode *rnode)
395 int n = 1;
396 if (!rnode)
397 return 0;
398 if (rnode->rn == RN_CAT)
399 n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
400 if (rnode->rn == RN_ALT)
401 n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
402 if (rnode->rn == RN_GRP)
403 n = rnode_count(rnode->c1) + 2;
404 if (rnode->mincnt == 0 && rnode->maxcnt == 0)
405 return 0;
406 if (rnode->mincnt == 1 && rnode->maxcnt == 1)
407 return n;
408 if (rnode->maxcnt < 0) {
409 n = (rnode->mincnt + 1) * n + 1;
410 } else {
411 n = (rnode->mincnt + rnode->maxcnt) * n +
412 rnode->maxcnt - rnode->mincnt;
414 if (!rnode->mincnt)
415 n++;
416 return n;
419 static int re_insert(struct regex *p, int ri)
421 p->p[p->n++].ri = ri;
422 return p->n - 1;
425 static void rnode_emit(struct rnode *n, struct regex *p);
427 static void rnode_emitnorep(struct rnode *n, struct regex *p)
429 int fork, done, mark;
430 if (n->rn == RN_ALT) {
431 fork = re_insert(p, RI_FORK);
432 p->p[fork].a1 = p->n;
433 rnode_emit(n->c1, p);
434 done = re_insert(p, RI_JUMP);
435 p->p[fork].a2 = p->n;
436 rnode_emit(n->c2, p);
437 p->p[done].a1 = p->n;
439 if (n->rn == RN_CAT) {
440 rnode_emit(n->c1, p);
441 rnode_emit(n->c2, p);
443 if (n->rn == RN_GRP) {
444 int grp = p->grpcnt++ + 1;
445 mark = re_insert(p, RI_MARK);
446 p->p[mark].mark = 2 * grp;
447 rnode_emit(n->c1, p);
448 mark = re_insert(p, RI_MARK);
449 p->p[mark].mark = 2 * grp + 1;
451 if (n->rn == RN_ATOM) {
452 int atom = re_insert(p, RI_ATOM);
453 ratom_copy(&p->p[atom].ra, &n->ra);
457 static void rnode_emit(struct rnode *n, struct regex *p)
459 int last;
460 int jmpend[NREPS];
461 int jmpend_cnt = 0;
462 int i;
463 if (!n)
464 return;
465 if (n->mincnt == 0 && n->maxcnt == 0)
466 return;
467 if (n->mincnt == 1 && n->maxcnt == 1) {
468 rnode_emitnorep(n, p);
469 return;
471 if (n->mincnt == 0) {
472 int fork = re_insert(p, RI_FORK);
473 p->p[fork].a1 = p->n;
474 jmpend[jmpend_cnt++] = fork;
476 for (i = 0; i < MAX(1, n->mincnt); i++) {
477 last = p->n;
478 rnode_emitnorep(n, p);
480 if (n->maxcnt < 0) {
481 int fork;
482 fork = re_insert(p, RI_FORK);
483 p->p[fork].a1 = last;
484 p->p[fork].a2 = p->n;
486 for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
487 int fork = re_insert(p, RI_FORK);
488 p->p[fork].a1 = p->n;
489 jmpend[jmpend_cnt++] = fork;
490 rnode_emitnorep(n, p);
492 for (i = 0; i < jmpend_cnt; i++)
493 p->p[jmpend[i]].a2 = p->n;
496 int regcomp(regex_t *preg, char *pat, int flg)
498 struct rnode *rnode = rnode_parse(&pat);
499 struct regex *re;
500 int n = rnode_count(rnode) + 3;
501 int mark;
502 if (!rnode)
503 return 1;
504 re = malloc(sizeof(*re));
505 memset(re, 0, sizeof(*re));
506 re->p = malloc(n * sizeof(re->p[0]));
507 memset(re->p, 0, n * sizeof(re->p[0]));
508 mark = re_insert(re, RI_MARK);
509 re->p[mark].mark = 0;
510 rnode_emit(rnode, re);
511 mark = re_insert(re, RI_MARK);
512 re->p[mark].mark = 1;
513 mark = re_insert(re, RI_MATCH);
514 rnode_free(rnode);
515 re->flg = flg;
516 *preg = re;
517 return 0;
520 void regfree(regex_t *preg)
522 struct regex *re = *preg;
523 int i;
524 for (i = 0; i < re->n; i++)
525 if (re->p[i].ri == RI_ATOM)
526 free(re->p[i].ra.s);
527 free(re->p);
528 free(re);
531 static int re_rec(struct regex *re, struct rstate *rs)
533 struct rinst *ri = &re->p[rs->pc];
534 if (ri->ri == RI_ATOM) {
535 if (ratom_match(&ri->ra, rs))
536 return 1;
537 rs->pc++;
538 return re_rec(re, rs);
540 if (ri->ri == RI_MARK) {
541 if (ri->mark < NGRPS)
542 rs->mark[ri->mark] = rs->s - rs->o;
543 rs->pc++;
544 return re_rec(re, rs);
546 if (ri->ri == RI_JUMP) {
547 rs->pc = ri->a1;
548 return re_rec(re, rs);
550 if (ri->ri == RI_FORK) {
551 struct rstate base = *rs;
552 rs->pc = ri->a1;
553 if (!re_rec(re, rs))
554 return 0;
555 *rs = base;
556 rs->pc = ri->a2;
557 return re_rec(re, rs);
559 if (ri->ri == RI_MATCH)
560 return 0;
561 return 1;
564 static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
566 int i;
567 rs->pc = 0;
568 for (i = 0; i < LEN(rs->mark); i++)
569 rs->mark[i] = -1;
570 if (!re_rec(re, rs)) {
571 for (i = 0; i < nsub; i++) {
572 psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
573 psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
575 return 0;
577 return 1;
580 int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
582 struct regex *re = *preg;
583 struct rstate rs;
584 memset(&rs, 0, sizeof(rs));
585 rs.flg = re->flg | flg;
586 rs.o = s;
587 while (*s) {
588 rs.s = s;
589 s += uc_len(s);
590 if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
591 return 0;
593 return 1;
596 int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
598 return 0;