7 #define NGRPS 64 /* maximum number of groups */
8 #define NREPS 128 /* maximum repetitions */
9 #define NDEPT 256 /* re_rec() recursion depth limit */
11 #define MAX(a, b) ((a) < (b) ? (b) : (a))
12 #define LEN(a) (sizeof(a) / sizeof((a)[0]))
14 /* regular expressions atoms */
15 #define RA_CHR '\0' /* character literal */
16 #define RA_BEG '^' /* string start */
17 #define RA_END '$' /* string end */
18 #define RA_ANY '.' /* any character */
19 #define RA_BRK '[' /* bracket expression */
20 #define RA_WBEG '<' /* word start */
21 #define RA_WEND '>' /* word end */
23 /* regular expression node types */
24 #define RN_ATOM '\0' /* regular expression */
25 #define RN_CAT 'c' /* concatenation */
26 #define RN_ALT '|' /* alternate expressions */
27 #define RN_GRP '(' /* pattern group */
29 /* regular expression program instructions */
30 #define RI_ATOM '\0' /* regular expression */
31 #define RI_FORK 'f' /* fork the execution */
32 #define RI_JUMP 'j' /* jump to the given instruction */
33 #define RI_MARK 'm' /* mark the current position */
34 #define RI_MATCH 'q' /* the pattern is matched */
36 /* regular expression atom */
38 int ra
; /* atom type (RA_*) */
39 char *s
; /* atom argument */
42 /* regular expression instruction */
44 struct ratom ra
; /* regular expression atom (RI_ATOM) */
45 int ri
; /* instruction type (RI_*) */
46 int a1
, a2
; /* destination of RI_FORK and RI_JUMP */
47 int mark
; /* mark (RI_MARK) */
50 /* regular expression program */
52 struct rinst
*p
; /* the program */
53 int n
; /* number of instructions */
54 int flg
; /* regcomp() flags */
57 /* regular expression matching state */
59 char *s
; /* the current position in the string */
60 char *o
; /* the beginning of the string */
61 int mark
[NGRPS
* 2]; /* marks for RI_MARK */
62 int pc
; /* program counter */
63 int flg
; /* flags passed to regcomp() and regexec() */
64 int dep
; /* re_rec() depth */
67 /* regular expression tree; used for parsing */
69 struct ratom ra
; /* regular expression atom (RN_ATOM) */
70 struct rnode
*c1
, *c2
; /* children */
71 int mincnt
, maxcnt
; /* number of repetitions */
72 int grp
; /* group number */
73 int rn
; /* node type (RN_*) */
76 static struct rnode
*rnode_make(int rn
, struct rnode
*c1
, struct rnode
*c2
)
78 struct rnode
*rnode
= malloc(sizeof(*rnode
));
79 memset(rnode
, 0, sizeof(*rnode
));
88 static void rnode_free(struct rnode
*rnode
)
91 rnode_free(rnode
->c1
);
93 rnode_free(rnode
->c2
);
98 static int uc_len(char *s
)
100 int c
= (unsigned char) s
[0];
101 if (~c
& 0xc0) /* ASCII or invalid */
112 static int uc_dec(char *s
)
114 int c
= (unsigned char) s
[0];
115 if (~c
& 0xc0) /* ASCII or invalid */
118 return ((c
& 0x1f) << 6) | (s
[1] & 0x3f);
120 return ((c
& 0x0f) << 12) | ((s
[1] & 0x3f) << 6) | (s
[2] & 0x3f);
122 return ((c
& 0x07) << 18) | ((s
[1] & 0x3f) << 12) | ((s
[2] & 0x3f) << 6) | (s
[3] & 0x3f);
126 static void ratom_copy(struct ratom
*dst
, struct ratom
*src
)
131 int len
= strlen(src
->s
);
132 dst
->s
= malloc(len
+ 1);
133 memcpy(dst
->s
, src
->s
, len
+ 1);
137 static int brk_len(char *s
)
140 if (s
[n
] == '^') /* exclusion mark */
142 if (s
[n
] == ']') /* handling []a] */
144 while (s
[n
] && s
[n
] != ']') {
145 if (s
[n
] == '[' && (s
[n
+ 1] == ':' || s
[n
+ 1] == '='))
146 while (s
[n
] && s
[n
] != ']')
151 return s
[n
] == ']' ? n
+ 1 : n
;
154 static void ratom_readbrk(struct ratom
*ra
, char **pat
)
156 int len
= brk_len(*pat
);
158 ra
->s
= malloc(len
+ 1);
159 memcpy(ra
->s
, *pat
, len
);
164 static void ratom_read(struct ratom
*ra
, char **pat
)
168 switch ((unsigned char) **pat
) {
182 ratom_readbrk(ra
, pat
);
185 if ((*pat
)[1] == '<' || (*pat
)[1] == '>') {
186 ra
->ra
= (*pat
)[1] == '<' ? RA_WBEG
: RA_WEND
;
194 while ((s
== *pat
) || !strchr(".^$[(|)*?+{\\", (unsigned char) s
[0])) {
196 if (s
!= *pat
&& s
[l
] != '\0' && strchr("*?+{", (unsigned char) s
[l
]))
201 ra
->s
= malloc(len
+ 1);
202 memcpy(ra
->s
, *pat
, len
);
208 static char *uc_beg(char *beg
, char *s
)
210 while (s
> beg
&& (((unsigned char) *s
) & 0xc0) == 0x80)
215 static int isword(char *s
)
217 int c
= (unsigned char) s
[0];
218 return isalnum(c
) || c
== '_' || c
> 127;
221 static char *brk_classes
[][2] = {
222 {":alnum:", "a-zA-Z0-9"},
223 {":alpha:", "a-zA-Z"},
227 {":print:", "\x20-\x7e"},
228 {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
229 {":space:", " \t\r\n\v\f"},
231 {":word:", "a-zA-Z0-9_"},
232 {":xdigit:", "a-fA-F0-9"},
235 static int brk_match(char *brk
, int c
, int flg
)
239 int not = brk
[0] == '^';
240 char *p
= not ? brk
+ 1 : brk
;
242 if (flg
& REG_ICASE
&& c
< 128 && isupper(c
))
244 while (*p
&& (p
== p0
|| *p
!= ']')) {
245 if (p
[0] == '[' && p
[1] == ':') {
246 for (i
= 0; i
< LEN(brk_classes
); i
++) {
247 char *cc
= brk_classes
[i
][0];
248 char *cp
= brk_classes
[i
][1];
249 if (!strncmp(cc
, p
+ 1, strlen(cc
)))
250 if (!brk_match(cp
, c
, flg
))
259 if (p
[0] == '-' && p
[1] && p
[1] != ']') {
264 if (flg
& REG_ICASE
&& beg
< 128 && isupper(beg
))
266 if (flg
& REG_ICASE
&& end
< 128 && isupper(end
))
268 if (c
>= beg
&& c
<= end
)
274 static int ratom_match(struct ratom
*ra
, struct rstate
*rs
)
276 if (ra
->ra
== RA_CHR
&& !(rs
->flg
& REG_ICASE
)) {
279 while (*s
&& *s
== *r
)
286 if (ra
->ra
== RA_CHR
) {
289 int c1
= uc_dec(ra
->s
+ pos
);
290 int c2
= uc_dec(rs
->s
+ pos
);
291 if (rs
->flg
& REG_ICASE
&& c1
< 128 && isupper(c1
))
293 if (rs
->flg
& REG_ICASE
&& c2
< 128 && isupper(c2
))
297 pos
+= uc_len(ra
->s
+ pos
);
302 if (ra
->ra
== RA_ANY
) {
303 if (!rs
->s
[0] || (rs
->s
[0] == '\n' && !!(rs
->flg
& REG_NEWLINE
)))
305 rs
->s
+= uc_len(rs
->s
);
308 if (ra
->ra
== RA_BRK
) {
309 int c
= uc_dec(rs
->s
);
310 if (!c
|| (c
== '\n' && !!(rs
->flg
& REG_NEWLINE
) && ra
->s
[1] == '^'))
312 rs
->s
+= uc_len(rs
->s
);
313 return brk_match(ra
->s
+ 1, c
, rs
->flg
);
315 if (ra
->ra
== RA_BEG
&& rs
->s
== rs
->o
)
316 return !!(rs
->flg
& REG_NOTBOL
);
317 if (ra
->ra
== RA_BEG
&& rs
->s
> rs
->o
&& rs
->s
[-1] == '\n')
318 return !(rs
->flg
& REG_NEWLINE
);
319 if (ra
->ra
== RA_END
&& rs
->s
[0] == '\0')
320 return !!(rs
->flg
& REG_NOTEOL
);
321 if (ra
->ra
== RA_END
&& rs
->s
[0] == '\n')
322 return !(rs
->flg
& REG_NEWLINE
);
323 if (ra
->ra
== RA_WBEG
)
324 return !((rs
->s
== rs
->o
|| !isword(uc_beg(rs
->o
, rs
->s
- 1))) &&
326 if (ra
->ra
== RA_WEND
)
327 return !(rs
->s
!= rs
->o
&& isword(uc_beg(rs
->o
, rs
->s
- 1)) &&
328 (!rs
->s
[0] || !isword(rs
->s
)));
332 static struct rnode
*rnode_parse(char **pat
);
334 static struct rnode
*rnode_grp(char **pat
)
336 struct rnode
*rnode
= NULL
;
337 if ((*pat
)[0] != '(')
340 if ((*pat
)[0] != ')') {
341 rnode
= rnode_parse(pat
);
345 if ((*pat
)[0] != ')') {
350 return rnode_make(RN_GRP
, rnode
, NULL
);
353 static struct rnode
*rnode_atom(char **pat
)
358 if ((*pat
)[0] == '|' || (*pat
)[0] == ')')
360 if ((*pat
)[0] == '(') {
361 rnode
= rnode_grp(pat
);
363 rnode
= rnode_make(RN_ATOM
, NULL
, NULL
);
364 ratom_read(&rnode
->ra
, pat
);
368 if ((*pat
)[0] == '*' || (*pat
)[0] == '?') {
370 rnode
->maxcnt
= (*pat
)[0] == '*' ? -1 : 1;
373 if ((*pat
)[0] == '+') {
378 if ((*pat
)[0] == '{') {
382 while (isdigit((unsigned char) **pat
))
383 rnode
->mincnt
= rnode
->mincnt
* 10 + *(*pat
)++ - '0';
386 if ((*pat
)[0] == '}')
388 while (isdigit((unsigned char) **pat
))
389 rnode
->maxcnt
= rnode
->maxcnt
* 10 + *(*pat
)++ - '0';
391 rnode
->maxcnt
= rnode
->mincnt
;
394 if (rnode
->mincnt
> NREPS
|| rnode
->maxcnt
> NREPS
) {
402 static struct rnode
*rnode_seq(char **pat
)
404 struct rnode
*c1
= rnode_atom(pat
);
409 return c2
? rnode_make(RN_CAT
, c1
, c2
) : c1
;
412 static struct rnode
*rnode_parse(char **pat
)
414 struct rnode
*c1
= rnode_seq(pat
);
416 if ((*pat
)[0] != '|')
419 c2
= rnode_parse(pat
);
420 return c2
? rnode_make(RN_ALT
, c1
, c2
) : c1
;
423 static int rnode_count(struct rnode
*rnode
)
428 if (rnode
->rn
== RN_CAT
)
429 n
= rnode_count(rnode
->c1
) + rnode_count(rnode
->c2
);
430 if (rnode
->rn
== RN_ALT
)
431 n
= rnode_count(rnode
->c1
) + rnode_count(rnode
->c2
) + 2;
432 if (rnode
->rn
== RN_GRP
)
433 n
= rnode_count(rnode
->c1
) + 2;
434 if (rnode
->mincnt
== 0 && rnode
->maxcnt
== 0)
436 if (rnode
->mincnt
== 1 && rnode
->maxcnt
== 1)
438 if (rnode
->maxcnt
< 0) {
439 n
= (rnode
->mincnt
+ 1) * n
+ 1;
441 n
= (rnode
->mincnt
+ rnode
->maxcnt
) * n
+
442 rnode
->maxcnt
- rnode
->mincnt
;
449 static int rnode_grpnum(struct rnode
*rnode
, int num
)
454 if (rnode
->rn
== RN_GRP
)
455 rnode
->grp
= num
+ cur
++;
456 cur
+= rnode_grpnum(rnode
->c1
, num
+ cur
);
457 cur
+= rnode_grpnum(rnode
->c2
, num
+ cur
);
461 static int re_insert(struct regex
*p
, int ri
)
463 p
->p
[p
->n
++].ri
= ri
;
467 static void rnode_emit(struct rnode
*n
, struct regex
*p
);
469 static void rnode_emitnorep(struct rnode
*n
, struct regex
*p
)
471 int fork
, done
, mark
;
472 if (n
->rn
== RN_ALT
) {
473 fork
= re_insert(p
, RI_FORK
);
474 p
->p
[fork
].a1
= p
->n
;
475 rnode_emit(n
->c1
, p
);
476 done
= re_insert(p
, RI_JUMP
);
477 p
->p
[fork
].a2
= p
->n
;
478 rnode_emit(n
->c2
, p
);
479 p
->p
[done
].a1
= p
->n
;
481 if (n
->rn
== RN_CAT
) {
482 rnode_emit(n
->c1
, p
);
483 rnode_emit(n
->c2
, p
);
485 if (n
->rn
== RN_GRP
) {
486 mark
= re_insert(p
, RI_MARK
);
487 p
->p
[mark
].mark
= 2 * n
->grp
;
488 rnode_emit(n
->c1
, p
);
489 mark
= re_insert(p
, RI_MARK
);
490 p
->p
[mark
].mark
= 2 * n
->grp
+ 1;
492 if (n
->rn
== RN_ATOM
) {
493 int atom
= re_insert(p
, RI_ATOM
);
494 ratom_copy(&p
->p
[atom
].ra
, &n
->ra
);
498 static void rnode_emit(struct rnode
*n
, struct regex
*p
)
506 if (n
->mincnt
== 0 && n
->maxcnt
== 0)
508 if (n
->mincnt
== 1 && n
->maxcnt
== 1) {
509 rnode_emitnorep(n
, p
);
512 if (n
->mincnt
== 0) {
513 int fork
= re_insert(p
, RI_FORK
);
514 p
->p
[fork
].a1
= p
->n
;
515 jmpend
[jmpend_cnt
++] = fork
;
517 for (i
= 0; i
< MAX(1, n
->mincnt
); i
++) {
519 rnode_emitnorep(n
, p
);
523 fork
= re_insert(p
, RI_FORK
);
524 p
->p
[fork
].a1
= last
;
525 p
->p
[fork
].a2
= p
->n
;
527 for (i
= MAX(1, n
->mincnt
); i
< n
->maxcnt
; i
++) {
528 int fork
= re_insert(p
, RI_FORK
);
529 p
->p
[fork
].a1
= p
->n
;
530 jmpend
[jmpend_cnt
++] = fork
;
531 rnode_emitnorep(n
, p
);
533 for (i
= 0; i
< jmpend_cnt
; i
++)
534 p
->p
[jmpend
[i
]].a2
= p
->n
;
537 int regcomp(regex_t
*preg
, char *pat
, int flg
)
539 struct rnode
*rnode
= rnode_parse(&pat
);
541 int n
= rnode_count(rnode
) + 3;
545 rnode_grpnum(rnode
, 1);
546 re
= malloc(sizeof(*re
));
547 memset(re
, 0, sizeof(*re
));
548 re
->p
= malloc(n
* sizeof(re
->p
[0]));
549 memset(re
->p
, 0, n
* sizeof(re
->p
[0]));
550 mark
= re_insert(re
, RI_MARK
);
551 re
->p
[mark
].mark
= 0;
552 rnode_emit(rnode
, re
);
553 mark
= re_insert(re
, RI_MARK
);
554 re
->p
[mark
].mark
= 1;
555 mark
= re_insert(re
, RI_MATCH
);
562 void regfree(regex_t
*preg
)
564 struct regex
*re
= *preg
;
566 for (i
= 0; i
< re
->n
; i
++)
567 if (re
->p
[i
].ri
== RI_ATOM
)
573 static int re_rec(struct regex
*re
, struct rstate
*rs
)
575 struct rinst
*ri
= NULL
;
576 if (rs
->dep
>= NDEPT
)
581 if (ri
->ri
== RI_ATOM
) {
582 if (ratom_match(&ri
->ra
, rs
))
587 if (ri
->ri
== RI_MARK
) {
588 if (ri
->mark
< NGRPS
)
589 rs
->mark
[ri
->mark
] = rs
->s
- rs
->o
;
593 if (ri
->ri
== RI_JUMP
) {
597 if (ri
->ri
== RI_FORK
) {
598 struct rstate base
= *rs
;
609 return ri
->ri
!= RI_MATCH
;
612 static int re_recmatch(struct regex
*re
, struct rstate
*rs
, int nsub
, regmatch_t
*psub
)
617 for (i
= 0; i
< LEN(rs
->mark
) && i
< nsub
* 2; i
++)
619 if (!re_rec(re
, rs
)) {
620 for (i
= 0; i
< nsub
; i
++) {
621 psub
[i
].rm_so
= i
* 2 < LEN(rs
->mark
) ? rs
->mark
[i
* 2] : -1;
622 psub
[i
].rm_eo
= i
* 2 < LEN(rs
->mark
) ? rs
->mark
[i
* 2 + 1] : -1;
629 int regexec(regex_t
*preg
, char *s
, int nsub
, regmatch_t psub
[], int flg
)
631 struct regex
*re
= *preg
;
633 memset(&rs
, 0, sizeof(rs
));
634 rs
.flg
= re
->flg
| flg
;
639 if (!re_recmatch(re
, &rs
, flg
& REG_NOSUB
? 0 : nsub
, psub
))
645 int regerror(int errcode
, regex_t
*preg
, char *errbuf
, int errbuf_size
)