10 #define MAX(a, b) ((a) < (b) ? (b) : (a))
11 #define LEN(a) (sizeof(a) / sizeof((a)[0]))
13 /* regular expressions atoms */
14 #define RA_CHR '\0' /* character literal */
15 #define RA_BEG '^' /* string start */
16 #define RA_END '$' /* string end */
17 #define RA_ANY '.' /* any character */
18 #define RA_BRK '[' /* bracket expression */
19 #define RA_WBEG '<' /* word start */
20 #define RA_WEND '>' /* word end */
22 /* regular expression node types */
23 #define RN_ATOM '\0' /* regular expression */
24 #define RN_CAT 'c' /* concatenation */
25 #define RN_ALT '|' /* alternate expressions */
26 #define RN_GRP '(' /* pattern group */
28 /* regular expression program instructions */
29 #define RI_ATOM '\0' /* regular expression */
30 #define RI_FORK 'f' /* fork the execution */
31 #define RI_JUMP 'j' /* jump to the given instruction */
32 #define RI_MARK 'm' /* mark the current position */
33 #define RI_MATCH 'q' /* the pattern is matched */
35 /* regular expression atom */
37 int ra
; /* atom type (RA_*) */
38 char *s
; /* atom argument */
41 /* regular expression instruction */
43 int ri
; /* instruction type (RI_*) */
44 struct ratom ra
; /* regular expression atom (RI_ATOM) */
45 int a1
, a2
; /* destination of RE_FORK and RI_JUMP */
46 int mark
; /* mark (RI_MARK) */
49 /* regular expression program */
51 struct rinst
*p
; /* the program */
52 int n
; /* number of instructions */
53 int grpcnt
; /* number of groups */
54 int flg
; /* regcomp() flags */
57 /* regular expression matching state */
59 int mark
[NGRPS
* 2]; /* marks for RI_MARK */
60 int pc
; /* program counter */
61 char *s
; /* the current position in the string */
62 char *o
; /* the beginning of the string */
63 int flg
; /* flags passed to regcomp() and regexec() */
66 /* regular expression tree; used for parsing */
68 int rn
; /* node type (RN_*) */
69 struct ratom ra
; /* regular expression atom (RN_ATOM) */
70 int mincnt
, maxcnt
; /* number of repetitions */
71 struct rnode
*c1
, *c2
; /* children */
74 static struct rnode
*rnode_make(int rn
, struct rnode
*c1
, struct rnode
*c2
)
76 struct rnode
*rnode
= malloc(sizeof(*rnode
));
77 memset(rnode
, 0, sizeof(*rnode
));
86 static void rnode_free(struct rnode
*rnode
)
89 rnode_free(rnode
->c1
);
91 rnode_free(rnode
->c2
);
96 static int uc_len(char *s
)
98 int c
= (unsigned char) s
[0];
99 if (c
> 0 && c
<= 0x7f)
114 static int uc_dec(char *s
)
119 return (unsigned char) *s
;
120 result
= (0x3f >> --l
) & (unsigned char) *s
++;
122 result
= (result
<< 6) | ((unsigned char) *s
++ & 0x3f);
126 static void ratom_copy(struct ratom
*dst
, struct ratom
*src
)
131 int len
= strlen(src
->s
);
132 dst
->s
= malloc(len
+ 1);
133 memcpy(dst
->s
, src
->s
, len
+ 1);
137 static void ratom_readbrk(struct ratom
*ra
, char **pat
)
142 if (**pat
== '^') /* exclusion mark */
144 if (**pat
) /* handling []a] */
146 while (**pat
&& *(*pat
)++ != ']')
148 len
= *pat
- beg
- 1;
150 ra
->s
= malloc(len
+ 1);
151 memcpy(ra
->s
, beg
, len
);
156 static void ratom_read(struct ratom
*ra
, char **pat
)
159 switch ((unsigned char) **pat
) {
173 ratom_readbrk(ra
, pat
);
176 if ((*pat
)[1] == '<' || (*pat
)[1] == '>') {
177 ra
->ra
= (*pat
)[1] == '<' ? RA_WBEG
: RA_WEND
;
186 memcpy(ra
->s
, *pat
, len
);
192 static int isword(int c
)
194 return isalpha(c
) || isdigit(c
) || c
== '_';
197 static char *charclasses
[][2] = {
198 {":alnum:", "a-zA-Z0-9"},
199 {":alpha:", "a-zA-Z"},
203 {":print:", "\x20-\x7e"},
204 {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
205 {":space:", " \t\r\n\v\f"},
207 {":word:", "a-zA-Z0-9_"},
208 {":xdigit:", "a-fA-F0-9"},
211 static int ratom_match(struct ratom
*ra
, struct rstate
*rs
)
213 typedef unsigned char uc_t
;
215 if (ra
->ra
== RA_CHR
) {
216 int c1
= uc_dec(ra
->s
);
217 int c2
= uc_dec(rs
->s
);
218 if (rs
->flg
& REG_ICASE
&& c1
< 128 && isupper(c1
))
220 if (rs
->flg
& REG_ICASE
&& c2
< 128 && isupper(c2
))
224 rs
->s
+= uc_len(ra
->s
);
227 if (ra
->ra
== RA_BRK
) {
228 int not = ra
->s
[0] == '^';
229 char *p
= not ? ra
->s
+ 1 : ra
->s
;
235 for (i
= 0; i
< LEN(charclasses
); i
++)
236 if (!strcmp(charclasses
[i
][0], p
))
237 p
= charclasses
[i
][1];
239 rs
->s
+= uc_len(rs
->s
);
240 if (rs
->flg
& REG_ICASE
&& c
< 128 && isupper(c
))
246 if (p
[0] == '-' && p
[1]) {
251 if (rs
->flg
& REG_ICASE
&& beg
< 128 && isupper(beg
))
253 if (rs
->flg
& REG_ICASE
&& end
< 128 && isupper(end
))
255 if (c
>= beg
&& c
<= end
)
260 if (ra
->ra
== RA_ANY
) {
263 rs
->s
+= uc_len(rs
->s
);
266 if (ra
->ra
== RA_BEG
&& !(rs
->flg
& REG_NOTBOL
))
267 return rs
->s
!= rs
->o
;
268 if (ra
->ra
== RA_END
&& !(rs
->flg
& REG_NOTEOL
))
269 return rs
->s
[0] != '\0';
270 if (ra
->ra
== RA_WBEG
)
271 return !((rs
->s
== rs
->o
|| !isword((uc_t
) rs
->s
[-1])) &&
272 isword((uc_t
) rs
->s
[0]));
273 if (ra
->ra
== RA_WEND
)
274 return !(rs
->s
!= rs
->o
&& isword((uc_t
) rs
->s
[-1]) &&
275 (!rs
->s
[0] || !isword((uc_t
) rs
->s
[0])));
279 static struct rnode
*rnode_parse(char **pat
);
281 static struct rnode
*rnode_grp(char **pat
)
284 if ((*pat
)[0] != '(')
287 rnode
= rnode_parse(pat
);
288 if ((*pat
)[0] != ')') {
293 return rnode_make(RN_GRP
, rnode
, NULL
);
296 static struct rnode
*rnode_atom(char **pat
)
301 if ((*pat
)[0] == '|' || (*pat
)[0] == ')')
303 if ((*pat
)[0] == '(') {
304 rnode
= rnode_grp(pat
);
306 rnode
= rnode_make(RN_ATOM
, NULL
, NULL
);
307 ratom_read(&rnode
->ra
, pat
);
309 if ((*pat
)[0] == '*' || (*pat
)[0] == '?') {
311 rnode
->maxcnt
= (*pat
)[0] == '*' ? -1 : 1;
314 if ((*pat
)[0] == '+') {
319 if ((*pat
)[0] == '{') {
323 while (isdigit((unsigned char) **pat
))
324 rnode
->mincnt
= rnode
->mincnt
* 10 + *(*pat
)++ - '0';
327 if ((*pat
)[0] == '}')
329 while (isdigit((unsigned char) **pat
))
330 rnode
->maxcnt
= rnode
->maxcnt
* 10 + *(*pat
)++ - '0';
332 rnode
->maxcnt
= rnode
->mincnt
;
335 if (rnode
->mincnt
> NREPS
|| rnode
->maxcnt
> NREPS
) {
343 static struct rnode
*rnode_seq(char **pat
)
345 struct rnode
*c1
= rnode_atom(pat
);
350 return c2
? rnode_make(RN_CAT
, c1
, c2
) : c1
;
353 static struct rnode
*rnode_parse(char **pat
)
355 struct rnode
*c1
= rnode_seq(pat
);
357 if ((*pat
)[0] != '|')
360 c2
= rnode_parse(pat
);
361 return c2
? rnode_make(RN_ALT
, c1
, c2
) : c1
;
364 static int rnode_count(struct rnode
*rnode
)
369 if (rnode
->rn
== RN_CAT
)
370 n
= rnode_count(rnode
->c1
) + rnode_count(rnode
->c2
);
371 if (rnode
->rn
== RN_ALT
)
372 n
= rnode_count(rnode
->c1
) + rnode_count(rnode
->c2
) + 2;
373 if (rnode
->rn
== RN_GRP
)
374 n
= rnode_count(rnode
->c1
) + 2;
375 if (rnode
->mincnt
== 0 && rnode
->maxcnt
== 0)
377 if (rnode
->mincnt
== 1 && rnode
->maxcnt
== 1)
379 if (rnode
->maxcnt
< 0) {
380 n
= (rnode
->mincnt
+ 1) * n
+ 1;
382 n
= (rnode
->mincnt
+ rnode
->maxcnt
) * n
+
383 rnode
->maxcnt
- rnode
->mincnt
;
390 static int re_insert(struct regex
*p
, int ri
)
392 p
->p
[p
->n
++].ri
= ri
;
396 static void rnode_emit(struct rnode
*n
, struct regex
*p
);
398 static void rnode_emitnorep(struct rnode
*n
, struct regex
*p
)
400 int fork
, done
, mark
;
401 if (n
->rn
== RN_ALT
) {
402 fork
= re_insert(p
, RI_FORK
);
403 p
->p
[fork
].a1
= p
->n
;
404 rnode_emit(n
->c1
, p
);
405 done
= re_insert(p
, RI_JUMP
);
406 p
->p
[fork
].a2
= p
->n
;
407 rnode_emit(n
->c2
, p
);
408 p
->p
[done
].a1
= p
->n
;
410 if (n
->rn
== RN_CAT
) {
411 rnode_emit(n
->c1
, p
);
412 rnode_emit(n
->c2
, p
);
414 if (n
->rn
== RN_GRP
) {
415 int grp
= p
->grpcnt
++ + 1;
416 mark
= re_insert(p
, RI_MARK
);
417 p
->p
[mark
].mark
= 2 * grp
;
418 rnode_emit(n
->c1
, p
);
419 mark
= re_insert(p
, RI_MARK
);
420 p
->p
[mark
].mark
= 2 * grp
+ 1;
422 if (n
->rn
== RN_ATOM
) {
423 int atom
= re_insert(p
, RI_ATOM
);
424 ratom_copy(&p
->p
[atom
].ra
, &n
->ra
);
428 static void rnode_emit(struct rnode
*n
, struct regex
*p
)
436 if (n
->mincnt
== 0 && n
->maxcnt
== 0)
438 if (n
->mincnt
== 1 && n
->maxcnt
== 1) {
439 rnode_emitnorep(n
, p
);
442 if (n
->mincnt
== 0) {
443 int fork
= re_insert(p
, RI_FORK
);
444 p
->p
[fork
].a1
= p
->n
;
445 jmpend
[jmpend_cnt
++] = fork
;
447 for (i
= 0; i
< MAX(1, n
->mincnt
); i
++) {
449 rnode_emitnorep(n
, p
);
453 fork
= re_insert(p
, RI_FORK
);
454 p
->p
[fork
].a1
= last
;
455 p
->p
[fork
].a2
= p
->n
;
457 for (i
= MAX(1, n
->mincnt
); i
< n
->maxcnt
; i
++) {
458 int fork
= re_insert(p
, RI_FORK
);
459 p
->p
[fork
].a1
= p
->n
;
460 jmpend
[jmpend_cnt
++] = fork
;
461 rnode_emitnorep(n
, p
);
463 for (i
= 0; i
< jmpend_cnt
; i
++)
464 p
->p
[jmpend
[i
]].a2
= p
->n
;
467 int regcomp(regex_t
*preg
, char *pat
, int flg
)
469 struct rnode
*rnode
= rnode_parse(&pat
);
470 struct regex
*re
= malloc(sizeof(*re
));
471 int n
= rnode_count(rnode
) + 3;
473 memset(re
, 0, sizeof(*re
));
474 re
->p
= malloc(n
* sizeof(re
->p
[0]));
475 memset(re
->p
, 0, n
* sizeof(re
->p
[0]));
476 mark
= re_insert(re
, RI_MARK
);
477 re
->p
[mark
].mark
= 0;
478 rnode_emit(rnode
, re
);
479 mark
= re_insert(re
, RI_MARK
);
480 re
->p
[mark
].mark
= 1;
481 mark
= re_insert(re
, RI_MATCH
);
488 void regfree(regex_t
*preg
)
490 struct regex
*re
= *preg
;
492 for (i
= 0; i
< re
->n
; i
++)
493 if (re
->p
[i
].ri
== RI_ATOM
)
499 static int re_rec(struct regex
*re
, struct rstate
*rs
)
501 struct rinst
*ri
= &re
->p
[rs
->pc
];
502 if (ri
->ri
== RI_ATOM
) {
503 if (ratom_match(&ri
->ra
, rs
))
506 return re_rec(re
, rs
);
508 if (ri
->ri
== RI_MARK
) {
509 if (ri
->mark
< NGRPS
)
510 rs
->mark
[ri
->mark
] = rs
->s
- rs
->o
;
512 return re_rec(re
, rs
);
514 if (ri
->ri
== RI_JUMP
) {
516 return re_rec(re
, rs
);
518 if (ri
->ri
== RI_FORK
) {
519 struct rstate base
= *rs
;
525 return re_rec(re
, rs
);
527 if (ri
->ri
== RI_MATCH
)
532 static int re_recmatch(struct regex
*re
, struct rstate
*rs
, int nsub
, regmatch_t
*psub
)
536 for (i
= 0; i
< LEN(rs
->mark
); i
++)
538 if (!re_rec(re
, rs
)) {
539 for (i
= 0; i
< nsub
; i
++) {
540 psub
[i
].rm_so
= i
* 2 < LEN(rs
->mark
) ? rs
->mark
[i
* 2] : -1;
541 psub
[i
].rm_eo
= i
* 2 < LEN(rs
->mark
) ? rs
->mark
[i
* 2 + 1] : -1;
548 int regexec(regex_t
*preg
, char *s
, int nsub
, regmatch_t psub
[], int flg
)
550 struct regex
*re
= *preg
;
552 memset(&rs
, 0, sizeof(rs
));
553 rs
.flg
= re
->flg
| flg
;
557 if (!re_recmatch(re
, &rs
, flg
& REG_NOSUB
? 0 : nsub
, psub
))
563 int regerror(int errcode
, regex_t
*preg
, char *errbuf
, int errbuf_size
)