From e98ea60574989116f4dce8fb62421eadcc101a59 Mon Sep 17 00:00:00 2001 From: Ali Gholami Rudi Date: Sat, 16 May 2015 13:57:23 +0430 Subject: [PATCH] regex: partial regex.h implementation Only extended regular expressions are supported and error handling is missing. --- regex.c | 561 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ regex.h | 18 +++ 2 files changed, 579 insertions(+) create mode 100644 regex.c create mode 100644 regex.h diff --git a/regex.c b/regex.c new file mode 100644 index 0000000..12b8a2a --- /dev/null +++ b/regex.c @@ -0,0 +1,561 @@ +#include +#include +#include +#include +#include "regex.h" + +#define NGRPS 32 +#define NREPS 64 + +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#define LEN(a) (sizeof(a) / sizeof((a)[0])) + +/* regular expressions atoms */ +#define RA_CHR '\0' /* character literal */ +#define RA_BEG '^' /* string start */ +#define RA_END '$' /* string end */ +#define RA_ANY '.' /* any character */ +#define RA_BRK '[' /* bracket expression */ + +/* regular expression node types */ +#define RN_ATOM '\0' /* regular expression */ +#define RN_CAT 'c' /* concatenation */ +#define RN_ALT '|' /* alternate expressions */ +#define RN_GRP '(' /* pattern group */ + +/* regular expression program instructions */ +#define RI_ATOM '\0' /* regular expression */ +#define RI_FORK 'f' /* fork the execution */ +#define RI_JUMP 'j' /* jump to the given instruction */ +#define RI_MARK 'm' /* mark the current position */ +#define RI_MATCH 'q' /* the pattern is matched */ + +/* regular expression atom */ +struct ratom { + int ra; /* atom type (RA_*) */ + char *s; /* atom argument */ +}; + +/* regular expression instruction */ +struct rinst { + int ri; /* instruction type (RI_*) */ + struct ratom ra; /* regular expression atom (RI_ATOM) */ + int a1, a2; /* destination of RE_FORK and RI_JUMP */ + int mark; /* mark (RI_MARK) */ +}; + +/* regular expression program */ +struct regex { + struct rinst *p; /* the program */ + int n; /* number of instructions */ + int grpcnt; /* number of groups */ + int flg; /* regcomp() flags */ +}; + +/* regular expression matching state */ +struct rstate { + int mark[NGRPS * 2]; /* marks for RI_MARK */ + int pc; /* program counter */ + char *s; /* the current position in the string */ + char *o; /* the beginning of the string */ + int flg; /* flags passed to regcomp() and regexec() */ +}; + +/* regular expression tree; used for parsing */ +struct rnode { + int rn; /* node type (RN_*) */ + struct ratom ra; /* regular expression atom (RN_ATOM) */ + int mincnt, maxcnt; /* number of repetitions */ + struct rnode *c1, *c2; /* children */ +}; + +static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2) +{ + struct rnode *rnode = malloc(sizeof(*rnode)); + memset(rnode, 0, sizeof(*rnode)); + rnode->rn = rn; + rnode->c1 = c1; + rnode->c2 = c2; + rnode->mincnt = 1; + rnode->maxcnt = 1; + return rnode; +} + +static void rnode_free(struct rnode *rnode) +{ + if (rnode->c1) + rnode_free(rnode->c1); + if (rnode->c2) + rnode_free(rnode->c2); + free(rnode->ra.s); + free(rnode); +} + +static int uc_len(char *s) +{ + int c = (unsigned char) s[0]; + if (c > 0 && c <= 0x7f) + return 1; + if (c >= 0xfc) + return 6; + if (c >= 0xf8) + return 5; + if (c >= 0xf0) + return 4; + if (c >= 0xe0) + return 3; + if (c >= 0xc0) + return 2; + return c != 0; +} + +static int uc_dec(char *s) +{ + int result; + int l = uc_len(s); + if (l <= 1) + return (unsigned char) *s; + result = (0x3f >> --l) & (unsigned char) *s++; + while (l--) + result = (result << 6) | ((unsigned char) *s++ & 0x3f); + return result; +} + +static void ratom_copy(struct ratom *dst, struct ratom *src) +{ + dst->ra = src->ra; + dst->s = NULL; + if (src->s) { + int len = strlen(src->s); + dst->s = malloc(len + 1); + memcpy(dst->s, src->s, len + 1); + } +} + +static void ratom_readbrk(struct ratom *ra, char **pat) +{ + char *beg = ++*pat; + int len; + ra->ra = RA_BRK; + if (**pat == '^') /* exclusion mark */ + (*pat)++; + if (**pat) /* handling []a] */ + (*pat)++; + while (**pat && *(*pat)++ != ']') + ; + len = *pat - beg - 1; + if (len >= 0) { + ra->s = malloc(len + 1); + memcpy(ra->s, beg, len); + ra->s[len] = '\0'; + } +} + +static void ratom_read(struct ratom *ra, char **pat) +{ + int len; + switch ((unsigned char) **pat) { + case '.': + ra->ra = RA_ANY; + (*pat)++; + break; + case '^': + ra->ra = RA_BEG; + (*pat)++; + break; + case '$': + ra->ra = RA_END; + (*pat)++; + break; + case '[': + ratom_readbrk(ra, pat); + break; + case '\\': + (*pat)++; + default: + ra->ra = RA_CHR; + len = uc_len(*pat); + ra->s = malloc(8); + memcpy(ra->s, *pat, len); + ra->s[len] = '\0'; + *pat += len; + } +} + +static int isword(int c) +{ + return isalpha(c) || isdigit(c) || c == '_'; +} + +static char *charclasses[][2] = { + {":alnum:", "a-zA-Z0-9"}, + {":alpha:", "a-zA-Z"}, + {":blank:", " \t"}, + {":digit:", "0-9"}, + {":lower:", "a-z"}, + {":print:", "\x20-\x7e"}, + {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"}, + {":space:", " \t\r\n\v\f"}, + {":upper:", "A-Z"}, + {":word:", "a-zA-Z0-9_"}, + {":xdigit:", "a-fA-F0-9"}, +}; + +static int ratom_match(struct ratom *ra, struct rstate *rs) +{ + int i; + if (ra->ra == RA_CHR) { + int c1 = uc_dec(ra->s); + int c2 = uc_dec(rs->s); + if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1)) + c1 = tolower(c1); + if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2)) + c2 = tolower(c2); + if (c1 != c2) + return 1; + rs->s += uc_len(ra->s); + return 0; + } + if (ra->ra == RA_BRK) { + typedef unsigned char uc_t; + int not = ra->s[0] == '^'; + char *p = not ? ra->s + 1 : ra->s; + int c, clen; + int beg, end; + if (!strcmp(":<:", ra->s)) { + return !((rs->s == rs->o || !isword((uc_t) rs->s[-1])) && + isword((uc_t) rs->s[0])); + } + if (!strcmp(":>:", ra->s)) { + return !(rs->s != rs->o && isword((uc_t) rs->s[-1]) && + (!rs->s[0] || !isword((uc_t) rs->s[0]))); + } + if (!rs->s[0]) + return 1; + if (p[0] == ':') + for (i = 0; i < LEN(charclasses); i++) + if (!strcmp(charclasses[i][0], p)) + p = charclasses[i][1]; + c = uc_dec(rs->s); + rs->s += uc_len(rs->s); + if (rs->flg & REG_ICASE && c < 128 && isupper(c)) + c = tolower(c); + while (*p) { + beg = uc_dec(p); + p += uc_len(p); + end = beg; + if (p[0] == '-' && p[1]) { + p++; + end = uc_dec(p); + p += uc_len(p); + } + if (rs->flg & REG_ICASE && beg < 128 && isupper(beg)) + beg = tolower(beg); + if (rs->flg & REG_ICASE && end < 128 && isupper(end)) + end = tolower(end); + if (c >= beg && c <= end) + return not; + } + return !not; + } + if (ra->ra == RA_ANY) { + if (!rs->s[0]) + return 1; + rs->s += uc_len(rs->s); + return 0; + } + if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL)) + return rs->s != rs->o; + if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL)) + return rs->s[0] != '\0'; + return 1; +} + +static struct rnode *rnode_parse(char **pat); + +static struct rnode *rnode_grp(char **pat) +{ + struct rnode *rnode; + if ((*pat)[0] != '(') + return NULL; + *pat += 1; + rnode = rnode_parse(pat); + if ((*pat)[0] != ')') { + rnode_free(rnode); + return NULL; + } + *pat += 1; + return rnode_make(RN_GRP, rnode, NULL); +} + +static struct rnode *rnode_atom(char **pat) +{ + struct rnode *rnode; + if (!**pat) + return NULL; + if ((*pat)[0] == '|' || (*pat)[0] == ')') + return NULL; + if ((*pat)[0] == '(') { + rnode = rnode_grp(pat); + } else { + rnode = rnode_make(RN_ATOM, NULL, NULL); + ratom_read(&rnode->ra, pat); + } + if ((*pat)[0] == '*' || (*pat)[0] == '?') { + rnode->mincnt = 0; + rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1; + (*pat)++; + } + if ((*pat)[0] == '+') { + rnode->mincnt = 1; + rnode->maxcnt = -1; + *pat += 1; + } + if ((*pat)[0] == '{') { + rnode->mincnt = 0; + rnode->maxcnt = 0; + *pat += 1; + while (isdigit((unsigned char) **pat)) + rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0'; + if (**pat == ',') { + (*pat)++; + if ((*pat)[0] == '}') + rnode->maxcnt = -1; + while (isdigit((unsigned char) **pat)) + rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0'; + } else { + rnode->maxcnt = rnode->mincnt; + } + *pat += 1; + if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) { + rnode_free(rnode); + return NULL; + } + } + return rnode; +} + +static struct rnode *rnode_seq(char **pat) +{ + struct rnode *c1 = rnode_atom(pat); + struct rnode *c2; + if (!c1) + return NULL; + c2 = rnode_seq(pat); + return c2 ? rnode_make(RN_CAT, c1, c2) : c1; +} + +static struct rnode *rnode_parse(char **pat) +{ + struct rnode *c1 = rnode_seq(pat); + struct rnode *c2; + if ((*pat)[0] != '|') + return c1; + *pat += 1; + c2 = rnode_parse(pat); + return c2 ? rnode_make(RN_ALT, c1, c2) : c1; +} + +static int rnode_count(struct rnode *rnode) +{ + int n = 1; + if (!rnode) + return 0; + if (rnode->rn == RN_CAT) + n = rnode_count(rnode->c1) + rnode_count(rnode->c2); + if (rnode->rn == RN_ALT) + n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2; + if (rnode->rn == RN_GRP) + n = rnode_count(rnode->c1) + 2; + if (rnode->mincnt == 0 && rnode->maxcnt == 0) + return 0; + if (rnode->mincnt == 1 && rnode->maxcnt == 1) + return n; + if (rnode->maxcnt < 0) { + n = (rnode->mincnt + 1) * n + 1; + } else { + n = (rnode->mincnt + rnode->maxcnt) * n + + rnode->maxcnt - rnode->mincnt; + } + if (!rnode->mincnt) + n++; + return n; +} + +static int re_insert(struct regex *p, int ri) +{ + p->p[p->n++].ri = ri; + return p->n - 1; +} + +static void rnode_emit(struct rnode *n, struct regex *p); + +static void rnode_emitnorep(struct rnode *n, struct regex *p) +{ + int fork, done, mark; + if (n->rn == RN_ALT) { + fork = re_insert(p, RI_FORK); + p->p[fork].a1 = p->n; + rnode_emit(n->c1, p); + done = re_insert(p, RI_JUMP); + p->p[fork].a2 = p->n; + rnode_emit(n->c2, p); + p->p[done].a1 = p->n; + } + if (n->rn == RN_CAT) { + rnode_emit(n->c1, p); + rnode_emit(n->c2, p); + } + if (n->rn == RN_GRP) { + int grp = p->grpcnt++ + 1; + mark = re_insert(p, RI_MARK); + p->p[mark].mark = 2 * grp; + rnode_emit(n->c1, p); + mark = re_insert(p, RI_MARK); + p->p[mark].mark = 2 * grp + 1; + } + if (n->rn == RN_ATOM) { + int atom = re_insert(p, RI_ATOM); + ratom_copy(&p->p[atom].ra, &n->ra); + } +} + +static void rnode_emit(struct rnode *n, struct regex *p) +{ + int last; + int jmpend[NREPS]; + int jmpend_cnt = 0; + int i; + if (!n) + return; + if (n->mincnt == 0 && n->maxcnt == 0) + return; + if (n->mincnt == 1 && n->maxcnt == 1) { + rnode_emitnorep(n, p); + return; + } + if (n->mincnt == 0) { + int fork = re_insert(p, RI_FORK); + p->p[fork].a1 = p->n; + jmpend[jmpend_cnt++] = fork; + } + for (i = 0; i < MAX(1, n->mincnt); i++) { + last = p->n; + rnode_emitnorep(n, p); + } + if (n->maxcnt < 0) { + int fork; + fork = re_insert(p, RI_FORK); + p->p[fork].a1 = last; + p->p[fork].a2 = p->n; + } + for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) { + int fork = re_insert(p, RI_FORK); + p->p[fork].a1 = p->n; + jmpend[jmpend_cnt++] = fork; + rnode_emitnorep(n, p); + } + for (i = 0; i < jmpend_cnt; i++) + p->p[jmpend[i]].a2 = p->n; +} + +int regcomp(regex_t *preg, char *pat, int flg) +{ + struct rnode *rnode = rnode_parse(&pat); + struct regex *re = malloc(sizeof(*re)); + int n = rnode_count(rnode) + 3; + int mark; + memset(re, 0, sizeof(*re)); + re->p = malloc(n * sizeof(re->p[0])); + memset(re->p, 0, n * sizeof(re->p[0])); + mark = re_insert(re, RI_MARK); + re->p[mark].mark = 0; + rnode_emit(rnode, re); + mark = re_insert(re, RI_MARK); + re->p[mark].mark = 1; + mark = re_insert(re, RI_MATCH); + rnode_free(rnode); + re->flg = flg; + *preg = re; + return 0; +} + +void regfree(regex_t *preg) +{ + struct regex *re = *preg; + int i; + for (i = 0; i < re->n; i++) + if (re->p[i].ri == RI_ATOM) + free(re->p[i].ra.s); + free(re->p); + free(re); +} + +static int re_rec(struct regex *re, struct rstate *rs) +{ + struct rinst *ri = &re->p[rs->pc]; + if (ri->ri == RI_ATOM) { + if (ratom_match(&ri->ra, rs)) + return 1; + rs->pc++; + return re_rec(re, rs); + } + if (ri->ri == RI_MARK) { + if (ri->mark < NGRPS) + rs->mark[ri->mark] = rs->s - rs->o; + rs->pc++; + return re_rec(re, rs); + } + if (ri->ri == RI_JUMP) { + rs->pc = ri->a1; + return re_rec(re, rs); + } + if (ri->ri == RI_FORK) { + struct rstate base = *rs; + rs->pc = ri->a1; + if (!re_rec(re, rs)) + return 0; + *rs = base; + rs->pc = ri->a2; + return re_rec(re, rs); + } + if (ri->ri == RI_MATCH) + return 0; + return 1; +} + +static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub) +{ + int i; + rs->pc = 0; + for (i = 0; i < LEN(rs->mark); i++) + rs->mark[i] = -1; + if (!re_rec(re, rs)) { + for (i = 0; i < nsub; i++) { + psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1; + psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1; + } + return 0; + } + return 1; +} + +int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg) +{ + struct regex *re = *preg; + struct rstate rs; + memset(&rs, 0, sizeof(rs)); + rs.flg = re->flg | flg; + rs.o = s; + while (*s) { + rs.s = s++; + if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub)) + return 0; + } + return 1; +} + +int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size) +{ + return 0; +} diff --git a/regex.h b/regex.h new file mode 100644 index 0000000..078f02a --- /dev/null +++ b/regex.h @@ -0,0 +1,18 @@ +#define REG_EXTENDED 0x01 +#define REG_NOSUB 0x02 +#define REG_ICASE 0x04 +#define REG_NEWLINE 0x08 +#define REG_NOTBOL 0x10 +#define REG_NOTEOL 0x20 + +typedef struct { + long rm_so; + long rm_eo; +} regmatch_t; + +typedef struct regex *regex_t; + +int regcomp(regex_t *preg, char *regex, int cflags); +int regexec(regex_t *preg, char *str, int nmatch, regmatch_t pmatch[], int eflags); +int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size); +void regfree(regex_t *preg); -- 2.11.4.GIT