cosmetics in jamrules; removed one warning
[k8lst.git] / src / primlib / relib / trex.c
blob4444da0837d9e7a701127cb9c60a8f12c3e7ebb6
1 /* see copyright notice in trex.h */
2 #include <string.h>
3 #include <stdlib.h>
4 #include <ctype.h>
5 #include <setjmp.h>
6 #include "trex.h"
8 #ifdef _UINCODE
9 # define scisprint iswprint
10 # define scstrlen wcslen
11 # define scprintf wprintf
12 # define _SC(x) L(x)
13 #else
14 # define scisprint isprint
15 # define scstrlen strlen
16 # define scprintf printf
17 # define _SC(x) (x)
18 #endif
21 #ifdef _DEBUG
22 #include <stdio.h>
23 static const TRexChar *g_nnames[] = {
24 _SC("NONE"), _SC("OP_GREEDY"), _SC("OP_OR"),
25 _SC("OP_EXPR"), _SC("OP_NOCAPEXPR"), _SC("OP_DOT"), _SC("OP_CLASS"),
26 _SC("OP_CCLASS"), _SC("OP_NCLASS"), _SC("OP_RANGE"), _SC("OP_CHAR"),
27 _SC("OP_EOL"), _SC("OP_BOL"), _SC("OP_WB")
29 #endif
32 #define OP_GREEDY (MAX_CHAR+1) // * + ? {n}
33 #define OP_OR (MAX_CHAR+2)
34 #define OP_EXPR (MAX_CHAR+3) //parentesis ()
35 #define OP_NOCAPEXPR (MAX_CHAR+4) //parentesis (?:)
36 #define OP_DOT (MAX_CHAR+5)
37 #define OP_CLASS (MAX_CHAR+6)
38 #define OP_CCLASS (MAX_CHAR+7)
39 #define OP_NCLASS (MAX_CHAR+8) //negates class the [^
40 #define OP_RANGE (MAX_CHAR+9)
41 #define OP_CHAR (MAX_CHAR+10)
42 #define OP_EOL (MAX_CHAR+11)
43 #define OP_BOL (MAX_CHAR+12)
44 #define OP_WB (MAX_CHAR+13)
46 #define TREX_SYMBOL_ANY_CHAR ('.')
47 #define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')
48 #define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')
49 #define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')
50 #define TREX_SYMBOL_BRANCH ('|')
51 #define TREX_SYMBOL_END_OF_STRING ('$')
52 #define TREX_SYMBOL_BEGINNING_OF_STRING ('^')
53 #define TREX_SYMBOL_ESCAPE_CHAR ('\\')
56 typedef int TRexNodeType;
58 typedef struct tagTRexNode {
59 TRexNodeType type;
60 int left;
61 int right;
62 int next;
63 } TRexNode;
65 struct TRex {
66 const TRexChar *_eol;
67 const TRexChar *_bol;
68 const TRexChar *_p;
69 int _first;
70 int _op;
71 TRexNode *_nodes;
72 int _nallocated;
73 int _nsize;
74 int _nsubexpr;
75 TRexMatch *_matches;
76 int _currsubexp;
77 void *_jmpbuf;
78 const TRexChar **_error;
82 static int trex_list (TRex *exp);
85 static int trex_newnode (TRex *exp, TRexNodeType type) {
86 TRexNode n;
87 int newid;
88 n.type = type;
89 n.next = n.right = n.left = -1;
90 if (type == OP_EXPR) n.right = exp->_nsubexpr++;
91 if(exp->_nallocated < (exp->_nsize+1)) {
92 exp->_nallocated *= 2;
93 exp->_nodes = (TRexNode *)realloc(exp->_nodes, exp->_nallocated*sizeof(TRexNode));
95 exp->_nodes[exp->_nsize++] = n;
96 newid = exp->_nsize-1;
97 return (int)newid;
101 static void trex_error (TRex *exp, const TRexChar *error) {
102 if (exp->_error) *exp->_error = error;
103 longjmp(*((jmp_buf*)exp->_jmpbuf), -1);
107 static void trex_expect (TRex *exp, int n){
108 if ((*exp->_p) != n) trex_error(exp, _SC("expected paren"));
109 exp->_p++;
113 static TRexChar trex_escapechar (TRex *exp) {
114 if (*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {
115 exp->_p++;
116 switch (*exp->_p) {
117 case 'v': exp->_p++; return '\v';
118 case 'n': exp->_p++; return '\n';
119 case 't': exp->_p++; return '\t';
120 case 'r': exp->_p++; return '\r';
121 case 'f': exp->_p++; return '\f';
122 default: return (*exp->_p++);
125 if (!scisprint(*exp->_p)) trex_error(exp, _SC("letter expected"));
126 return (*exp->_p++);
130 static int trex_charclass (TRex *exp, int classid) {
131 int n = trex_newnode(exp, OP_CCLASS);
132 exp->_nodes[n].left = classid;
133 return n;
137 static int trex_charnode (TRex *exp, TRexBool isclass) {
138 TRexChar t;
139 if (*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {
140 exp->_p++;
141 switch(*exp->_p) {
142 case 'n': exp->_p++; return trex_newnode(exp, '\n');
143 case 't': exp->_p++; return trex_newnode(exp, '\t');
144 case 'r': exp->_p++; return trex_newnode(exp, '\r');
145 case 'f': exp->_p++; return trex_newnode(exp, '\f');
146 case 'v': exp->_p++; return trex_newnode(exp, '\v');
147 case 'a': case 'A': case 'w': case 'W': case 's': case 'S':
148 case 'd': case 'D': case 'x': case 'X': case 'c': case 'C':
149 case 'p': case 'P': case 'l': case 'u': {
150 t = *exp->_p; exp->_p++;
151 return trex_charclass(exp, t); }
152 case 'b':
153 case 'B':
154 if (!isclass) {
155 int node = trex_newnode(exp, OP_WB);
156 exp->_nodes[node].left = *exp->_p;
157 exp->_p++;
158 return node;
159 } //else default
160 default:
161 t = *exp->_p; exp->_p++;
162 return trex_newnode(exp, t);
165 else if (!scisprint(*exp->_p)) trex_error(exp, _SC("letter expected"));
166 t = *exp->_p; exp->_p++;
167 return trex_newnode(exp, t);
171 static int trex_class (TRex *exp) {
172 int ret = -1;
173 int first = -1, chain;
174 if (*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {
175 ret = trex_newnode(exp, OP_NCLASS);
176 exp->_p++;
177 } else ret = trex_newnode(exp, OP_CLASS);
178 if (*exp->_p == ']') trex_error(exp, _SC("empty class"));
179 chain = ret;
180 while (*exp->_p != ']' && exp->_p != exp->_eol) {
181 if (*exp->_p == '-' && first != -1){
182 int r, t;
183 if(*exp->_p++ == ']') trex_error(exp, _SC("unfinished range"));
184 r = trex_newnode(exp, OP_RANGE);
185 if(first>*exp->_p) trex_error(exp, _SC("invalid range"));
186 if(exp->_nodes[first].type == OP_CCLASS) trex_error(exp, _SC("cannot use character classes in ranges"));
187 exp->_nodes[r].left = exp->_nodes[first].type;
188 t = trex_escapechar(exp);
189 exp->_nodes[r].right = t;
190 exp->_nodes[chain].next = r;
191 chain = r;
192 first = -1;
193 } else {
194 if (first!=-1){
195 int c = first;
196 exp->_nodes[chain].next = c;
197 chain = c;
198 first = trex_charnode(exp, TRex_True);
199 } else {
200 first = trex_charnode(exp, TRex_True);
204 if (first!=-1) {
205 int c = first;
206 exp->_nodes[chain].next = c;
207 chain = c;
208 first = -1;
210 /* hack? */
211 exp->_nodes[ret].left = exp->_nodes[ret].next;
212 exp->_nodes[ret].next = -1;
213 return ret;
217 static int trex_parsenumber (TRex *exp) {
218 int ret = *exp->_p-'0';
219 int positions = 10;
220 exp->_p++;
221 while (isdigit(*exp->_p)) {
222 ret = ret*10+(*exp->_p++-'0');
223 if (positions==1000000000) trex_error(exp, _SC("overflow in numeric constant"));
224 positions *= 10;
226 return ret;
230 static int trex_element (TRex *exp) {
231 int ret = -1;
232 switch (*exp->_p) {
233 case '(': {
234 int expr, newn;
235 exp->_p++;
236 if (*exp->_p =='?') {
237 exp->_p++;
238 trex_expect(exp, ':');
239 expr = trex_newnode(exp, OP_NOCAPEXPR);
240 } else {
241 expr = trex_newnode(exp, OP_EXPR);
243 newn = trex_list(exp);
244 exp->_nodes[expr].left = newn;
245 ret = expr;
246 trex_expect(exp, ')');
247 break;
249 case '[':
250 exp->_p++;
251 ret = trex_class(exp);
252 trex_expect(exp, ']');
253 break;
254 case TREX_SYMBOL_END_OF_STRING: exp->_p++; ret = trex_newnode(exp, OP_EOL);break;
255 case TREX_SYMBOL_ANY_CHAR: exp->_p++; ret = trex_newnode(exp, OP_DOT);break;
256 default:
257 ret = trex_charnode(exp, TRex_False);
258 break;
261 //int op;
262 TRexBool isgreedy = TRex_False;
263 unsigned short p0 = 0, p1 = 0;
264 switch (*exp->_p) {
265 case TREX_SYMBOL_GREEDY_ZERO_OR_MORE: p0 = 0; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;
266 case TREX_SYMBOL_GREEDY_ONE_OR_MORE: p0 = 1; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;
267 case TREX_SYMBOL_GREEDY_ZERO_OR_ONE: p0 = 0; p1 = 1; exp->_p++; isgreedy = TRex_True; break;
268 case '{':
269 exp->_p++;
270 if (!isdigit(*exp->_p)) trex_error(exp, _SC("number expected"));
271 p0 = (unsigned short)trex_parsenumber(exp);
272 /*******************************/
273 switch (*exp->_p) {
274 case '}': p1 = p0; exp->_p++; break;
275 case ',':
276 exp->_p++;
277 p1 = 0xFFFF;
278 if (isdigit(*exp->_p)) p1 = (unsigned short)trex_parsenumber(exp);
279 trex_expect(exp, '}');
280 break;
281 default:
282 trex_error(exp, _SC(", or } expected"));
284 /*******************************/
285 isgreedy = TRex_True;
286 break;
288 if (isgreedy) {
289 int nnode = trex_newnode(exp, OP_GREEDY);
290 //op = OP_GREEDY;
291 exp->_nodes[nnode].left = ret;
292 exp->_nodes[nnode].right = ((p0)<<16)|p1;
293 ret = nnode;
296 if ((*exp->_p != TREX_SYMBOL_BRANCH) && (*exp->_p != ')') &&
297 (*exp->_p != TREX_SYMBOL_GREEDY_ZERO_OR_MORE) &&
298 (*exp->_p != TREX_SYMBOL_GREEDY_ONE_OR_MORE) &&
299 (*exp->_p != '\0')) {
300 int nnode = trex_element(exp);
301 exp->_nodes[ret].next = nnode;
303 return ret;
307 static int trex_list (TRex *exp) {
308 int ret = -1, e;
309 if (*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {
310 exp->_p++;
311 ret = trex_newnode(exp, OP_BOL);
313 e = trex_element(exp);
314 if (ret != -1) exp->_nodes[ret].next = e; else ret = e;
315 if (*exp->_p == TREX_SYMBOL_BRANCH) {
316 int temp, tright;
317 exp->_p++;
318 temp = trex_newnode(exp, OP_OR);
319 exp->_nodes[temp].left = ret;
320 tright = trex_list(exp);
321 exp->_nodes[temp].right = tright;
322 ret = temp;
324 return ret;
328 static TRexBool trex_matchcclass (int cclass, TRexChar c) {
329 switch (cclass) {
330 case 'a': return isalpha(c)?TRex_True:TRex_False;
331 case 'A': return !isalpha(c)?TRex_True:TRex_False;
332 case 'w': return (isalnum(c) || c == '_')?TRex_True:TRex_False;
333 case 'W': return (!isalnum(c) && c != '_')?TRex_True:TRex_False;
334 case 's': return isspace(c)?TRex_True:TRex_False;
335 case 'S': return !isspace(c)?TRex_True:TRex_False;
336 case 'd': return isdigit(c)?TRex_True:TRex_False;
337 case 'D': return !isdigit(c)?TRex_True:TRex_False;
338 case 'x': return isxdigit(c)?TRex_True:TRex_False;
339 case 'X': return !isxdigit(c)?TRex_True:TRex_False;
340 case 'c': return iscntrl(c)?TRex_True:TRex_False;
341 case 'C': return !iscntrl(c)?TRex_True:TRex_False;
342 case 'p': return ispunct(c)?TRex_True:TRex_False;
343 case 'P': return !ispunct(c)?TRex_True:TRex_False;
344 case 'l': return islower(c)?TRex_True:TRex_False;
345 case 'u': return isupper(c)?TRex_True:TRex_False;
347 return TRex_False; /*cannot happen*/
351 static TRexBool trex_matchclass (TRex* exp, TRexNode *node, TRexChar c) {
352 do {
353 switch (node->type) {
354 case OP_RANGE:
355 if (c >= node->left && c <= node->right) return TRex_True;
356 break;
357 case OP_CCLASS:
358 if (trex_matchcclass(node->left, c)) return TRex_True;
359 break;
360 default:
361 if (c == node->type)return TRex_True;
363 } while ((node->next != -1) && (node = &exp->_nodes[node->next]));
364 return TRex_False;
368 static const TRexChar *trex_matchnode (TRex* exp, TRexNode *node, const TRexChar *str, TRexNode *next) {
369 TRexNodeType type = node->type;
370 switch (type) {
371 case OP_GREEDY: {
372 //TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL;
373 TRexNode *greedystop = NULL;
374 int p0 = (node->right >> 16)&0x0000FFFF, p1 = node->right&0x0000FFFF, nmaches = 0;
375 const TRexChar *s=str, *good = str;
376 if (node->next != -1) greedystop = &exp->_nodes[node->next]; else greedystop = next;
377 while ((nmaches == 0xFFFF || nmaches < p1)) {
378 const TRexChar *stop;
379 if (!(s = trex_matchnode(exp, &exp->_nodes[node->left], s, greedystop))) break;
380 nmaches++;
381 good = s;
382 if (greedystop) {
383 //checks that 0 matches satisfy the expression(if so skips)
384 //if not would always stop(for instance if is a '?')
385 if (greedystop->type != OP_GREEDY ||
386 (greedystop->type == OP_GREEDY && ((greedystop->right >> 16)&0x0000FFFF) != 0)) {
387 TRexNode *gnext = NULL;
388 if (greedystop->next != -1) gnext = &exp->_nodes[greedystop->next];
389 else if (next && next->next != -1) gnext = &exp->_nodes[next->next];
390 stop = trex_matchnode(exp, greedystop, s, gnext);
391 if (stop) {
392 //if satisfied stop it
393 if (p0 == p1 && p0 == nmaches) break;
394 else if (nmaches >= p0 && p1 == 0xFFFF) break;
395 else if (nmaches >= p0 && nmaches <= p1) break;
399 if (s >= exp->_eol) break;
401 if (p0 == p1 && p0 == nmaches) return good;
402 if (nmaches >= p0 && p1 == 0xFFFF) return good;
403 if (nmaches >= p0 && nmaches <= p1) return good;
404 return NULL;
406 case OP_OR: {
407 const TRexChar *asd = str;
408 TRexNode *temp=&exp->_nodes[node->left];
409 while ((asd = trex_matchnode(exp, temp, asd, NULL))) {
410 if (temp->next != -1) temp = &exp->_nodes[temp->next]; else return asd;
412 asd = str;
413 temp = &exp->_nodes[node->right];
414 while ((asd = trex_matchnode(exp, temp, asd, NULL))) {
415 if (temp->next != -1) temp = &exp->_nodes[temp->next]; else return asd;
417 return NULL;
419 case OP_EXPR:
420 case OP_NOCAPEXPR: {
421 TRexNode *n = &exp->_nodes[node->left];
422 const TRexChar *cur = str;
423 int capture = -1;
424 if (node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) {
425 capture = exp->_currsubexp;
426 exp->_matches[capture].begin = cur;
427 exp->_currsubexp++;
429 do {
430 TRexNode *subnext = NULL;
431 if (n->next != -1) subnext = &exp->_nodes[n->next]; else subnext = next;
432 if (!(cur = trex_matchnode(exp, n, cur, subnext))) {
433 if (capture != -1){
434 exp->_matches[capture].begin = 0;
435 exp->_matches[capture].len = 0;
437 return NULL;
439 } while ((n->next != -1) && (n = &exp->_nodes[n->next]));
440 if (capture != -1) exp->_matches[capture].len = cur-exp->_matches[capture].begin;
441 return cur;
443 case OP_WB:
444 if ((str == exp->_bol && !isspace(*str)) ||
445 (str == exp->_eol && !isspace(*(str-1))) ||
446 (!isspace(*str) && isspace(*(str+1))) ||
447 (isspace(*str) && !isspace(*(str+1)))) return node->left=='b' ? str : NULL;
448 return node->left=='b' ? NULL : str;
449 case OP_BOL:
450 if (str == exp->_bol) return str;
451 return NULL;
452 case OP_EOL:
453 if (str == exp->_eol) return str;
454 return NULL;
455 case OP_DOT:
456 str++;
457 return str;
458 case OP_NCLASS:
459 case OP_CLASS:
460 if (trex_matchclass(exp, &exp->_nodes[node->left], *str) ?
461 (type == OP_CLASS ? TRex_True : TRex_False) :
462 (type == OP_NCLASS ? TRex_True : TRex_False)) {
463 str++;
464 return str;
466 return NULL;
467 case OP_CCLASS:
468 if (trex_matchcclass(node->left, *str)) {
469 str++;
470 return str;
472 return NULL;
473 default: /* char */
474 if (*str != node->type) return NULL;
475 str++; /* *str++; */
476 return str;
478 return NULL;
482 /* public api */
483 TRex *trex_compile (const TRexChar *pattern, const TRexChar **error) {
484 TRex *exp = (TRex *)malloc(sizeof(TRex));
485 exp->_eol = exp->_bol = NULL;
486 exp->_p = pattern;
487 exp->_nallocated = (int)scstrlen(pattern)*sizeof(TRexChar);
488 exp->_nodes = (TRexNode *)malloc(exp->_nallocated*sizeof(TRexNode));
489 exp->_nsize = 0;
490 exp->_matches = 0;
491 exp->_nsubexpr = 0;
492 exp->_first = trex_newnode(exp, OP_EXPR);
493 exp->_error = error;
494 exp->_jmpbuf = malloc(sizeof(jmp_buf));
495 if (setjmp(*((jmp_buf *)exp->_jmpbuf)) == 0) {
496 int res = trex_list(exp);
497 exp->_nodes[exp->_first].left = res;
498 if (*exp->_p!='\0') trex_error(exp, _SC("unexpected character"));
499 #ifdef _DEBUG
501 int nsize, i;
502 TRexNode *t;
503 nsize = exp->_nsize;
504 t = &exp->_nodes[0];
505 scprintf(_SC("\n"));
506 for (i = 0;i < nsize; i++) {
507 if(exp->_nodes[i].type > MAX_CHAR)
508 scprintf(_SC("[%02d] %10s "), i, g_nnames[exp->_nodes[i].type-MAX_CHAR]);
509 else
510 scprintf(_SC("[%02d] %10c "), i, exp->_nodes[i].type);
511 scprintf(_SC("left %02d right %02d next %02d\n"), exp->_nodes[i].left, exp->_nodes[i].right, exp->_nodes[i].next);
513 scprintf(_SC("\n"));
515 #endif
516 exp->_matches = (TRexMatch *)malloc(exp->_nsubexpr*sizeof(TRexMatch));
517 memset(exp->_matches, 0, exp->_nsubexpr*sizeof(TRexMatch));
518 } else {
519 trex_free(exp);
520 return NULL;
522 return exp;
526 void trex_free (TRex *exp) {
527 if (exp) {
528 if (exp->_nodes) free(exp->_nodes);
529 if (exp->_jmpbuf) free(exp->_jmpbuf);
530 if (exp->_matches) free(exp->_matches);
531 free(exp);
536 TRexBool trex_match (TRex *exp, const TRexChar *text) {
537 const TRexChar *res = NULL;
538 exp->_bol = text;
539 exp->_eol = text+scstrlen(text);
540 exp->_currsubexp = 0;
541 res = trex_matchnode(exp, exp->_nodes, text, NULL);
542 if (res == NULL || res != exp->_eol) return TRex_False;
543 return TRex_True;
547 TRexBool trex_searchrange (TRex *exp, const TRexChar *text_begin, const TRexChar *text_end,
548 const TRexChar **out_begin, const TRexChar **out_end)
550 const TRexChar *cur = NULL;
551 int node = exp->_first;
552 if (text_begin >= text_end) return TRex_False;
553 exp->_bol = text_begin;
554 exp->_eol = text_end;
555 do {
556 cur = text_begin;
557 while (node != -1) {
558 exp->_currsubexp = 0;
559 cur = trex_matchnode(exp, &exp->_nodes[node], cur, NULL);
560 if (!cur) break;
561 node = exp->_nodes[node].next;
563 text_begin++;
564 } while (cur == NULL && text_begin != text_end);
565 if (cur == NULL) return TRex_False;
566 --text_begin;
567 if (out_begin) *out_begin = text_begin;
568 if (out_end) *out_end = cur;
569 return TRex_True;
573 TRexBool trex_search (TRex *exp, const TRexChar *text, const TRexChar **out_begin, const TRexChar **out_end) {
574 return trex_searchrange(exp, text, text+scstrlen(text), out_begin, out_end);
578 int trex_getsubexpcount (TRex *exp) {
579 return exp->_nsubexpr;
583 TRexBool trex_getsubexp (TRex *exp, int n, TRexMatch *subexp) {
584 if (n < 0 || n >= exp->_nsubexpr) return TRex_False;
585 *subexp = exp->_matches[n];
586 return TRex_True;