1 /* see copyright notice in trex.h */
9 # define scisprint iswprint
10 # define scstrlen wcslen
11 # define scprintf wprintf
14 # define scisprint isprint
15 # define scstrlen strlen
16 # define scprintf printf
23 static const TRexChar
*g_nnames
[] = {
24 _SC("NONE"), _SC("OP_GREEDY"), _SC("OP_OR"),
25 _SC("OP_EXPR"), _SC("OP_NOCAPEXPR"), _SC("OP_DOT"), _SC("OP_CLASS"),
26 _SC("OP_CCLASS"), _SC("OP_NCLASS"), _SC("OP_RANGE"), _SC("OP_CHAR"),
27 _SC("OP_EOL"), _SC("OP_BOL"), _SC("OP_WB")
32 #define OP_GREEDY (MAX_CHAR+1) // * + ? {n}
33 #define OP_OR (MAX_CHAR+2)
34 #define OP_EXPR (MAX_CHAR+3) //parentesis ()
35 #define OP_NOCAPEXPR (MAX_CHAR+4) //parentesis (?:)
36 #define OP_DOT (MAX_CHAR+5)
37 #define OP_CLASS (MAX_CHAR+6)
38 #define OP_CCLASS (MAX_CHAR+7)
39 #define OP_NCLASS (MAX_CHAR+8) //negates class the [^
40 #define OP_RANGE (MAX_CHAR+9)
41 #define OP_CHAR (MAX_CHAR+10)
42 #define OP_EOL (MAX_CHAR+11)
43 #define OP_BOL (MAX_CHAR+12)
44 #define OP_WB (MAX_CHAR+13)
46 #define TREX_SYMBOL_ANY_CHAR ('.')
47 #define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')
48 #define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')
49 #define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')
50 #define TREX_SYMBOL_BRANCH ('|')
51 #define TREX_SYMBOL_END_OF_STRING ('$')
52 #define TREX_SYMBOL_BEGINNING_OF_STRING ('^')
53 #define TREX_SYMBOL_ESCAPE_CHAR ('\\')
56 typedef int TRexNodeType
;
58 typedef struct tagTRexNode
{
78 const TRexChar
**_error
;
82 static int trex_list (TRex
*exp
);
85 static int trex_newnode (TRex
*exp
, TRexNodeType type
) {
89 n
.next
= n
.right
= n
.left
= -1;
90 if (type
== OP_EXPR
) n
.right
= exp
->_nsubexpr
++;
91 if(exp
->_nallocated
< (exp
->_nsize
+1)) {
92 exp
->_nallocated
*= 2;
93 exp
->_nodes
= (TRexNode
*)realloc(exp
->_nodes
, exp
->_nallocated
*sizeof(TRexNode
));
95 exp
->_nodes
[exp
->_nsize
++] = n
;
96 newid
= exp
->_nsize
-1;
101 static void trex_error (TRex
*exp
, const TRexChar
*error
) {
102 if (exp
->_error
) *exp
->_error
= error
;
103 longjmp(*((jmp_buf*)exp
->_jmpbuf
), -1);
107 static void trex_expect (TRex
*exp
, int n
){
108 if ((*exp
->_p
) != n
) trex_error(exp
, _SC("expected paren"));
113 static TRexChar
trex_escapechar (TRex
*exp
) {
114 if (*exp
->_p
== TREX_SYMBOL_ESCAPE_CHAR
) {
117 case 'v': exp
->_p
++; return '\v';
118 case 'n': exp
->_p
++; return '\n';
119 case 't': exp
->_p
++; return '\t';
120 case 'r': exp
->_p
++; return '\r';
121 case 'f': exp
->_p
++; return '\f';
122 default: return (*exp
->_p
++);
125 if (!scisprint(*exp
->_p
)) trex_error(exp
, _SC("letter expected"));
130 static int trex_charclass (TRex
*exp
, int classid
) {
131 int n
= trex_newnode(exp
, OP_CCLASS
);
132 exp
->_nodes
[n
].left
= classid
;
137 static int trex_charnode (TRex
*exp
, TRexBool isclass
) {
139 if (*exp
->_p
== TREX_SYMBOL_ESCAPE_CHAR
) {
142 case 'n': exp
->_p
++; return trex_newnode(exp
, '\n');
143 case 't': exp
->_p
++; return trex_newnode(exp
, '\t');
144 case 'r': exp
->_p
++; return trex_newnode(exp
, '\r');
145 case 'f': exp
->_p
++; return trex_newnode(exp
, '\f');
146 case 'v': exp
->_p
++; return trex_newnode(exp
, '\v');
147 case 'a': case 'A': case 'w': case 'W': case 's': case 'S':
148 case 'd': case 'D': case 'x': case 'X': case 'c': case 'C':
149 case 'p': case 'P': case 'l': case 'u': {
150 t
= *exp
->_p
; exp
->_p
++;
151 return trex_charclass(exp
, t
); }
155 int node
= trex_newnode(exp
, OP_WB
);
156 exp
->_nodes
[node
].left
= *exp
->_p
;
161 t
= *exp
->_p
; exp
->_p
++;
162 return trex_newnode(exp
, t
);
165 else if (!scisprint(*exp
->_p
)) trex_error(exp
, _SC("letter expected"));
166 t
= *exp
->_p
; exp
->_p
++;
167 return trex_newnode(exp
, t
);
171 static int trex_class (TRex
*exp
) {
173 int first
= -1, chain
;
174 if (*exp
->_p
== TREX_SYMBOL_BEGINNING_OF_STRING
) {
175 ret
= trex_newnode(exp
, OP_NCLASS
);
177 } else ret
= trex_newnode(exp
, OP_CLASS
);
178 if (*exp
->_p
== ']') trex_error(exp
, _SC("empty class"));
180 while (*exp
->_p
!= ']' && exp
->_p
!= exp
->_eol
) {
181 if (*exp
->_p
== '-' && first
!= -1){
183 if(*exp
->_p
++ == ']') trex_error(exp
, _SC("unfinished range"));
184 r
= trex_newnode(exp
, OP_RANGE
);
185 if(first
>*exp
->_p
) trex_error(exp
, _SC("invalid range"));
186 if(exp
->_nodes
[first
].type
== OP_CCLASS
) trex_error(exp
, _SC("cannot use character classes in ranges"));
187 exp
->_nodes
[r
].left
= exp
->_nodes
[first
].type
;
188 t
= trex_escapechar(exp
);
189 exp
->_nodes
[r
].right
= t
;
190 exp
->_nodes
[chain
].next
= r
;
196 exp
->_nodes
[chain
].next
= c
;
198 first
= trex_charnode(exp
, TRex_True
);
200 first
= trex_charnode(exp
, TRex_True
);
206 exp
->_nodes
[chain
].next
= c
;
211 exp
->_nodes
[ret
].left
= exp
->_nodes
[ret
].next
;
212 exp
->_nodes
[ret
].next
= -1;
217 static int trex_parsenumber (TRex
*exp
) {
218 int ret
= *exp
->_p
-'0';
221 while (isdigit(*exp
->_p
)) {
222 ret
= ret
*10+(*exp
->_p
++-'0');
223 if (positions
==1000000000) trex_error(exp
, _SC("overflow in numeric constant"));
230 static int trex_element (TRex
*exp
) {
236 if (*exp
->_p
=='?') {
238 trex_expect(exp
, ':');
239 expr
= trex_newnode(exp
, OP_NOCAPEXPR
);
241 expr
= trex_newnode(exp
, OP_EXPR
);
243 newn
= trex_list(exp
);
244 exp
->_nodes
[expr
].left
= newn
;
246 trex_expect(exp
, ')');
251 ret
= trex_class(exp
);
252 trex_expect(exp
, ']');
254 case TREX_SYMBOL_END_OF_STRING
: exp
->_p
++; ret
= trex_newnode(exp
, OP_EOL
);break;
255 case TREX_SYMBOL_ANY_CHAR
: exp
->_p
++; ret
= trex_newnode(exp
, OP_DOT
);break;
257 ret
= trex_charnode(exp
, TRex_False
);
262 TRexBool isgreedy
= TRex_False
;
263 unsigned short p0
= 0, p1
= 0;
265 case TREX_SYMBOL_GREEDY_ZERO_OR_MORE
: p0
= 0; p1
= 0xFFFF; exp
->_p
++; isgreedy
= TRex_True
; break;
266 case TREX_SYMBOL_GREEDY_ONE_OR_MORE
: p0
= 1; p1
= 0xFFFF; exp
->_p
++; isgreedy
= TRex_True
; break;
267 case TREX_SYMBOL_GREEDY_ZERO_OR_ONE
: p0
= 0; p1
= 1; exp
->_p
++; isgreedy
= TRex_True
; break;
270 if (!isdigit(*exp
->_p
)) trex_error(exp
, _SC("number expected"));
271 p0
= (unsigned short)trex_parsenumber(exp
);
272 /*******************************/
274 case '}': p1
= p0
; exp
->_p
++; break;
278 if (isdigit(*exp
->_p
)) p1
= (unsigned short)trex_parsenumber(exp
);
279 trex_expect(exp
, '}');
282 trex_error(exp
, _SC(", or } expected"));
284 /*******************************/
285 isgreedy
= TRex_True
;
289 int nnode
= trex_newnode(exp
, OP_GREEDY
);
291 exp
->_nodes
[nnode
].left
= ret
;
292 exp
->_nodes
[nnode
].right
= ((p0
)<<16)|p1
;
296 if ((*exp
->_p
!= TREX_SYMBOL_BRANCH
) && (*exp
->_p
!= ')') &&
297 (*exp
->_p
!= TREX_SYMBOL_GREEDY_ZERO_OR_MORE
) &&
298 (*exp
->_p
!= TREX_SYMBOL_GREEDY_ONE_OR_MORE
) &&
299 (*exp
->_p
!= '\0')) {
300 int nnode
= trex_element(exp
);
301 exp
->_nodes
[ret
].next
= nnode
;
307 static int trex_list (TRex
*exp
) {
309 if (*exp
->_p
== TREX_SYMBOL_BEGINNING_OF_STRING
) {
311 ret
= trex_newnode(exp
, OP_BOL
);
313 e
= trex_element(exp
);
314 if (ret
!= -1) exp
->_nodes
[ret
].next
= e
; else ret
= e
;
315 if (*exp
->_p
== TREX_SYMBOL_BRANCH
) {
318 temp
= trex_newnode(exp
, OP_OR
);
319 exp
->_nodes
[temp
].left
= ret
;
320 tright
= trex_list(exp
);
321 exp
->_nodes
[temp
].right
= tright
;
328 static TRexBool
trex_matchcclass (int cclass
, TRexChar c
) {
330 case 'a': return isalpha(c
)?TRex_True
:TRex_False
;
331 case 'A': return !isalpha(c
)?TRex_True
:TRex_False
;
332 case 'w': return (isalnum(c
) || c
== '_')?TRex_True
:TRex_False
;
333 case 'W': return (!isalnum(c
) && c
!= '_')?TRex_True
:TRex_False
;
334 case 's': return isspace(c
)?TRex_True
:TRex_False
;
335 case 'S': return !isspace(c
)?TRex_True
:TRex_False
;
336 case 'd': return isdigit(c
)?TRex_True
:TRex_False
;
337 case 'D': return !isdigit(c
)?TRex_True
:TRex_False
;
338 case 'x': return isxdigit(c
)?TRex_True
:TRex_False
;
339 case 'X': return !isxdigit(c
)?TRex_True
:TRex_False
;
340 case 'c': return iscntrl(c
)?TRex_True
:TRex_False
;
341 case 'C': return !iscntrl(c
)?TRex_True
:TRex_False
;
342 case 'p': return ispunct(c
)?TRex_True
:TRex_False
;
343 case 'P': return !ispunct(c
)?TRex_True
:TRex_False
;
344 case 'l': return islower(c
)?TRex_True
:TRex_False
;
345 case 'u': return isupper(c
)?TRex_True
:TRex_False
;
347 return TRex_False
; /*cannot happen*/
351 static TRexBool
trex_matchclass (TRex
* exp
, TRexNode
*node
, TRexChar c
) {
353 switch (node
->type
) {
355 if (c
>= node
->left
&& c
<= node
->right
) return TRex_True
;
358 if (trex_matchcclass(node
->left
, c
)) return TRex_True
;
361 if (c
== node
->type
)return TRex_True
;
363 } while ((node
->next
!= -1) && (node
= &exp
->_nodes
[node
->next
]));
368 static const TRexChar
*trex_matchnode (TRex
* exp
, TRexNode
*node
, const TRexChar
*str
, TRexNode
*next
) {
369 TRexNodeType type
= node
->type
;
372 //TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL;
373 TRexNode
*greedystop
= NULL
;
374 int p0
= (node
->right
>> 16)&0x0000FFFF, p1
= node
->right
&0x0000FFFF, nmaches
= 0;
375 const TRexChar
*s
=str
, *good
= str
;
376 if (node
->next
!= -1) greedystop
= &exp
->_nodes
[node
->next
]; else greedystop
= next
;
377 while ((nmaches
== 0xFFFF || nmaches
< p1
)) {
378 const TRexChar
*stop
;
379 if (!(s
= trex_matchnode(exp
, &exp
->_nodes
[node
->left
], s
, greedystop
))) break;
383 //checks that 0 matches satisfy the expression(if so skips)
384 //if not would always stop(for instance if is a '?')
385 if (greedystop
->type
!= OP_GREEDY
||
386 (greedystop
->type
== OP_GREEDY
&& ((greedystop
->right
>> 16)&0x0000FFFF) != 0)) {
387 TRexNode
*gnext
= NULL
;
388 if (greedystop
->next
!= -1) gnext
= &exp
->_nodes
[greedystop
->next
];
389 else if (next
&& next
->next
!= -1) gnext
= &exp
->_nodes
[next
->next
];
390 stop
= trex_matchnode(exp
, greedystop
, s
, gnext
);
392 //if satisfied stop it
393 if (p0
== p1
&& p0
== nmaches
) break;
394 else if (nmaches
>= p0
&& p1
== 0xFFFF) break;
395 else if (nmaches
>= p0
&& nmaches
<= p1
) break;
399 if (s
>= exp
->_eol
) break;
401 if (p0
== p1
&& p0
== nmaches
) return good
;
402 if (nmaches
>= p0
&& p1
== 0xFFFF) return good
;
403 if (nmaches
>= p0
&& nmaches
<= p1
) return good
;
407 const TRexChar
*asd
= str
;
408 TRexNode
*temp
=&exp
->_nodes
[node
->left
];
409 while ((asd
= trex_matchnode(exp
, temp
, asd
, NULL
))) {
410 if (temp
->next
!= -1) temp
= &exp
->_nodes
[temp
->next
]; else return asd
;
413 temp
= &exp
->_nodes
[node
->right
];
414 while ((asd
= trex_matchnode(exp
, temp
, asd
, NULL
))) {
415 if (temp
->next
!= -1) temp
= &exp
->_nodes
[temp
->next
]; else return asd
;
421 TRexNode
*n
= &exp
->_nodes
[node
->left
];
422 const TRexChar
*cur
= str
;
424 if (node
->type
!= OP_NOCAPEXPR
&& node
->right
== exp
->_currsubexp
) {
425 capture
= exp
->_currsubexp
;
426 exp
->_matches
[capture
].begin
= cur
;
430 TRexNode
*subnext
= NULL
;
431 if (n
->next
!= -1) subnext
= &exp
->_nodes
[n
->next
]; else subnext
= next
;
432 if (!(cur
= trex_matchnode(exp
, n
, cur
, subnext
))) {
434 exp
->_matches
[capture
].begin
= 0;
435 exp
->_matches
[capture
].len
= 0;
439 } while ((n
->next
!= -1) && (n
= &exp
->_nodes
[n
->next
]));
440 if (capture
!= -1) exp
->_matches
[capture
].len
= cur
-exp
->_matches
[capture
].begin
;
444 if ((str
== exp
->_bol
&& !isspace(*str
)) ||
445 (str
== exp
->_eol
&& !isspace(*(str
-1))) ||
446 (!isspace(*str
) && isspace(*(str
+1))) ||
447 (isspace(*str
) && !isspace(*(str
+1)))) return node
->left
=='b' ? str
: NULL
;
448 return node
->left
=='b' ? NULL
: str
;
450 if (str
== exp
->_bol
) return str
;
453 if (str
== exp
->_eol
) return str
;
460 if (trex_matchclass(exp
, &exp
->_nodes
[node
->left
], *str
) ?
461 (type
== OP_CLASS
? TRex_True
: TRex_False
) :
462 (type
== OP_NCLASS
? TRex_True
: TRex_False
)) {
468 if (trex_matchcclass(node
->left
, *str
)) {
474 if (*str
!= node
->type
) return NULL
;
483 TRex
*trex_compile (const TRexChar
*pattern
, const TRexChar
**error
) {
484 TRex
*exp
= (TRex
*)malloc(sizeof(TRex
));
485 exp
->_eol
= exp
->_bol
= NULL
;
487 exp
->_nallocated
= (int)scstrlen(pattern
)*sizeof(TRexChar
);
488 exp
->_nodes
= (TRexNode
*)malloc(exp
->_nallocated
*sizeof(TRexNode
));
492 exp
->_first
= trex_newnode(exp
, OP_EXPR
);
494 exp
->_jmpbuf
= malloc(sizeof(jmp_buf));
495 if (setjmp(*((jmp_buf *)exp
->_jmpbuf
)) == 0) {
496 int res
= trex_list(exp
);
497 exp
->_nodes
[exp
->_first
].left
= res
;
498 if (*exp
->_p
!='\0') trex_error(exp
, _SC("unexpected character"));
506 for (i
= 0;i
< nsize
; i
++) {
507 if(exp
->_nodes
[i
].type
> MAX_CHAR
)
508 scprintf(_SC("[%02d] %10s "), i
, g_nnames
[exp
->_nodes
[i
].type
-MAX_CHAR
]);
510 scprintf(_SC("[%02d] %10c "), i
, exp
->_nodes
[i
].type
);
511 scprintf(_SC("left %02d right %02d next %02d\n"), exp
->_nodes
[i
].left
, exp
->_nodes
[i
].right
, exp
->_nodes
[i
].next
);
516 exp
->_matches
= (TRexMatch
*)malloc(exp
->_nsubexpr
*sizeof(TRexMatch
));
517 memset(exp
->_matches
, 0, exp
->_nsubexpr
*sizeof(TRexMatch
));
526 void trex_free (TRex
*exp
) {
528 if (exp
->_nodes
) free(exp
->_nodes
);
529 if (exp
->_jmpbuf
) free(exp
->_jmpbuf
);
530 if (exp
->_matches
) free(exp
->_matches
);
536 TRexBool
trex_match (TRex
*exp
, const TRexChar
*text
) {
537 const TRexChar
*res
= NULL
;
539 exp
->_eol
= text
+scstrlen(text
);
540 exp
->_currsubexp
= 0;
541 res
= trex_matchnode(exp
, exp
->_nodes
, text
, NULL
);
542 if (res
== NULL
|| res
!= exp
->_eol
) return TRex_False
;
547 TRexBool
trex_searchrange (TRex
*exp
, const TRexChar
*text_begin
, const TRexChar
*text_end
,
548 const TRexChar
**out_begin
, const TRexChar
**out_end
)
550 const TRexChar
*cur
= NULL
;
551 int node
= exp
->_first
;
552 if (text_begin
>= text_end
) return TRex_False
;
553 exp
->_bol
= text_begin
;
554 exp
->_eol
= text_end
;
558 exp
->_currsubexp
= 0;
559 cur
= trex_matchnode(exp
, &exp
->_nodes
[node
], cur
, NULL
);
561 node
= exp
->_nodes
[node
].next
;
564 } while (cur
== NULL
&& text_begin
!= text_end
);
565 if (cur
== NULL
) return TRex_False
;
567 if (out_begin
) *out_begin
= text_begin
;
568 if (out_end
) *out_end
= cur
;
573 TRexBool
trex_search (TRex
*exp
, const TRexChar
*text
, const TRexChar
**out_begin
, const TRexChar
**out_end
) {
574 return trex_searchrange(exp
, text
, text
+scstrlen(text
), out_begin
, out_end
);
578 int trex_getsubexpcount (TRex
*exp
) {
579 return exp
->_nsubexpr
;
583 TRexBool
trex_getsubexp (TRex
*exp
, int n
, TRexMatch
*subexp
) {
584 if (n
< 0 || n
>= exp
->_nsubexpr
) return TRex_False
;
585 *subexp
= exp
->_matches
[n
];