1 //////////////////////////////////////////////////////////////////////////////
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
23 //////////////////////////////////////////////////////////////////////////////
29 #include <AD/generic/generic.h> // Generic definitions
30 #include <AD/contain/charset.h> // Character set
31 #include <AD/strings/regexp.h> // Regular expression search
34 static CharSet alpha
; // set of alpha characters
35 static CharSet numeric
; // set of numeric characters
36 static CharSet space
; // set of space characters
37 static CharSet word
; // set of word characters
40 { for (register int c
= 0; c
< 256; c
++) {
41 if (isalpha(c
)) alpha
+= c
;
42 if (isdigit(c
)) numeric
+= c
;
43 if (isspace(c
)) space
+= c
;
44 if (isalnum(c
)) word
+= c
;
50 // The set of opcodes used by the NFA matching machine
53 enum { SUCCESS
= 0, // accept state
54 BEGIN_OF_LINE
= 1, // match the beginning of a line
55 END_OF_LINE
= 2, // match the end of a line
56 WILD_CARD
= 3, // matches anything besides '\n'
57 MATCH_LITERAL
= 4, // matches a literal string
58 MATCH_SET
= 5, // matches any character in a set
59 MATCH_CHAR_STAR
= 6, // matches a character zero or more times
60 MATCH_CHAR_PLUS
= 7, // matches a character one or more times
61 MATCH_SET_STAR
= 8, // matches a set zero or more times
62 MATCH_SET_PLUS
= 9, // matches a set one or more times
63 OR
= 10, // disjunction, branch unlikely
64 PLUS
= 11, // disjunction, branch likely
65 JUMP
= 12, // unconditional jump
66 TIMES
= 13, // match a subexpression m to n times.
67 MATCH_BOUND
= 14, // match word boundary
68 MATCH_NON_BOUND
= 15, // match non-word boundary
69 MATCH_WORD_BEGIN
= 16, // match beginning of a word
70 MATCH_WORD_END
= 17, // match end of a word
71 REF
= 20, // matches a backward reference
72 OPEN
= 30, // beginning of a reference
73 CLOSE
= 40 // end of a reference
76 RegExp::RegExp(const RegExp
& re
) : nfa(0) { *this = re
; }
78 RegExp
& RegExp::operator = (const RegExp
& re
)
80 delete [] nfa
; nfa
= NULL
;
82 nfa
= new Opcode
[size
= re
.size
];
83 memcpy(nfa
,re
.nfa
,sizeof(Opcode
) * size
);
84 minMatchLen
= re
.minMatchLen
;
91 // The main pattern matcher, returns the character following
92 // the entire matched text if successful; otherwise, returns NULL
95 const char * RegExp::grep( register const Opcode
* PC
, // current opcode
96 register const char * text
, // current character
97 register const char * textEnd
// end of text string
99 { register char c
= *text
++; // cache the first character
100 int n
, len
, minTimes
, times
;
104 case SUCCESS
: return text
-1;
105 case BEGIN_OF_LINE
: if (text
- 1 != myText
) return NULL
; break;
107 if (text
> textEnd
) break;
108 if (c
== '\n') { c
= *text
++; break; }
111 if (text
> textEnd
|| c
== '\n') return NULL
;
113 case OPEN
+0: case OPEN
+1: case OPEN
+2: case OPEN
+3: case OPEN
+4:
114 case OPEN
+5: case OPEN
+6: case OPEN
+7: case OPEN
+8: case OPEN
+9:
115 left
[PC
[-1] - OPEN
] = text
- myText
- 1; break;
116 case CLOSE
+0: case CLOSE
+1: case CLOSE
+2: case CLOSE
+3: case CLOSE
+4:
117 case CLOSE
+5: case CLOSE
+6: case CLOSE
+7: case CLOSE
+8: case CLOSE
+9:
118 right
[PC
[-1] - CLOSE
] = text
- myText
- 2; break;
119 case REF
+0: case REF
+1: case REF
+2: case REF
+3: case REF
+4:
120 case REF
+5: case REF
+6: case REF
+7: case REF
+8: case REF
+9:
121 n
= PC
[-1] - REF
; len
= right
[n
] - left
[n
];
122 if (text
+len
>= textEnd
|| len
< 0 ||
123 memcmp(myText
+left
[n
],text
-1,len
))
125 text
+= len
- 1; c
= *text
++;
130 if (c
!= *PC
++ || text
+ len
> textEnd
|| memcmp(text
,PC
,len
))
132 PC
+= len
; text
+= len
; c
= *text
++;
136 if (text
> textEnd
||
137 bitOf((Byte
*)PC
,(unsigned char)c
) == 0) return NULL
;
138 PC
+= 32; c
= *text
++; break;
140 case MATCH_CHAR_PLUS
: minTimes
= 1; goto Match_char
;
141 case MATCH_CHAR_STAR
: minTimes
= 0;
143 { c
= PC
[1]; PC
+= 2;
144 for (times
= 0, text
--; text
< textEnd
; text
++, times
++)
145 if (*text
!= c
) break;
146 for ( ;times
>= minTimes
; times
--, text
--) {
147 const char * matchEnd
= this->grep(PC
,text
,textEnd
);
148 if (matchEnd
) return matchEnd
;
152 case MATCH_SET_PLUS
: minTimes
= 1; goto Match_set
;
153 case MATCH_SET_STAR
: minTimes
= 0;
155 { for (times
= 0, text
--; text
< textEnd
; text
++, times
++)
156 if (bitOf((Byte
*)PC
,(unsigned char)*text
) == 0) break;
157 for ( PC
+= 32; times
>= minTimes
; times
--, text
--) {
158 const char * matchEnd
= this->grep(PC
,text
,textEnd
);
159 if (matchEnd
) return matchEnd
;
164 case TIMES
: // Try subpattern m to n times
165 return this->times(PC
[0], PC
[1], text
-1, textEnd
, PC
+2);
166 case MATCH_WORD_BEGIN
: // Beginning of a word??
167 if ( text
> textEnd
|| ! word
[c
] ||
168 text
- 1 > myText
&& word
[text
[-2]])
171 case MATCH_WORD_END
: // End of a word??
172 if ( text
<= textEnd
&& ! word
[c
] ||
173 text
< textEnd
&& word
[c
]) return NULL
;
175 case MATCH_BOUND
: // A word boundary??
176 if ( (text
> textEnd
|| ! word
[c
]) &&
177 (text
- 2 < myText
|| ! word
[text
[-2]]) ||
178 text
<= textEnd
&& word
[c
] &&
179 text
- 2 >= myText
&& word
[text
[-2]] )
182 case MATCH_NON_BOUND
: // Not a word boundary??
183 if ( text
<= textEnd
&& word
[c
] &&
184 (text
- 2 < myText
|| ! word
[text
[-2]]) ||
185 (text
> textEnd
|| ! word
[c
]) &&
186 text
- 2 >= myText
&& word
[text
[-2]] )
189 case OR
: // Try one alternate, then another
190 { const char * success
= this->grep(PC
+2,text
-1,textEnd
);
191 if (success
) return success
; else PC
+= PC
[0] * 256 + PC
[1];
193 case PLUS
: // Try one alternate, then another
194 { const char * good
=
195 this->grep(PC
+PC
[0] * 256 + PC
[1],text
-1,textEnd
);
196 if (good
) return good
; else PC
+= 2;
198 case JUMP
: // Unconditional jump
199 PC
+= PC
[0] * 256 + PC
[1]; break;
200 default: return NULL
;
207 // Call and matches a subpattern |minTimes| to |maxTimes| times.
208 // Notice that |maxTimes < 0| means that the maximum times is infinite.
211 const char * RegExp::times( int minTimes
,
217 { const char * matched
;
218 while (minTimes
>= 0 && maxTimes
>= 0) {
219 if (maxTimes
== 1) return this->grep(PC
,text
,textEnd
);
220 else if (maxTimes
< 0)
221 matched
= this->times(minTimes
-1,-1,PC
,text
,textEnd
);
223 matched
= this->times(minTimes
-1,maxTimes
-1,PC
,text
,textEnd
);
224 if (matched
== NULL
) return NULL
;
225 if (minTimes
< 0) return matched
;
232 // Main entry point of the pattern matcher
235 int RegExp::Match(register const char * text
, int length
)
237 if (nfa
== NULL
) return -1;
238 if (length
< 0) length
= strlen(text
);
242 const char * textEnd
= text
+ length
;
243 register const char * limit
;
244 if (minMatchLen
> 0) limit
= textEnd
- minMatchLen
+ 1;
245 else limit
= textEnd
;
248 case BEGIN_OF_LINE
: // Anchored match
249 if ((matchedTextEnd
= this->grep(nfa
+1,text
,textEnd
))) {
251 return matchedText
- myText
;
254 case MATCH_LITERAL
: // Starts with a literal
256 for ( c
= nfa
[2]; text
< limit
; text
++)
257 if (c
== *text
&& text
+ len
<= textEnd
&&
258 memcmp(text
,nfa
+2,len
) == 0)
259 if ((matchedTextEnd
= this->grep(nfa
+len
+2,text
+len
,textEnd
))) {
261 return matchedText
- myText
;
265 case MATCH_CHAR_PLUS
: // Starts with a character
266 for ( c
= nfa
[2]; text
< limit
; text
++)
268 if ((matchedTextEnd
= this->grep(nfa
,text
,textEnd
))) {
270 return matchedText
- myText
;
273 case MATCH_SET
: case MATCH_SET_PLUS
: // Starts with a set
274 { register Byte
* set
= (Byte
*)(nfa
+1);
275 for ( ; text
< limit
; text
++)
276 if (bitOf(set
,(unsigned char)*text
))
277 if ((matchedTextEnd
= this->grep(nfa
,text
,textEnd
))) {
279 return matchedText
- myText
;
283 default: // Can start with anything
284 for ( ;text
< limit
; text
++)
285 if ((matchedTextEnd
= this->grep(nfa
,text
,textEnd
))) {
287 return matchedText
- myText
;
294 // Compile a new pattern
297 RegExp
& RegExp::compile(const char * pattern
, const char * meta
)
299 delete [] nfa
; // delete last compiled pattern, if any
301 if (meta
== NULL
) // set default meta characters
302 meta
= "*+()[]?^$-.|\\\0";
304 MetaChar M
[256]; // meta character assignment
306 if (! space
[' ']) init(); // initialize, if necessary
309 // Compute the meta character assignment
311 for (int i
= 0; i
< 256; i
++) M
[i
] = META_NONE
;
312 for (int j
= META_STAR
; j
<= META_EOP
; j
++) M
[ meta
[j
] ] = (MetaChar
)j
;
314 size
= this->comp(pattern
, M
, false); // compile it once to get the size
317 nfa
= new Opcode
[size
]; // allocate new storage
318 this->comp(pattern
, M
, true); // compile it again to emit opcodes
319 minMatchLen
= this->minMatch(nfa
); // compute the minimal matching len
325 inline void shift(register RegExp::Opcode
* start
,
326 register RegExp::Opcode
* end
,
329 while (end
> start
) { end
--; end
[size
] = end
[0]; }
333 // ``Parse'' pattern and compile the NFA
336 int RegExp::comp(const char * pattern
, const MetaChar M
[], Bool emit
)
338 register const char * p
; // pointer to pattern
339 register char c
; // cached pattern character
340 register int size
= 0; // size of the nfa
341 register Opcode
* PC
= nfa
; // nfa opcodes
342 Opcode
* last
; // last opcode
343 CharSet set
; // a character set
344 int level
= 0; // nesting level
345 Opcode
* start
[10]; // starting point for subpattern
346 int parenCount
= 0; // parenthesis count
347 int paren
[10]; // closing parenthesis number
348 Bool complement
= false; // set complement??
350 start
[0] = PC
; // starting point of first pattern
352 for (last
= NULL
, p
= pattern
; M
[c
= *p
++] != META_EOP
; ) {
359 if (last
[1] == 1) last
[0] = MATCH_CHAR_STAR
;
360 else goto Emit_star
; break;
361 case MATCH_CHAR_PLUS
: last
[0] = MATCH_CHAR_STAR
; break;
362 case MATCH_CHAR_STAR
: break;
364 case MATCH_SET_PLUS
: last
[0] = MATCH_SET_STAR
; break;
365 case MATCH_SET_STAR
: break;
368 *PC
++ = (last
- PC
- 2) / 256;
369 *PC
++ = (last
- PC
- 2) % 256;
372 last
[1] = (PC
- last
+ 2) / 256;
373 last
[2] = (PC
- last
+ 2) % 256;
377 } else goto Syntax_Error
;
378 } else size
+= 6; break;
384 if (last
[1] == 1) last
[0] = MATCH_CHAR_PLUS
;
385 else goto Emit_plus
; break;
386 case MATCH_CHAR_PLUS
:
387 case MATCH_CHAR_STAR
:
389 case MATCH_SET_STAR
: break;
390 case MATCH_SET
: last
[0] = MATCH_SET_PLUS
; break;
393 *PC
++ = (last
- PC
+ 1) / 256;
394 *PC
++ = (last
- PC
+ 1) % 256;
397 } else goto Syntax_Error
;
398 } else size
+= 3; break;
401 if (last
== NULL
) goto Syntax_Error
;
404 last
[1] = (PC
- last
+ 2) / 256;
405 last
[2] = (PC
- last
+ 2) % 256;
412 shift(start
[level
],PC
,3);
413 start
[level
][0] = OR
;
414 start
[level
][1] = (PC
- start
[level
] + 2) / 256;
415 start
[level
][2] = (PC
- start
[level
] + 2) % 256;
420 if (emit
) { last
= PC
; *PC
= OPEN
+ parenCount
; } else size
++;
421 paren
[level
] = parenCount
;
422 if (++parenCount
>= 10) goto Syntax_Error
;
423 if (++level
>= 10) goto Syntax_Error
;
428 if (--level
< 0) goto Syntax_Error
;
429 if (emit
) { *PC
++ = CLOSE
+ paren
[level
]; }
433 last
= PC
; if (emit
) *PC
++ = BEGIN_OF_LINE
; else size
++; break;
435 last
= PC
; if (emit
) *PC
++ = END_OF_LINE
; else size
++; break;
437 last
= PC
; if (emit
) *PC
++ = WILD_CARD
; else size
++; break;
440 const char * start
= p
;
442 while (M
[c
= *p
++] != META_SET_CLOSE
) {
443 if (c
== '^' && p
-1 == start
) { complement
= true; continue; }
445 case META_EOP
: goto Syntax_Error
;
446 case META_RANGE
: lastChar
= p
[-2]; break;
449 case 'n': set
+= '\n'; break;
450 case 't': set
+= '\t'; break;
454 *p
>= '0' && *p
<= '7' && p
- q
<= 2; p
++)
455 c
= c
* 8 + *p
- '0';
457 case 'd': set
+= numeric
; break;
458 case 'D': set
+= ! numeric
; break;
459 case 's': set
+= space
; break;
460 case 'S': set
+= ! space
; break;
461 case 'w': set
+= word
; break;
462 case 'W': set
+= ! word
; break;
463 default: goto Emit_set_char
;
465 default: Emit_set_char
:
467 for ( ;lastChar
<= c
; lastChar
++) set
+= lastChar
;
477 if (complement
) set
= ! set
;
480 memcpy(PC
,(const Byte
*)set
,32);
488 case 'n': c
= '\n'; goto Emit_match_char
; // new line
489 case 't': c
= '\t'; goto Emit_match_char
; // tab
492 for (q
= p
, c
-= '0'; *p
>= '0' && *p
<= '7' &&
494 c
= c
* 8 + *p
- '0';
495 goto Emit_match_char
;
497 case 'd': set
= numeric
; goto Emit_match_set
;
498 case 'D': set
= ! numeric
; goto Emit_match_set
;
499 case 'w': set
= word
; goto Emit_match_set
;
500 case 'W': set
= ! word
; goto Emit_match_set
;
501 case 's': set
= space
; goto Emit_match_set
;
502 case 'S': set
= ! space
; goto Emit_match_set
;
503 case 'b': if (emit
) *PC
++ = MATCH_BOUND
; else size
++; break;
504 case 'B': if (emit
) *PC
++ = MATCH_NON_BOUND
; else size
++; break;
505 case '<': if (emit
) *PC
++ = MATCH_WORD_BEGIN
; else size
++; break;
506 case '>': if (emit
) *PC
++ = MATCH_WORD_END
; else size
++; break;
507 default: goto Emit_match_char
;
509 case META_NONE
: Emit_match_char
:
511 if (last
&& *last
== MATCH_LITERAL
&&
512 M
[*p
] != META_STAR
&& M
[*p
] != META_PLUS
&&
513 M
[*p
] != META_OPTIONAL
) {
518 *PC
++ = MATCH_LITERAL
;
528 if (emit
) *PC
++ = SUCCESS
;
538 // Traverse the opcode and compute the minimal length of the matched text
541 int RegExp::minMatch(register const Opcode
* PC
)
542 { register int len
= 0;
545 case SUCCESS
: return len
;
546 case WILD_CARD
: len
++; break;
547 case MATCH_CHAR_STAR
: PC
+= 2; break;
548 case MATCH_CHAR_PLUS
: len
++; PC
+= 2; break;
549 case MATCH_SET_STAR
: PC
+= 32; break;
551 case MATCH_SET
: len
++; PC
+= 32; break;
552 case MATCH_LITERAL
: len
+= *PC
; PC
+= *PC
+ 1; break;
554 { int len1
= this->minMatch(PC
+ PC
[0] * 256 + PC
[1]);
555 int len2
= this->minMatch(PC
+ 2);
556 return len
+ (len1
< len2
? len1
: len2
);
558 case TIMES
: PC
+= 2; break;
559 case MATCH_BOUND
: case MATCH_NON_BOUND
:
560 case MATCH_WORD_BEGIN
: case MATCH_WORD_END
:
561 case BEGIN_OF_LINE
: case END_OF_LINE
:
562 case REF
+0: case REF
+1: case REF
+2: case REF
+3: case REF
+4:
563 case REF
+5: case REF
+6: case REF
+7: case REF
+8: case REF
+9: