1 //////////////////////////////////////////////////////////////////////////////
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
23 //////////////////////////////////////////////////////////////////////////////
27 #include <AD/automata/nfa_node.h>
28 #include <AD/automata/nfa.h>
29 #include <AD/memory/mempool.h>
30 #include <AD/contain/fbitset.h>
31 #include <AD/strings/charesc.h>
32 #include <AD/strings/quark.h>
34 //////////////////////////////////////////////////////////////////////////////
35 // Import some type definitions
36 //////////////////////////////////////////////////////////////////////////////
37 typedef DFATables::State State
;
38 typedef DFATables::Rule Rule
;
39 typedef DFATables::Symbol Symbol
;
41 //////////////////////////////////////////////////////////////////////////////
42 // Constructor and destructor for class NFA
43 //////////////////////////////////////////////////////////////////////////////
45 : mem(* new MemPool
), own_mem(true), nfa(0), contexts(0), max_char(0),
46 number_of_states(0), is_singleton(0), nfa_states(0), context_nfas(0),
47 number_of_contexts(0), partitions(0)
48 { number_of_errors
= 0; bad_rule
= -1; error_message
= 0; }
50 : mem(m
), own_mem(false), nfa(0), contexts(0), max_char(0),
51 number_of_states(0), is_singleton(0), nfa_states(0), context_nfas(0),
52 number_of_contexts(0), partitions(0)
53 { number_of_errors
= 0; bad_rule
= -1; error_message
= 0; }
54 NFA::~NFA() { if (own_mem
) delete (&mem
); }
56 //////////////////////////////////////////////////////////////////////////////
58 //////////////////////////////////////////////////////////////////////////////
59 void NFA::error(const char * message
)
61 error_message
= message
;
64 //////////////////////////////////////////////////////////////////////////////
66 // Tags for the parse stack
68 //////////////////////////////////////////////////////////////////////////////
69 #define PAREN (NFA_Node*)0
70 #define DISJUNCTION (NFA_Node*)1
71 #define CONS (NFA_Node*)2
73 //////////////////////////////////////////////////////////////////////////////
74 // Method to construct a character class for '.'
75 //////////////////////////////////////////////////////////////////////////////
76 Symbol
NFA::parse_dot()
78 is_singleton
['\n'] = true;
79 char_classes
[0] = '\n' | COMPLEMENT
;
80 char_classes
[1] = END_CHAR_CLASS
;
81 char_class_table
[number_of_char_classes
++] = char_classes
;
83 return -number_of_char_classes
;
86 //////////////////////////////////////////////////////////////////////////////
87 // Method to parse a character class
88 //////////////////////////////////////////////////////////////////////////////
89 const char * NFA::parse_char_class(const char * p
, Symbol
& c
)
90 { Bool is_complemented
= false;
91 const char * start
= p
;
92 Symbol
* P
= char_classes
;
94 Symbol range_begin
= -1;
97 { case '-': { if (range_begin
< 0)
98 { error ("missing lhs in '-'"); return 0; }
100 { error ("duplicated '-'"); return 0; }
103 case '\0': { error ("missing ']' in character class"); return 0; }
104 case ']': { *P
++ = END_CHAR_CLASS
;
106 char_classes
[0] |= COMPLEMENT
;
107 char_class_table
[number_of_char_classes
++]
110 c
= -number_of_char_classes
;
113 case '^': { if (p
== start
)
114 { is_complemented
= true; p
++; break; }
116 default: { char ch
; Symbol ch2
;
117 p
= parse_char(p
,ch
);
118 if (case_insensitive
) ch
= toupper(ch
);
119 *P
++ = ch2
= (unsigned char)ch
;
120 if (Q
) // end of range???
121 { if (range_begin
>= ch2
)
122 { error ("bad range in character class"); return 0;}
123 *Q
|= RANGE_BIT
; range_begin
= -1; Q
= 0;
124 partitions
[ch2
+1] = true;
128 partitions
[ch2
] = true;
129 if (*p
!= '-') is_singleton
[ch2
] = true;
136 //////////////////////////////////////////////////////////////////////////////
137 // Method to parse the context
138 //////////////////////////////////////////////////////////////////////////////
139 const char * NFA::parse_context(const char * p
, Bool context_used
[])
140 { char name
[NFA::MAX_CONTEXT_LENGTH
];
141 char * cursor
= name
;
142 for (int i
= 0; i
< number_of_contexts
; i
++)
143 context_used
[i
] = false;
152 { const char * const * q
;
153 int context_number
= 0;
154 for (q
= contexts
; *q
; q
++, context_number
++)
155 { if (strcmp(*q
,name
) == 0)
156 { found
= context_used
[context_number
] = true; break; }
160 { error (Quark("undefined context <",name
,">")); return 0; }
161 if (*p
== '>') return p
+1;
165 case 0: { error ("missing '>' in contexts"); return 0; }
166 default: *cursor
++ = *p
++; break;
171 //////////////////////////////////////////////////////////////////////////////
172 // Join two nodes together
173 //////////////////////////////////////////////////////////////////////////////
174 inline NFA_Node
* join (MemPool
& mem
, NFA_Node
* a
, NFA_Node
* b
)
175 { return a
? mkor(mem
,a
,b
) : b
; }
177 //////////////////////////////////////////////////////////////////////////////
178 // Parse a regular expression and constructor an NFA.
179 //////////////////////////////////////////////////////////////////////////////
180 NFA_Node
* NFA::construct
181 ( const char *& regexp
, // the regular expression string
182 NFA_Node
*& new_cont
// new continuation
184 { NFA_Node
* root
= construct_concat(regexp
,new_cont
);
190 NFA_Node
* root2
= construct(regexp
,cont2
);
191 if (root
== 0 && root2
== 0)
193 } else if (root
== 0)
194 { NFA_Node
* join
= mkepsilon(mem
,0);
195 NFA_Node
* or_
= mkor(mem
,root2
,join
);
196 cont2
->set_out(join
);
197 new_cont
= join
; root
= or_
;
198 } else if (root2
== 0)
199 { NFA_Node
* join
= mkepsilon(mem
,0);
200 NFA_Node
* or_
= mkor(mem
,root
,join
);
201 new_cont
->set_out(join
);
202 new_cont
= join
; root
= or_
;
204 { NFA_Node
* or_
= mkor(mem
,root
,root2
);
205 NFA_Node
* join
= mkepsilon(mem
,0);
206 new_cont
->set_out(join
);
207 cont2
->set_out(join
);
208 new_cont
= join
; root
= or_
;
212 case ')': { return root
; }
213 default: { error("Syntax error"); return 0; }
218 //////////////////////////////////////////////////////////////////////////////
219 // Parse a concatenation
220 //////////////////////////////////////////////////////////////////////////////
221 NFA_Node
* NFA::construct_concat
222 ( const char *& regexp
, // the regular expression string
223 NFA_Node
*& new_cont
// new continuation
225 { NFA_Node
* root
= 0;
228 { NFA_Node
* cont
= 0;
229 NFA_Node
* one
= construct_one(regexp
,cont
);
232 { error ("missing ')'???"); return 0; }
233 new_cont
->set_out(one
);
236 { root
= one
; new_cont
= cont
;
241 case '\0': return root
;
246 //////////////////////////////////////////////////////////////////////////////
247 // Parse one component of a regular expression
248 //////////////////////////////////////////////////////////////////////////////
249 NFA_Node
* NFA::construct_one
250 ( const char *& regexp
, // the regular expression string
251 NFA_Node
*& new_cont
// new continuation
260 case '\0': { regexp
--; return root
; }
261 case '(': { root
= construct(regexp
,new_cont
);
263 { error ("missing closing ')'"); return 0; }
266 case ')': { error ("missing opening '('"); return 0; }
267 case ']': { error ("missing opening '['"); return 0; }
268 case '+': { if (root
== 0)
269 { error ("missing prefix for +"); return 0; }
270 NFA_Node
* n
= mkor(mem
,root
,0);
271 new_cont
->set_out(n
);
274 case '*': { if (root
== 0)
275 { error ("missing prefix for *"); return 0; }
276 NFA_Node
* n
= mkor(mem
,root
,0);
277 new_cont
->set_out(n
);
280 case '?': { if (root
== 0)
281 { error ("missing prefix for ?"); return 0; }
282 NFA_Node
* n1
= mkepsilon(mem
,0);
283 NFA_Node
* n2
= mkor(mem
,root
,n1
);
285 new_cont
->set_out(n1
);
288 case '.': { c
= parse_dot(); goto MAKE_DELTA
; }
289 case '[': { regexp
= parse_char_class(regexp
,c
);
290 if (regexp
== 0) return 0; else goto MAKE_DELTA
;
292 case '^': { if (regexp
-1 == regexp_begin
) { anchored
= true; break; }
295 regexp
= parse_char(regexp
-1,ch
); c
= (unsigned char)ch
;
296 if (case_insensitive
) c
= toupper(c
);
298 { if (c
>= 0) is_singleton
[c
] = true;
299 State s
= number_of_states
++;
300 root
= new_cont
= mkdelta(mem
,s
,c
,0);
303 } while (*regexp
== '*' || *regexp
== '+' || *regexp
== '?');
307 //////////////////////////////////////////////////////////////////////////////
308 // Method to construct an NFA from a regular expression
309 //////////////////////////////////////////////////////////////////////////////
310 NFA_Node
* NFA::parse(Rule rule
, const char * regexp
)
311 { NFA_Node
* root
, * cont
; // root and pointer to next node
312 const char * p
; // current pointer
313 anchored
= false; // should pattern start at beginning of line?
314 Bool context_used
[NFA::MAX_CONTEXTS
];
315 Bool has_context
= false;
317 ///////////////////////////////////////////////////////////////////////////
319 ///////////////////////////////////////////////////////////////////////////
323 ///////////////////////////////////////////////////////////////////////////
324 // Look for context (if any)
325 ///////////////////////////////////////////////////////////////////////////
327 { has_context
= true;
328 p
= parse_context(p
+1,context_used
);
329 if (p
== 0) goto ERROR
;
332 regexp_begin
= regexp
;
334 ///////////////////////////////////////////////////////////////////////////
335 // Now parse the regexp
336 ///////////////////////////////////////////////////////////////////////////
337 root
= construct(regexp
,cont
);
338 if (*regexp
!= '\0') error ("syntax error");
339 if (*regexp
== ')') error ("missing opening '('");
341 ///////////////////////////////////////////////////////////////////////////
342 // Create an accept node
343 ///////////////////////////////////////////////////////////////////////////
344 { NFA_Node
* n
= mkaccept(mem
,rule
);
345 if (cont
) { cont
->set_out(n
); }
349 ///////////////////////////////////////////////////////////////////////////
350 // For each context in which the nfa appears, add it to the context table.
351 ///////////////////////////////////////////////////////////////////////////
352 if (! anchored
) context_nfas
[0] = join(mem
, context_nfas
[0], root
);
353 context_nfas
[1] = join(mem
, context_nfas
[1], root
);
354 { for (int i
= 0; i
< number_of_contexts
; i
++)
355 { if (!has_context
|| context_used
[i
])
357 { context_nfas
[2 * i
+ 2] =
358 join(mem
, context_nfas
[2 * i
+ 2], root
);
360 context_nfas
[2 * i
+ 3] = join(mem
, context_nfas
[2 * i
+ 3], root
);
370 //////////////////////////////////////////////////////////////////////////////
371 // Method to compile a set of regular expressions into an NFA.
372 //////////////////////////////////////////////////////////////////////////////
373 void NFA::compile (int number_of_rules
,
374 const char * const * regexps
,
375 const char * const * my_contexts
,
376 Symbol max_character
,
377 Bool is_case_insensitive
380 ///////////////////////////////////////////////////////////////////////////
382 ///////////////////////////////////////////////////////////////////////////
383 number_of_errors
= 0;
385 contexts
= my_contexts
; // set the current set of contexts
386 max_char
= max_character
; // set the current maximal character
387 // in the character set.
388 case_insensitive
= is_case_insensitive
;
389 nfa
= 0; // reset the nfa.
390 // State 0 is the error state
391 // State 1 ... number_of_rules are the accept states.
392 number_of_states
= number_of_rules
;
393 is_singleton
= (char*)mem
.c_alloc(sizeof(char) * (max_character
+ 1));
394 partitions
= (char*)mem
.c_alloc(sizeof(char) * (max_character
+ 2));
396 ///////////////////////////////////////////////////////////////////////////
397 // Allocate space for the character classes stuff
398 ///////////////////////////////////////////////////////////////////////////
399 number_of_char_classes
= 0;
400 { int max_char_classes
= 0;
402 for (int i
= 0; i
< number_of_rules
; i
++)
403 { max_storage
+= strlen(regexps
[i
]) + 1;
404 for (const char * p
= regexps
[i
]; *p
; p
++)
405 { if (*p
== '.' || *p
== '[')
406 { max_char_classes
++; max_storage
++; }
409 char_classes
= (Symbol
*)mem
.c_alloc(sizeof(Symbol
) * max_storage
);
411 (Symbol
**)mem
.c_alloc(sizeof(Symbol
*) * max_char_classes
);
414 ///////////////////////////////////////////////////////////////////////////
415 // Allocate space for the context stuff
416 ///////////////////////////////////////////////////////////////////////////
417 { const char * const * p
= my_contexts
;
418 number_of_contexts
= 0;
420 if (p
) while (*p
) { p
++; number_of_contexts
++; }
422 (NFA_Node
**)mem
.c_alloc
423 (sizeof(NFA_Node
*) * (2 * number_of_contexts
+ 2));
426 ///////////////////////////////////////////////////////////////////////////
427 // Now construct the nfa.
428 ///////////////////////////////////////////////////////////////////////////
429 for (int i
= 0; i
< number_of_rules
; i
++)
430 { NFA_Node
* sub_nfa
= parse(i
, regexps
[i
]);
431 if (number_of_errors
> 0 && bad_rule
< 0) bad_rule
= i
;
432 if (sub_nfa
!= 0) nfa
= join(mem
,nfa
,sub_nfa
);
435 if (number_of_errors
> 0) return;
437 ///////////////////////////////////////////////////////////////////////////
438 // Compute the epsilon closure
439 ///////////////////////////////////////////////////////////////////////////
440 nfa_states
= (NFA_Delta
**)mem
.c_alloc(sizeof(NFA_Delta
*) * number_of_states
);
441 nfa
->epsilon_closure(mem
, number_of_states
, number_of_rules
, nfa_states
);
443 ///////////////////////////////////////////////////////////////////////////
444 // Compute the equivalence class and character class maps
445 ///////////////////////////////////////////////////////////////////////////
446 compute_equiv_classes();
447 compute_char_class_map();
450 //////////////////////////////////////////////////////////////////////////////
451 // Method to compute the equivalence class map.
452 //////////////////////////////////////////////////////////////////////////////
453 void NFA::compute_equiv_classes()
454 { equiv_classes
= new Symbol
[max_char
+1];
455 int partition_number
= -1;
456 number_of_equiv_classes
= 0;
457 for (int i
= 0; i
<= max_char
; i
++)
458 { if (case_insensitive
&& i
>= 'a' && i
<= 'z')
459 { equiv_classes
[i
] = equiv_classes
[i
- 'a' + 'A']; continue; }
460 if (partitions
[i
] || partition_number
< 0)
461 { partition_number
= number_of_equiv_classes
++; }
463 { equiv_classes
[i
] = number_of_equiv_classes
++; continue; }
464 equiv_classes
[i
] = partition_number
;
468 //////////////////////////////////////////////////////////////////////////////
470 // Compute the characteristic map for each character classes
472 //////////////////////////////////////////////////////////////////////////////
473 void NFA::compute_char_class_map()
475 (FastBitSet
**)mem
.m_alloc(sizeof(FastBitSet
*) * number_of_char_classes
);
476 for (int i
= 0; i
< number_of_char_classes
; i
++)
478 char_class_maps
[i
] = new (mem
) FastBitSet(mem
, number_of_equiv_classes
);
479 const Symbol
* C
= char_class_table
[i
];
480 for (const Symbol
* P
= C
; *P
!= END_CHAR_CLASS
; P
++)
481 { register Symbol A
= *P
& MASK_BITS
;
483 { for (register Symbol B
= *++P
& MASK_BITS
; A
<= B
; A
++)
484 set
->add(equiv_classes
[A
]);
486 { set
->add(equiv_classes
[A
]);
489 if (*C
& COMPLEMENT
) set
->complement();