make it compile with g++-3.3
[prop.git] / lib-src / automata / nfa.cc
blob8d5bea01f443dd44be9f1cf23a7551433fd44de0
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #include <ctype.h>
26 #include <string.h>
27 #include <AD/automata/nfa_node.h>
28 #include <AD/automata/nfa.h>
29 #include <AD/memory/mempool.h>
30 #include <AD/contain/fbitset.h>
31 #include <AD/strings/charesc.h>
32 #include <AD/strings/quark.h>
34 //////////////////////////////////////////////////////////////////////////////
35 // Import some type definitions
36 //////////////////////////////////////////////////////////////////////////////
37 typedef DFATables::State State;
38 typedef DFATables::Rule Rule;
39 typedef DFATables::Symbol Symbol;
41 //////////////////////////////////////////////////////////////////////////////
42 // Constructor and destructor for class NFA
43 //////////////////////////////////////////////////////////////////////////////
44 NFA:: NFA()
45 : mem(* new MemPool), own_mem(true), nfa(0), contexts(0), max_char(0),
46 number_of_states(0), is_singleton(0), nfa_states(0), context_nfas(0),
47 number_of_contexts(0), partitions(0)
48 { number_of_errors = 0; bad_rule = -1; error_message = 0; }
49 NFA:: NFA(MemPool& m)
50 : mem(m), own_mem(false), nfa(0), contexts(0), max_char(0),
51 number_of_states(0), is_singleton(0), nfa_states(0), context_nfas(0),
52 number_of_contexts(0), partitions(0)
53 { number_of_errors = 0; bad_rule = -1; error_message = 0; }
54 NFA::~NFA() { if (own_mem) delete (&mem); }
56 //////////////////////////////////////////////////////////////////////////////
57 // Error handler
58 //////////////////////////////////////////////////////////////////////////////
59 void NFA::error(const char * message)
60 { number_of_errors++;
61 error_message = message;
64 //////////////////////////////////////////////////////////////////////////////
66 // Tags for the parse stack
68 //////////////////////////////////////////////////////////////////////////////
69 #define PAREN (NFA_Node*)0
70 #define DISJUNCTION (NFA_Node*)1
71 #define CONS (NFA_Node*)2
73 //////////////////////////////////////////////////////////////////////////////
74 // Method to construct a character class for '.'
75 //////////////////////////////////////////////////////////////////////////////
76 Symbol NFA::parse_dot()
78 is_singleton['\n'] = true;
79 char_classes[0] = '\n' | COMPLEMENT;
80 char_classes[1] = END_CHAR_CLASS;
81 char_class_table[number_of_char_classes++] = char_classes;
82 char_classes += 2;
83 return -number_of_char_classes;
86 //////////////////////////////////////////////////////////////////////////////
87 // Method to parse a character class
88 //////////////////////////////////////////////////////////////////////////////
89 const char * NFA::parse_char_class(const char * p, Symbol& c)
90 { Bool is_complemented = false;
91 const char * start = p;
92 Symbol * P = char_classes;
93 Symbol * Q = 0;
94 Symbol range_begin = -1;
95 for (;;)
96 { switch (*p)
97 { case '-': { if (range_begin < 0)
98 { error ("missing lhs in '-'"); return 0; }
99 if (Q)
100 { error ("duplicated '-'"); return 0; }
101 Q = P-1; p++;
102 } break;
103 case '\0': { error ("missing ']' in character class"); return 0; }
104 case ']': { *P++ = END_CHAR_CLASS;
105 if (is_complemented)
106 char_classes[0] |= COMPLEMENT;
107 char_class_table[number_of_char_classes++]
108 = char_classes;
109 char_classes = P;
110 c = -number_of_char_classes;
111 return p+1;
113 case '^': { if (p == start)
114 { is_complemented = true; p++; break; }
115 } // falls thru
116 default: { char ch; Symbol ch2;
117 p = parse_char(p,ch);
118 if (case_insensitive) ch = toupper(ch);
119 *P++ = ch2 = (unsigned char)ch;
120 if (Q) // end of range???
121 { if (range_begin >= ch2)
122 { error ("bad range in character class"); return 0;}
123 *Q |= RANGE_BIT; range_begin = -1; Q = 0;
124 partitions[ch2+1] = true;
126 else
127 { range_begin = ch2;
128 partitions[ch2] = true;
129 if (*p != '-') is_singleton[ch2] = true;
136 //////////////////////////////////////////////////////////////////////////////
137 // Method to parse the context
138 //////////////////////////////////////////////////////////////////////////////
139 const char * NFA::parse_context(const char * p, Bool context_used[])
140 { char name[NFA::MAX_CONTEXT_LENGTH];
141 char * cursor = name;
142 for (int i = 0; i < number_of_contexts; i++)
143 context_used[i] = false;
145 for (;;)
146 { switch (*p)
147 { case ',':
148 case '>':
149 { *cursor = '\0';
150 Bool found = false;
151 if (contexts)
152 { const char * const * q;
153 int context_number = 0;
154 for (q = contexts; *q; q++, context_number++)
155 { if (strcmp(*q,name) == 0)
156 { found = context_used[context_number] = true; break; }
159 if (!found)
160 { error (Quark("undefined context <",name,">")); return 0; }
161 if (*p == '>') return p+1;
162 cursor = name;
163 p++;
164 } break;
165 case 0: { error ("missing '>' in contexts"); return 0; }
166 default: *cursor++ = *p++; break;
171 //////////////////////////////////////////////////////////////////////////////
172 // Join two nodes together
173 //////////////////////////////////////////////////////////////////////////////
174 inline NFA_Node * join (MemPool& mem, NFA_Node * a, NFA_Node * b)
175 { return a ? mkor(mem,a,b) : b; }
177 //////////////////////////////////////////////////////////////////////////////
178 // Parse a regular expression and constructor an NFA.
179 //////////////////////////////////////////////////////////////////////////////
180 NFA_Node * NFA::construct
181 ( const char *& regexp, // the regular expression string
182 NFA_Node *& new_cont // new continuation
184 { NFA_Node * root = construct_concat(regexp,new_cont);
185 for (;;)
186 { switch (*regexp)
187 { case '|':
188 { regexp++;
189 NFA_Node * cont2;
190 NFA_Node * root2 = construct(regexp,cont2);
191 if (root == 0 && root2 == 0)
192 { root = 0;
193 } else if (root == 0)
194 { NFA_Node * join = mkepsilon(mem,0);
195 NFA_Node * or_ = mkor(mem,root2,join);
196 cont2->set_out(join);
197 new_cont = join; root = or_;
198 } else if (root2 == 0)
199 { NFA_Node * join = mkepsilon(mem,0);
200 NFA_Node * or_ = mkor(mem,root,join);
201 new_cont->set_out(join);
202 new_cont = join; root = or_;
203 } else
204 { NFA_Node * or_ = mkor(mem,root,root2);
205 NFA_Node * join = mkepsilon(mem,0);
206 new_cont->set_out(join);
207 cont2->set_out(join);
208 new_cont = join; root = or_;
210 } return root;
211 case '\0':
212 case ')': { return root; }
213 default: { error("Syntax error"); return 0; }
218 //////////////////////////////////////////////////////////////////////////////
219 // Parse a concatenation
220 //////////////////////////////////////////////////////////////////////////////
221 NFA_Node * NFA::construct_concat
222 ( const char *& regexp, // the regular expression string
223 NFA_Node *& new_cont // new continuation
225 { NFA_Node * root = 0;
226 new_cont = 0;
227 for (;;)
228 { NFA_Node * cont = 0;
229 NFA_Node * one = construct_one(regexp,cont);
230 if (root)
231 { if (new_cont == 0)
232 { error ("missing ')'???"); return 0; }
233 new_cont->set_out(one);
234 new_cont = cont;
235 } else
236 { root = one; new_cont = cont;
238 switch (*regexp)
239 { case '|':
240 case ')':
241 case '\0': return root;
246 //////////////////////////////////////////////////////////////////////////////
247 // Parse one component of a regular expression
248 //////////////////////////////////////////////////////////////////////////////
249 NFA_Node * NFA::construct_one
250 ( const char *& regexp, // the regular expression string
251 NFA_Node *& new_cont // new continuation
253 { Symbol c;
254 char ch;
255 NFA_Node * root = 0;
256 new_cont = 0;
258 { switch (*regexp++)
259 { case '|':
260 case '\0': { regexp--; return root; }
261 case '(': { root = construct(regexp,new_cont);
262 if (*regexp != ')')
263 { error ("missing closing ')'"); return 0; }
264 regexp++;
265 } break;
266 case ')': { error ("missing opening '('"); return 0; }
267 case ']': { error ("missing opening '['"); return 0; }
268 case '+': { if (root == 0)
269 { error ("missing prefix for +"); return 0; }
270 NFA_Node * n = mkor(mem,root,0);
271 new_cont->set_out(n);
272 new_cont = n;
273 } break;
274 case '*': { if (root == 0)
275 { error ("missing prefix for *"); return 0; }
276 NFA_Node * n = mkor(mem,root,0);
277 new_cont->set_out(n);
278 root = new_cont = n;
279 } break;
280 case '?': { if (root == 0)
281 { error ("missing prefix for ?"); return 0; }
282 NFA_Node * n1 = mkepsilon(mem,0);
283 NFA_Node * n2 = mkor(mem,root,n1);
284 root = n2;
285 new_cont->set_out(n1);
286 new_cont = n1;
287 } break;
288 case '.': { c = parse_dot(); goto MAKE_DELTA; }
289 case '[': { regexp = parse_char_class(regexp,c);
290 if (regexp == 0) return 0; else goto MAKE_DELTA;
292 case '^': { if (regexp-1 == regexp_begin) { anchored = true; break; }
293 } // falls thru
294 default:
295 regexp = parse_char(regexp-1,ch); c = (unsigned char)ch;
296 if (case_insensitive) c = toupper(c);
297 MAKE_DELTA:
298 { if (c >= 0) is_singleton[c] = true;
299 State s = number_of_states++;
300 root = new_cont = mkdelta(mem,s,c,0);
301 } break;
303 } while (*regexp == '*' || *regexp == '+' || *regexp == '?');
304 return root;
307 //////////////////////////////////////////////////////////////////////////////
308 // Method to construct an NFA from a regular expression
309 //////////////////////////////////////////////////////////////////////////////
310 NFA_Node * NFA::parse(Rule rule, const char * regexp)
311 { NFA_Node * root, * cont; // root and pointer to next node
312 const char * p; // current pointer
313 anchored = false; // should pattern start at beginning of line?
314 Bool context_used[NFA::MAX_CONTEXTS];
315 Bool has_context = false;
317 ///////////////////////////////////////////////////////////////////////////
318 // Initialize
319 ///////////////////////////////////////////////////////////////////////////
320 p = regexp;
321 root = cont = 0;
323 ///////////////////////////////////////////////////////////////////////////
324 // Look for context (if any)
325 ///////////////////////////////////////////////////////////////////////////
326 if (*p == '<')
327 { has_context = true;
328 p = parse_context(p+1,context_used);
329 if (p == 0) goto ERROR;
330 regexp = p;
332 regexp_begin = regexp;
334 ///////////////////////////////////////////////////////////////////////////
335 // Now parse the regexp
336 ///////////////////////////////////////////////////////////////////////////
337 root = construct(regexp,cont);
338 if (*regexp != '\0') error ("syntax error");
339 if (*regexp == ')') error ("missing opening '('");
341 ///////////////////////////////////////////////////////////////////////////
342 // Create an accept node
343 ///////////////////////////////////////////////////////////////////////////
344 { NFA_Node * n = mkaccept(mem,rule);
345 if (cont) { cont->set_out(n); }
346 else { root = n; }
349 ///////////////////////////////////////////////////////////////////////////
350 // For each context in which the nfa appears, add it to the context table.
351 ///////////////////////////////////////////////////////////////////////////
352 if (! anchored) context_nfas[0] = join(mem, context_nfas[0], root);
353 context_nfas[1] = join(mem, context_nfas[1], root);
354 { for (int i = 0; i < number_of_contexts; i++)
355 { if (!has_context || context_used[i])
356 { if (! anchored)
357 { context_nfas[2 * i + 2] =
358 join(mem, context_nfas[2 * i + 2], root);
360 context_nfas[2 * i + 3] = join(mem, context_nfas[2 * i + 3], root);
365 return root;
367 ERROR: return 0;
370 //////////////////////////////////////////////////////////////////////////////
371 // Method to compile a set of regular expressions into an NFA.
372 //////////////////////////////////////////////////////////////////////////////
373 void NFA::compile (int number_of_rules,
374 const char * const * regexps,
375 const char * const * my_contexts,
376 Symbol max_character,
377 Bool is_case_insensitive
380 ///////////////////////////////////////////////////////////////////////////
381 // Initialization
382 ///////////////////////////////////////////////////////////////////////////
383 number_of_errors = 0;
384 bad_rule = -1;
385 contexts = my_contexts; // set the current set of contexts
386 max_char = max_character; // set the current maximal character
387 // in the character set.
388 case_insensitive = is_case_insensitive;
389 nfa = 0; // reset the nfa.
390 // State 0 is the error state
391 // State 1 ... number_of_rules are the accept states.
392 number_of_states = number_of_rules;
393 is_singleton = (char*)mem.c_alloc(sizeof(char) * (max_character + 1));
394 partitions = (char*)mem.c_alloc(sizeof(char) * (max_character + 2));
396 ///////////////////////////////////////////////////////////////////////////
397 // Allocate space for the character classes stuff
398 ///////////////////////////////////////////////////////////////////////////
399 number_of_char_classes = 0;
400 { int max_char_classes = 0;
401 int max_storage = 0;
402 for (int i = 0; i < number_of_rules; i++)
403 { max_storage += strlen(regexps[i]) + 1;
404 for (const char * p = regexps[i]; *p; p++)
405 { if (*p == '.' || *p == '[')
406 { max_char_classes++; max_storage++; }
409 char_classes = (Symbol*)mem.c_alloc(sizeof(Symbol) * max_storage);
410 char_class_table =
411 (Symbol**)mem.c_alloc(sizeof(Symbol*) * max_char_classes);
414 ///////////////////////////////////////////////////////////////////////////
415 // Allocate space for the context stuff
416 ///////////////////////////////////////////////////////////////////////////
417 { const char * const * p = my_contexts;
418 number_of_contexts = 0;
419 context_nfas = 0;
420 if (p) while (*p) { p++; number_of_contexts++; }
421 context_nfas =
422 (NFA_Node**)mem.c_alloc
423 (sizeof(NFA_Node*) * (2 * number_of_contexts + 2));
426 ///////////////////////////////////////////////////////////////////////////
427 // Now construct the nfa.
428 ///////////////////////////////////////////////////////////////////////////
429 for (int i = 0; i < number_of_rules; i++)
430 { NFA_Node * sub_nfa = parse(i, regexps[i]);
431 if (number_of_errors > 0 && bad_rule < 0) bad_rule = i;
432 if (sub_nfa != 0) nfa = join(mem,nfa,sub_nfa);
435 if (number_of_errors > 0) return;
437 ///////////////////////////////////////////////////////////////////////////
438 // Compute the epsilon closure
439 ///////////////////////////////////////////////////////////////////////////
440 nfa_states = (NFA_Delta**)mem.c_alloc(sizeof(NFA_Delta*) * number_of_states);
441 nfa->epsilon_closure(mem, number_of_states, number_of_rules, nfa_states);
443 ///////////////////////////////////////////////////////////////////////////
444 // Compute the equivalence class and character class maps
445 ///////////////////////////////////////////////////////////////////////////
446 compute_equiv_classes();
447 compute_char_class_map();
450 //////////////////////////////////////////////////////////////////////////////
451 // Method to compute the equivalence class map.
452 //////////////////////////////////////////////////////////////////////////////
453 void NFA::compute_equiv_classes()
454 { equiv_classes = new Symbol [max_char+1];
455 int partition_number = -1;
456 number_of_equiv_classes = 0;
457 for (int i = 0; i <= max_char; i++)
458 { if (case_insensitive && i >= 'a' && i <= 'z')
459 { equiv_classes[i] = equiv_classes[i - 'a' + 'A']; continue; }
460 if (partitions[i] || partition_number < 0)
461 { partition_number = number_of_equiv_classes++; }
462 if (is_singleton[i])
463 { equiv_classes[i] = number_of_equiv_classes++; continue; }
464 equiv_classes[i] = partition_number;
468 //////////////////////////////////////////////////////////////////////////////
470 // Compute the characteristic map for each character classes
472 //////////////////////////////////////////////////////////////////////////////
473 void NFA::compute_char_class_map()
474 { char_class_maps =
475 (FastBitSet**)mem.m_alloc(sizeof(FastBitSet*) * number_of_char_classes);
476 for (int i = 0; i < number_of_char_classes; i++)
477 { FastBitSet * set =
478 char_class_maps[i] = new (mem) FastBitSet(mem, number_of_equiv_classes);
479 const Symbol * C = char_class_table[i];
480 for (const Symbol * P = C; *P != END_CHAR_CLASS; P++)
481 { register Symbol A = *P & MASK_BITS;
482 if (*P & RANGE_BIT)
483 { for (register Symbol B = *++P & MASK_BITS; A <= B; A++)
484 set->add(equiv_classes[A]);
485 } else
486 { set->add(equiv_classes[A]);
489 if (*C & COMPLEMENT) set->complement();