lib-src/automata/nfa.cc

   1 //////////////////////////////////////////////////////////////////////////////
   2 // NOTICE:
   3 //
   4 // ADLib, Prop and their related set of tools and documentation are in the
   5 // public domain.   The author(s) of this software reserve no copyrights on
   6 // the source code and any code generated using the tools.  You are encouraged
   7 // to use ADLib and Prop to develop software, in both academic and commercial
   8 // settings, and are free to incorporate any part of ADLib and Prop into
   9 // your programs.
  10 //
  11 // Although you are under no obligation to do so, we strongly recommend that
  12 // you give away all software developed using our tools.
  13 //
  14 // We also ask that credit be given to us when ADLib and/or Prop are used in
  15 // your programs, and that this notice be preserved intact in all the source
  16 // code.
  17 //
  18 // This software is still under development and we welcome any suggestions
  19 // and help from the users.
  20 //
  21 // Allen Leung
  22 // 1994
  23 //////////////////////////////////////////////////////////////////////////////
  24
  25 #include <ctype.h>
  26 #include <string.h>
  27 #include <AD/automata/nfa_node.h>
  28 #include <AD/automata/nfa.h>
  29 #include <AD/memory/mempool.h>
  30 #include <AD/contain/fbitset.h>
  31 #include <AD/strings/charesc.h>
  32 #include <AD/strings/quark.h>
  33
  34 //////////////////////////////////////////////////////////////////////////////
  35 //  Import some type definitions
  36 //////////////////////////////////////////////////////////////////////////////
  37 typedef DFATables::State  State;
  38 typedef DFATables::Rule   Rule;
  39 typedef DFATables::Symbol Symbol;
  40
  41 //////////////////////////////////////////////////////////////////////////////
  42 //  Constructor and destructor for class NFA
  43 //////////////////////////////////////////////////////////////////////////////
  44 NFA:: NFA()
  45    : mem(* new MemPool), own_mem(true), nfa(0), contexts(0), max_char(0),
  46      number_of_states(0), is_singleton(0), nfa_states(0), context_nfas(0),
  47      number_of_contexts(0), partitions(0)
  48      { number_of_errors = 0; bad_rule = -1; error_message = 0; }
  49 NFA:: NFA(MemPool& m)
  50    : mem(m), own_mem(false), nfa(0), contexts(0), max_char(0),
  51      number_of_states(0), is_singleton(0), nfa_states(0), context_nfas(0),
  52      number_of_contexts(0), partitions(0)
  53      { number_of_errors = 0; bad_rule = -1; error_message = 0; }
  54 NFA::~NFA() { if (own_mem) delete (&mem); }
  55
  56 //////////////////////////////////////////////////////////////////////////////
  57 //  Error handler
  58 //////////////////////////////////////////////////////////////////////////////
  59 void NFA::error(const char * message)
  60 {  number_of_errors++;
  61    error_message = message;
  62 }
  63
  64 //////////////////////////////////////////////////////////////////////////////
  65 //
  66 //  Tags for the parse stack
  67 //
  68 //////////////////////////////////////////////////////////////////////////////
  69 #define PAREN       (NFA_Node*)0
  70 #define DISJUNCTION (NFA_Node*)1
  71 #define CONS        (NFA_Node*)2
  72
  73 //////////////////////////////////////////////////////////////////////////////
  74 // Method to construct a character class for '.'
  75 //////////////////////////////////////////////////////////////////////////////
  76 Symbol NFA::parse_dot()
  77 {
  78    is_singleton['\n'] = true;
  79    char_classes[0] = '\n' | COMPLEMENT;
  80    char_classes[1] = END_CHAR_CLASS;
  81    char_class_table[number_of_char_classes++] = char_classes;
  82    char_classes += 2;
  83    return -number_of_char_classes;
  84 }
  85
  86 //////////////////////////////////////////////////////////////////////////////
  87 // Method to parse a character class
  88 //////////////////////////////////////////////////////////////////////////////
  89 const char * NFA::parse_char_class(const char * p, Symbol& c)
  90 {  Bool is_complemented = false;
  91    const char * start = p;
  92    Symbol * P = char_classes;
  93    Symbol * Q = 0;
  94    Symbol range_begin = -1;
  95    for (;;)
  96    {  switch (*p)
  97       {  case '-':  {  if (range_begin < 0)
  98                        {  error ("missing lhs in '-'"); return 0; }
  99                        if (Q)
 100                        {  error ("duplicated '-'"); return 0; }
 101                        Q = P-1; p++;
 102                     } break;
 103          case '\0': {  error ("missing ']' in character class"); return 0; }
 104          case ']':  {  *P++ = END_CHAR_CLASS;
 105                        if (is_complemented)
 106                            char_classes[0] |= COMPLEMENT;
 107                        char_class_table[number_of_char_classes++]
 108                           = char_classes;
 109                        char_classes = P;
 110                        c = -number_of_char_classes;
 111                        return p+1;
 112                     }
 113          case '^':  {  if (p == start)
 114                        { is_complemented = true; p++; break; }
 115                     }  // falls thru
 116          default:   {  char ch; Symbol ch2;
 117                        p = parse_char(p,ch);
 118                        if (case_insensitive) ch = toupper(ch);
 119                        *P++ = ch2 = (unsigned char)ch;
 120                        if (Q)   // end of range???
 121                        {  if (range_begin >= ch2)
 122                           {  error ("bad range in character class"); return 0;}
 123                           *Q |= RANGE_BIT; range_begin = -1; Q = 0;
 124                           partitions[ch2+1] = true;
 125                        }
 126                        else
 127                        {  range_begin = ch2;
 128                           partitions[ch2] = true;
 129                           if (*p != '-') is_singleton[ch2] = true;
 130                        }
 131                     }
 132       }
 133    }
 134 }
 135
 136 //////////////////////////////////////////////////////////////////////////////
 137 // Method to parse the context
 138 //////////////////////////////////////////////////////////////////////////////
 139 const char * NFA::parse_context(const char * p, Bool context_used[])
 140 {  char name[NFA::MAX_CONTEXT_LENGTH];
 141    char * cursor = name;
 142    for (int i = 0; i < number_of_contexts; i++)
 143       context_used[i] = false;
 144
 145    for (;;)
 146    {  switch (*p)
 147       {  case ',':
 148          case '>':
 149          {  *cursor = '\0';
 150             Bool found = false;
 151             if (contexts)
 152             {  const char * const * q;
 153                int context_number = 0;
 154                for (q = contexts; *q; q++, context_number++)
 155                {  if (strcmp(*q,name) == 0)
 156                   { found = context_used[context_number] = true; break; }
 157                }
 158             }
 159             if (!found)
 160             {  error (Quark("undefined context <",name,">")); return 0; }
 161             if (*p == '>') return p+1;
 162             cursor = name;
 163             p++;
 164          }  break;
 165          case 0:   { error ("missing '>' in contexts"); return 0; }
 166          default:  *cursor++ = *p++; break;
 167       }
 168    }
 169 }
 170
 171 //////////////////////////////////////////////////////////////////////////////
 172 //  Join two nodes together
 173 //////////////////////////////////////////////////////////////////////////////
 174 inline NFA_Node * join (MemPool& mem, NFA_Node * a, NFA_Node * b)
 175 {  return a ? mkor(mem,a,b) : b; }
 176
 177 //////////////////////////////////////////////////////////////////////////////
 178 //  Parse a regular expression and constructor an NFA.
 179 //////////////////////////////////////////////////////////////////////////////
 180 NFA_Node * NFA::construct
 181    ( const char *& regexp,   // the regular expression string
 182      NFA_Node *&   new_cont  // new continuation
 183    )
 184 {  NFA_Node * root = construct_concat(regexp,new_cont);
 185    for (;;)
 186    {  switch (*regexp)
 187       {  case '|':
 188          {  regexp++;
 189             NFA_Node * cont2;
 190             NFA_Node * root2 = construct(regexp,cont2);
 191             if (root == 0 && root2 == 0)
 192             {  root = 0;
 193             } else if (root == 0)
 194             {  NFA_Node * join = mkepsilon(mem,0);
 195                NFA_Node * or_   = mkor(mem,root2,join);
 196                cont2->set_out(join);
 197                new_cont = join; root = or_;
 198             } else if (root2 == 0)
 199             {  NFA_Node * join = mkepsilon(mem,0);
 200                NFA_Node * or_   = mkor(mem,root,join);
 201                new_cont->set_out(join);
 202                new_cont = join; root = or_;
 203             } else
 204             {  NFA_Node * or_ = mkor(mem,root,root2);
 205                NFA_Node * join = mkepsilon(mem,0);
 206                new_cont->set_out(join);
 207                cont2->set_out(join);
 208                new_cont = join; root = or_;
 209             }
 210          }  return root;
 211          case '\0':
 212          case ')':  { return root; }
 213          default:   { error("Syntax error"); return 0; }
 214       }
 215    }
 216 }
 217
 218 //////////////////////////////////////////////////////////////////////////////
 219 //  Parse a concatenation
 220 //////////////////////////////////////////////////////////////////////////////
 221 NFA_Node * NFA::construct_concat
 222    ( const char *& regexp,   // the regular expression string
 223      NFA_Node *&   new_cont  // new continuation
 224    )
 225 {  NFA_Node * root = 0;
 226    new_cont = 0;
 227    for (;;)
 228    {  NFA_Node * cont = 0;
 229       NFA_Node * one  = construct_one(regexp,cont);
 230       if (root)
 231       {  if (new_cont == 0)
 232          { error ("missing ')'???"); return 0; }
 233          new_cont->set_out(one);
 234          new_cont = cont;
 235       } else
 236       {  root = one; new_cont = cont;
 237       }
 238       switch (*regexp)
 239       {  case '|':
 240          case ')':
 241          case '\0':  return root;
 242       }
 243    }
 244 }
 245
 246 //////////////////////////////////////////////////////////////////////////////
 247 //  Parse one component of a regular expression
 248 //////////////////////////////////////////////////////////////////////////////
 249 NFA_Node * NFA::construct_one
 250    ( const char *& regexp,   // the regular expression string
 251      NFA_Node *&   new_cont  // new continuation
 252    )
 253 {  Symbol c;
 254    char ch;
 255    NFA_Node * root = 0;
 256    new_cont = 0;
 257    do
 258    {  switch (*regexp++)
 259       {  case '|':
 260          case '\0': {  regexp--; return root; }
 261          case '(':  {  root = construct(regexp,new_cont);
 262                        if (*regexp != ')')
 263                        { error ("missing closing ')'"); return 0; }
 264                        regexp++;
 265                     }  break;
 266          case ')':  {  error ("missing opening '('"); return 0; }
 267          case ']':  {  error ("missing opening '['"); return 0; }
 268          case '+':  {  if (root == 0)
 269                        { error ("missing prefix for +"); return 0; }
 270                        NFA_Node * n = mkor(mem,root,0);
 271                        new_cont->set_out(n);
 272                        new_cont = n;
 273                     }  break;
 274          case '*':  {  if (root == 0)
 275                        { error ("missing prefix for *"); return 0; }
 276                        NFA_Node * n = mkor(mem,root,0);
 277                        new_cont->set_out(n);
 278                        root = new_cont = n;
 279                     }  break;
 280          case '?':  {  if (root == 0)
 281                        { error ("missing prefix for ?"); return 0; }
 282                        NFA_Node * n1 = mkepsilon(mem,0);
 283                        NFA_Node * n2 = mkor(mem,root,n1);
 284                        root     = n2;
 285                        new_cont->set_out(n1);
 286                        new_cont = n1;
 287                     }  break;
 288          case '.': {  c = parse_dot(); goto MAKE_DELTA; }
 289          case '[': {  regexp = parse_char_class(regexp,c);
 290                       if (regexp == 0) return 0; else goto MAKE_DELTA;
 291                    }
 292          case '^': {  if (regexp-1 == regexp_begin) { anchored = true; break; }
 293                    } // falls thru
 294          default:
 295             regexp = parse_char(regexp-1,ch); c = (unsigned char)ch;
 296             if (case_insensitive) c = toupper(c);
 297          MAKE_DELTA:
 298          {  if (c >= 0) is_singleton[c] = true;
 299             State s = number_of_states++;
 300             root = new_cont = mkdelta(mem,s,c,0);
 301          }  break;
 302       }
 303    } while (*regexp == '*' || *regexp == '+' || *regexp == '?');
 304    return root;
 305 }
 306
 307 //////////////////////////////////////////////////////////////////////////////
 308 //  Method to construct an NFA from a regular expression
 309 //////////////////////////////////////////////////////////////////////////////
 310 NFA_Node * NFA::parse(Rule rule, const char * regexp)
 311 {  NFA_Node * root, * cont;  // root and pointer to next node
 312    const char * p;           // current pointer
 313    anchored = false;         // should pattern start at beginning of line?
 314    Bool context_used[NFA::MAX_CONTEXTS];
 315    Bool has_context = false;
 316
 317    ///////////////////////////////////////////////////////////////////////////
 318    //  Initialize
 319    ///////////////////////////////////////////////////////////////////////////
 320    p    = regexp;
 321    root = cont = 0;
 322
 323    ///////////////////////////////////////////////////////////////////////////
 324    //  Look for context (if any)
 325    ///////////////////////////////////////////////////////////////////////////
 326    if (*p == '<')
 327    {  has_context = true;
 328       p = parse_context(p+1,context_used);
 329       if (p == 0) goto ERROR;
 330       regexp = p;
 331    }
 332    regexp_begin = regexp;
 333
 334    ///////////////////////////////////////////////////////////////////////////
 335    //  Now parse the regexp
 336    ///////////////////////////////////////////////////////////////////////////
 337    root = construct(regexp,cont);
 338    if (*regexp != '\0') error ("syntax error");
 339    if (*regexp == ')')  error ("missing opening '('");
 340
 341    ///////////////////////////////////////////////////////////////////////////
 342    // Create an accept node
 343    ///////////////////////////////////////////////////////////////////////////
 344    {  NFA_Node * n = mkaccept(mem,rule);
 345       if (cont) { cont->set_out(n); }
 346       else { root = n; }
 347    }
 348
 349    ///////////////////////////////////////////////////////////////////////////
 350    // For each context in which the nfa appears, add it to the context table.
 351    ///////////////////////////////////////////////////////////////////////////
 352    if (! anchored) context_nfas[0] = join(mem, context_nfas[0], root);
 353    context_nfas[1] = join(mem, context_nfas[1], root);
 354    {  for (int i = 0; i < number_of_contexts; i++)
 355       {  if (!has_context || context_used[i])
 356          {  if (! anchored)
 357             {  context_nfas[2 * i + 2] =
 358                   join(mem, context_nfas[2 * i + 2], root);
 359             }
 360             context_nfas[2 * i + 3] = join(mem, context_nfas[2 * i + 3], root);
 361          }
 362       }
 363    }
 364
 365    return root;
 366
 367 ERROR: return 0;
 368 }
 369
 370 //////////////////////////////////////////////////////////////////////////////
 371 //  Method to compile a set of regular expressions into an NFA.
 372 //////////////////////////////////////////////////////////////////////////////
 373 void NFA::compile (int                  number_of_rules,
 374                    const char * const * regexps,
 375                    const char * const * my_contexts,
 376                    Symbol               max_character,
 377                    Bool                 is_case_insensitive
 378                   )
 379 {
 380    ///////////////////////////////////////////////////////////////////////////
 381    // Initialization
 382    ///////////////////////////////////////////////////////////////////////////
 383    number_of_errors = 0;
 384    bad_rule         = -1;
 385    contexts = my_contexts;   // set the current set of contexts
 386    max_char = max_character; // set the current maximal character
 387                              // in the character set.
 388    case_insensitive = is_case_insensitive;
 389    nfa = 0;                  // reset the nfa.
 390    //  State 0 is the error state
 391    //  State 1 ... number_of_rules are the accept states.
 392    number_of_states = number_of_rules;
 393    is_singleton = (char*)mem.c_alloc(sizeof(char) * (max_character + 1));
 394    partitions   = (char*)mem.c_alloc(sizeof(char) * (max_character + 2));
 395
 396    ///////////////////////////////////////////////////////////////////////////
 397    //  Allocate space for the character classes stuff
 398    ///////////////////////////////////////////////////////////////////////////
 399    number_of_char_classes = 0;
 400    {  int max_char_classes = 0;
 401       int max_storage      = 0;
 402       for (int i = 0; i < number_of_rules; i++)
 403       {  max_storage += strlen(regexps[i]) + 1;
 404          for (const char * p = regexps[i]; *p; p++)
 405          {  if (*p == '.' || *p == '[')
 406             {  max_char_classes++; max_storage++; }
 407          }
 408       }
 409       char_classes = (Symbol*)mem.c_alloc(sizeof(Symbol) * max_storage);
 410       char_class_table =
 411         (Symbol**)mem.c_alloc(sizeof(Symbol*) * max_char_classes);
 412    }
 413
 414    ///////////////////////////////////////////////////////////////////////////
 415    //  Allocate space for the context stuff
 416    ///////////////////////////////////////////////////////////////////////////
 417    {  const char * const * p = my_contexts;
 418       number_of_contexts = 0;
 419       context_nfas = 0;
 420       if (p) while (*p) { p++; number_of_contexts++; }
 421       context_nfas =
 422         (NFA_Node**)mem.c_alloc
 423            (sizeof(NFA_Node*) * (2 * number_of_contexts + 2));
 424    }
 425
 426    ///////////////////////////////////////////////////////////////////////////
 427    //  Now construct the nfa.
 428    ///////////////////////////////////////////////////////////////////////////
 429    for (int i = 0; i < number_of_rules; i++)
 430    {  NFA_Node * sub_nfa = parse(i, regexps[i]);
 431       if (number_of_errors > 0 && bad_rule < 0) bad_rule = i;
 432       if (sub_nfa != 0) nfa = join(mem,nfa,sub_nfa);
 433    }
 434
 435    if (number_of_errors > 0) return;
 436
 437    ///////////////////////////////////////////////////////////////////////////
 438    //  Compute the epsilon closure
 439    ///////////////////////////////////////////////////////////////////////////
 440    nfa_states = (NFA_Delta**)mem.c_alloc(sizeof(NFA_Delta*) * number_of_states);
 441    nfa->epsilon_closure(mem, number_of_states, number_of_rules, nfa_states);
 442
 443    ///////////////////////////////////////////////////////////////////////////
 444    //  Compute the equivalence class and character class maps
 445    ///////////////////////////////////////////////////////////////////////////
 446    compute_equiv_classes();
 447    compute_char_class_map();
 448 }
 449
 450 //////////////////////////////////////////////////////////////////////////////
 451 //  Method to compute the equivalence class map.
 452 //////////////////////////////////////////////////////////////////////////////
 453 void NFA::compute_equiv_classes()
 454 {  equiv_classes = new Symbol [max_char+1];
 455    int partition_number = -1;
 456    number_of_equiv_classes = 0;
 457    for (int i = 0; i <= max_char; i++)
 458    {  if (case_insensitive && i >= 'a' && i <= 'z')
 459       {  equiv_classes[i] = equiv_classes[i - 'a' + 'A']; continue; }
 460       if (partitions[i] || partition_number < 0)
 461       {  partition_number = number_of_equiv_classes++; }
 462       if (is_singleton[i])
 463       { equiv_classes[i] = number_of_equiv_classes++; continue; }
 464       equiv_classes[i] = partition_number;
 465    }
 466 }
 467
 468 //////////////////////////////////////////////////////////////////////////////
 469 //
 470 //  Compute the characteristic map for each character classes
 471 //
 472 //////////////////////////////////////////////////////////////////////////////
 473 void NFA::compute_char_class_map()
 474 {  char_class_maps =
 475       (FastBitSet**)mem.m_alloc(sizeof(FastBitSet*) * number_of_char_classes);
 476    for (int i = 0; i < number_of_char_classes; i++)
 477    {  FastBitSet * set =
 478          char_class_maps[i] = new (mem) FastBitSet(mem, number_of_equiv_classes);
 479       const Symbol * C = char_class_table[i];
 480       for (const Symbol * P = C; *P != END_CHAR_CLASS; P++)
 481       {  register Symbol A = *P & MASK_BITS;
 482          if (*P & RANGE_BIT)
 483          {  for (register Symbol B = *++P & MASK_BITS; A <= B; A++)
 484                set->add(equiv_classes[A]);
 485          } else
 486          {  set->add(equiv_classes[A]);
 487          }
 488       }
 489       if (*C & COMPLEMENT) set->complement();
 490    }
 491 }