lib-src/strings/regexp.cc

   1 //////////////////////////////////////////////////////////////////////////////
   2 // NOTICE:
   3 //
   4 // ADLib, Prop and their related set of tools and documentation are in the
   5 // public domain.   The author(s) of this software reserve no copyrights on
   6 // the source code and any code generated using the tools.  You are encouraged
   7 // to use ADLib and Prop to develop software, in both academic and commercial
   8 // settings, and are free to incorporate any part of ADLib and Prop into
   9 // your programs.
  10 //
  11 // Although you are under no obligation to do so, we strongly recommend that
  12 // you give away all software developed using our tools.
  13 //
  14 // We also ask that credit be given to us when ADLib and/or Prop are used in
  15 // your programs, and that this notice be preserved intact in all the source
  16 // code.
  17 //
  18 // This software is still under development and we welcome any suggestions
  19 // and help from the users.
  20 //
  21 // Allen Leung
  22 // 1994
  23 //////////////////////////////////////////////////////////////////////////////
  24
  25 #include <stddef.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <ctype.h>
  29 #include <AD/generic/generic.h>  // Generic definitions
  30 #include <AD/contain/charset.h>  // Character set
  31 #include <AD/strings/regexp.h>   // Regular expression search
  32
  33 // Local variables
  34 static CharSet alpha;    // set of alpha characters
  35 static CharSet numeric;  // set of numeric characters
  36 static CharSet space;    // set of space characters
  37 static CharSet word;     // set of word characters
  38
  39 static void init()
  40 {  for (register int c = 0; c < 256; c++) {
  41       if (isalpha(c)) alpha += c;
  42       if (isdigit(c)) numeric += c;
  43       if (isspace(c)) space += c;
  44       if (isalnum(c)) word += c;
  45    }
  46    word += '_';
  47 }
  48
  49 //
  50 //   The set of opcodes used by the NFA matching machine
  51 //
  52
  53 enum { SUCCESS = 0,            // accept state
  54        BEGIN_OF_LINE = 1,      // match the beginning of a line
  55        END_OF_LINE = 2,        // match the end of a line
  56        WILD_CARD = 3,          // matches anything besides '\n'
  57        MATCH_LITERAL = 4,      // matches a literal string
  58        MATCH_SET = 5,          // matches any character in a set
  59        MATCH_CHAR_STAR = 6,    // matches a character zero or more times
  60        MATCH_CHAR_PLUS = 7,    // matches a character one or more times
  61        MATCH_SET_STAR = 8,     // matches a set zero or more times
  62        MATCH_SET_PLUS = 9,     // matches a set one or more times
  63        OR = 10,                // disjunction, branch unlikely
  64        PLUS = 11,              // disjunction, branch likely
  65        JUMP = 12,              // unconditional jump
  66        TIMES = 13,             // match a subexpression m to n times.
  67        MATCH_BOUND = 14,       // match word boundary
  68        MATCH_NON_BOUND = 15,   // match non-word boundary
  69        MATCH_WORD_BEGIN = 16,  // match beginning of a word
  70        MATCH_WORD_END = 17,    // match end of a word
  71        REF = 20,               // matches a backward reference
  72        OPEN = 30,              // beginning of a reference
  73        CLOSE = 40              // end of a reference
  74      };
  75
  76 RegExp::RegExp(const RegExp& re) : nfa(0) { *this = re; }
  77
  78 RegExp& RegExp::operator = (const RegExp& re)
  79 {  if (this != &re) {
  80       delete [] nfa; nfa = NULL;
  81       if (re.nfa) {
  82          nfa = new Opcode [size = re.size];
  83          memcpy(nfa,re.nfa,sizeof(Opcode) * size);
  84          minMatchLen = re.minMatchLen;
  85       }
  86    }
  87    return *this;
  88 }
  89
  90 //
  91 //  The main pattern matcher, returns the character following
  92 // the entire matched text if successful; otherwise, returns NULL
  93 //
  94
  95 const char * RegExp::grep( register const Opcode * PC,    // current opcode
  96                            register const char * text,    // current character
  97                            register const char * textEnd  // end of text string
  98                          )
  99 {  register char c = *text++;  // cache the first character
 100    int n, len, minTimes, times;
 101
 102    for (;;) {
 103       switch (*PC++) {
 104          case SUCCESS: return text-1;
 105          case BEGIN_OF_LINE: if (text - 1 != myText) return NULL; break;
 106          case END_OF_LINE:
 107             if (text > textEnd) break;
 108             if (c == '\n') { c = *text++; break; }
 109             return NULL;
 110          case WILD_CARD:
 111             if (text > textEnd || c == '\n') return NULL;
 112             c = *text++; break;
 113          case OPEN+0: case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4:
 114          case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9:
 115             left[PC[-1] - OPEN] = text - myText - 1; break;
 116          case CLOSE+0: case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4:
 117          case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9:
 118             right[PC[-1] - CLOSE] = text - myText - 2; break;
 119          case REF+0: case REF+1: case REF+2: case REF+3: case REF+4:
 120          case REF+5: case REF+6: case REF+7: case REF+8: case REF+9:
 121             n = PC[-1] - REF; len = right[n] - left[n];
 122             if (text+len >= textEnd || len < 0 ||
 123                 memcmp(myText+left[n],text-1,len))
 124                return NULL;
 125             text += len - 1; c = *text++;
 126             break;
 127
 128          case MATCH_LITERAL:
 129             len = *PC++ - 1;
 130             if (c != *PC++ || text + len > textEnd || memcmp(text,PC,len))
 131                return NULL;
 132             PC += len;  text += len;  c = *text++;
 133             break;
 134
 135          case MATCH_SET:
 136             if (text > textEnd ||
 137                 bitOf((Byte *)PC,(unsigned char)c) == 0) return NULL;
 138             PC += 32; c = *text++; break;
 139
 140          case MATCH_CHAR_PLUS: minTimes = 1; goto Match_char;
 141          case MATCH_CHAR_STAR: minTimes = 0;
 142          Match_char:
 143          {  c = PC[1]; PC += 2;
 144             for (times = 0, text--; text < textEnd; text++, times++)
 145                if (*text != c) break;
 146             for ( ;times >= minTimes; times--, text--) {
 147                const char * matchEnd = this->grep(PC,text,textEnd);
 148                if (matchEnd) return matchEnd;
 149             }
 150             return NULL;
 151          }
 152          case MATCH_SET_PLUS: minTimes = 1; goto Match_set;
 153          case MATCH_SET_STAR: minTimes = 0;
 154          Match_set:
 155          {  for (times = 0, text--; text < textEnd; text++, times++)
 156                if (bitOf((Byte *)PC,(unsigned char)*text) == 0) break;
 157             for ( PC += 32; times >= minTimes; times--, text--) {
 158                const char * matchEnd = this->grep(PC,text,textEnd);
 159                if (matchEnd) return matchEnd;
 160             }
 161             return NULL;
 162          }
 163
 164          case TIMES:  // Try subpattern m to n times
 165             return this->times(PC[0], PC[1], text-1, textEnd, PC+2);
 166          case MATCH_WORD_BEGIN:  // Beginning of a word??
 167             if ( text > textEnd || ! word[c] ||
 168                  text - 1 > myText && word[text[-2]])
 169                 return NULL;
 170             break;
 171          case MATCH_WORD_END:  // End of a word??
 172             if ( text <= textEnd && ! word[c] ||
 173                  text < textEnd && word[c]) return NULL;
 174             break;
 175          case MATCH_BOUND:  // A word boundary??
 176             if ( (text > textEnd || ! word[c]) &&
 177                  (text - 2 < myText || ! word[text[-2]]) ||
 178                 text <= textEnd && word[c] &&
 179                 text - 2 >= myText && word[text[-2]] )
 180                return NULL;
 181             break;
 182          case MATCH_NON_BOUND: // Not a word boundary??
 183             if ( text <= textEnd && word[c] &&
 184                  (text - 2 < myText || ! word[text[-2]]) ||
 185                  (text > textEnd || ! word[c]) &&
 186                  text - 2 >= myText && word[text[-2]] )
 187                return NULL;
 188             break;
 189          case OR:  // Try one alternate, then another
 190          {  const char * success = this->grep(PC+2,text-1,textEnd);
 191             if (success) return success; else PC += PC[0] * 256 + PC[1];
 192          }  break;
 193          case PLUS:  // Try one alternate, then another
 194          {  const char * good =
 195                this->grep(PC+PC[0] * 256 + PC[1],text-1,textEnd);
 196             if (good) return good; else PC += 2;
 197          }  break;
 198          case JUMP: // Unconditional jump
 199             PC += PC[0] * 256 + PC[1]; break;
 200          default: return NULL;
 201       }
 202    }
 203    return NULL;
 204 }
 205
 206 //
 207 // Call and matches a subpattern |minTimes| to |maxTimes| times.
 208 // Notice that |maxTimes < 0| means that the maximum times is infinite.
 209 //
 210
 211 const char * RegExp::times( int minTimes,
 212                             int maxTimes,
 213                             const Opcode * PC,
 214                             const char * text,
 215                             const char * textEnd
 216                           )
 217 {  const char * matched;
 218    while (minTimes >= 0 && maxTimes >= 0) {
 219       if (maxTimes == 1) return this->grep(PC,text,textEnd);
 220       else if (maxTimes < 0)
 221          matched = this->times(minTimes-1,-1,PC,text,textEnd);
 222       else
 223          matched = this->times(minTimes-1,maxTimes-1,PC,text,textEnd);
 224       if (matched == NULL) return NULL;
 225       if (minTimes < 0) return matched;
 226       text = matched;
 227    }
 228    return NULL;
 229 }
 230
 231 //
 232 // Main entry point of the pattern matcher
 233 //
 234
 235 int RegExp::Match(register const char * text, int length)
 236 {
 237    if (nfa == NULL) return -1;
 238    if (length < 0) length = strlen(text);
 239    register int c;
 240
 241    myText = text;
 242    const char * textEnd = text + length;
 243    register const char * limit;
 244    if (minMatchLen > 0) limit = textEnd - minMatchLen + 1;
 245    else limit = textEnd;
 246
 247    switch (*nfa) {
 248       case BEGIN_OF_LINE:  // Anchored match
 249          if ((matchedTextEnd = this->grep(nfa+1,text,textEnd))) {
 250             matchedText = text;
 251             return matchedText - myText;
 252          }
 253          return -1;
 254       case MATCH_LITERAL:  // Starts with a literal
 255       {  int len = nfa[1];
 256          for ( c = nfa[2]; text < limit; text++)
 257             if (c == *text && text + len <= textEnd &&
 258                 memcmp(text,nfa+2,len) == 0)
 259                if ((matchedTextEnd = this->grep(nfa+len+2,text+len,textEnd))) {
 260                   matchedText = text;
 261                   return matchedText - myText;
 262                }
 263          return -1;
 264       }
 265       case MATCH_CHAR_PLUS: // Starts with a character
 266          for ( c = nfa[2]; text < limit; text++)
 267             if (c == *text)
 268                if ((matchedTextEnd = this->grep(nfa,text,textEnd))) {
 269                   matchedText = text;
 270                   return matchedText - myText;
 271                }
 272          return -1;
 273       case MATCH_SET: case MATCH_SET_PLUS: // Starts with a set
 274       {  register Byte * set = (Byte*)(nfa+1);
 275          for ( ; text < limit; text++)
 276             if (bitOf(set,(unsigned char)*text))
 277                if ((matchedTextEnd = this->grep(nfa,text,textEnd))) {
 278                   matchedText = text;
 279                   return matchedText - myText;
 280                }
 281          return -1;
 282       }
 283       default:   // Can start with anything
 284          for ( ;text < limit; text++)
 285             if ((matchedTextEnd = this->grep(nfa,text,textEnd))) {
 286                matchedText = text;
 287                return matchedText - myText;
 288             }
 289          return -1;
 290    }
 291 }
 292
 293 //
 294 // Compile a new pattern
 295 //
 296
 297 RegExp& RegExp::compile(const char * pattern, const char * meta)
 298 {
 299    delete [] nfa;               // delete last compiled pattern, if any
 300    nfa = NULL;
 301    if (meta == NULL)            // set default meta characters
 302       meta = "*+()[]?^$-.|\\\0";
 303
 304    MetaChar M[256];             // meta character assignment
 305
 306    if (! space[' ']) init();    // initialize, if necessary
 307
 308    //
 309    // Compute the meta character assignment
 310    //
 311    for (int i = 0; i < 256; i++) M[i] = META_NONE;
 312    for (int j = META_STAR; j <= META_EOP; j++) M[ meta[j] ] = (MetaChar)j;
 313
 314    size = this->comp(pattern, M, false);  // compile it once to get the size
 315
 316    if (size > 0) {
 317       nfa = new Opcode [size];            // allocate new storage
 318       this->comp(pattern, M, true);       // compile it again to emit opcodes
 319       minMatchLen = this->minMatch(nfa);  // compute the minimal matching len
 320    }
 321
 322    return *this;
 323 }
 324
 325 inline void shift(register RegExp::Opcode * start,
 326                   register RegExp::Opcode * end,
 327                   register int size)
 328 {
 329    while (end > start) { end--; end[size] = end[0]; }
 330 }
 331
 332 //
 333 // ``Parse'' pattern and compile the NFA
 334 //
 335
 336 int RegExp::comp(const char * pattern, const MetaChar M[], Bool emit)
 337 {
 338    register const char * p;        // pointer to pattern
 339    register char c;                // cached pattern character
 340    register int size = 0;          // size of the nfa
 341    register Opcode * PC = nfa;     // nfa opcodes
 342    Opcode * last;                  // last opcode
 343    CharSet set;                    // a character set
 344    int level = 0;                  // nesting level
 345    Opcode * start[10];             // starting point for subpattern
 346    int parenCount = 0;             // parenthesis count
 347    int paren[10];                  // closing parenthesis number
 348    Bool complement = false;        // set complement??
 349
 350    start[0] = PC;   // starting point of first pattern
 351
 352    for (last = NULL, p = pattern; M[c = *p++] != META_EOP; ) {
 353       switch (M[c]) {
 354          case META_STAR:
 355             if (emit) {
 356                if (last) {
 357                   switch (last[0]) {
 358                      case MATCH_LITERAL:
 359                         if (last[1] == 1) last[0] = MATCH_CHAR_STAR;
 360                         else goto Emit_star; break;
 361                      case MATCH_CHAR_PLUS: last[0] = MATCH_CHAR_STAR; break;
 362                      case MATCH_CHAR_STAR: break;
 363                      case MATCH_SET:
 364                      case MATCH_SET_PLUS: last[0] = MATCH_SET_STAR; break;
 365                      case MATCH_SET_STAR: break;
 366                      default: Emit_star:
 367                         *PC++ = JUMP;
 368                         *PC++ = (last - PC - 2) / 256;
 369                         *PC++ = (last - PC - 2) % 256;
 370                         shift(last,PC,3);
 371                         last[0] = OR;
 372                         last[1] = (PC - last + 2) / 256;
 373                         last[2] = (PC - last + 2) % 256;
 374                         PC += 3;
 375                         break;
 376                   }
 377                } else goto Syntax_Error;
 378             } else size += 6; break;
 379          case META_PLUS:
 380             if (emit) {
 381                if (last) {
 382                   switch (last[0]) {
 383                      case MATCH_LITERAL:
 384                         if (last[1] == 1) last[0] = MATCH_CHAR_PLUS;
 385                         else goto Emit_plus; break;
 386                      case MATCH_CHAR_PLUS:
 387                      case MATCH_CHAR_STAR:
 388                      case MATCH_SET_PLUS:
 389                      case MATCH_SET_STAR: break;
 390                      case MATCH_SET: last[0] = MATCH_SET_PLUS; break;
 391                      default: Emit_plus:
 392                         *PC++ = PLUS;
 393                         *PC++ = (last - PC + 1) / 256;
 394                         *PC++ = (last - PC + 1) % 256;
 395                         break;
 396                   }
 397                } else goto Syntax_Error;
 398             } else size += 3; break;
 399          case META_OPTIONAL:
 400             if (emit) {
 401                if (last == NULL) goto Syntax_Error;
 402                shift(last,PC,3);
 403                last[0] = OR;
 404                last[1] = (PC - last + 2) / 256;
 405                last[2] = (PC - last + 2) % 256;
 406                PC += 3;
 407             } else size += 3;
 408             break;
 409          case META_OR:
 410             if (emit) {
 411                *PC++ = SUCCESS;
 412                shift(start[level],PC,3);
 413                start[level][0] = OR;
 414                start[level][1] = (PC - start[level] + 2) / 256;
 415                start[level][2] = (PC - start[level] + 2) % 256;
 416                PC += 3;
 417             } else size += 4;
 418             break;
 419          case META_OPEN:
 420             if (emit) { last = PC; *PC = OPEN + parenCount; } else size++;
 421             paren[level] = parenCount;
 422             if (++parenCount >= 10) goto Syntax_Error;
 423             if (++level >= 10) goto Syntax_Error;
 424             start[level] = PC++;
 425             break;
 426          case META_CLOSE:
 427             last = start[level];
 428             if (--level < 0) goto Syntax_Error;
 429             if (emit) { *PC++ = CLOSE + paren[level]; }
 430             else size++;
 431             break;
 432          case META_BEGIN:
 433             last = PC; if (emit) *PC++ = BEGIN_OF_LINE; else size++; break;
 434          case META_END:
 435             last = PC; if (emit) *PC++ = END_OF_LINE; else size++; break;
 436          case META_WILD:
 437             last = PC; if (emit) *PC++ = WILD_CARD; else size++; break;
 438          case META_SET_OPEN:
 439          {  set.clear();
 440             const char * start = p;
 441             int lastChar = -1;
 442             while (M[c = *p++] != META_SET_CLOSE) {
 443                if (c == '^' && p-1 == start) { complement = true; continue; }
 444                switch (M[c]) {
 445                   case META_EOP: goto Syntax_Error;
 446                   case META_RANGE: lastChar = p[-2]; break;
 447                   case META_ESCAPE:
 448                      switch (c = *p++) {
 449                         case 'n': set += '\n'; break;
 450                         case 't': set += '\t'; break;
 451                         case '0': case '1':
 452                            const char * q;
 453                            for (q = p, c -= 0;
 454                                 *p >= '0' && *p <= '7' && p - q <= 2; p++)
 455                               c = c * 8 + *p - '0';
 456                            set += c; break;
 457                         case 'd': set += numeric; break;
 458                         case 'D': set += ! numeric; break;
 459                         case 's': set += space; break;
 460                         case 'S': set += ! space; break;
 461                         case 'w': set += word; break;
 462                         case 'W': set += ! word; break;
 463                         default: goto Emit_set_char;
 464                      } break;
 465                   default: Emit_set_char:
 466                      if (lastChar >= 0)
 467                         for ( ;lastChar <= c; lastChar++) set += lastChar;
 468                      else
 469                         set += c;
 470                      lastChar = -1;
 471                   break;
 472                }
 473             }
 474          }
 475          Emit_match_set:
 476          {     if (emit) {
 477                   if (complement) set = ! set;
 478                   last = PC;
 479                   *PC++ = MATCH_SET;
 480                   memcpy(PC,(const Byte *)set,32);
 481                   PC += 32;
 482                   complement = false;
 483                } else size += 33;
 484          }
 485          break;
 486          case META_ESCAPE:
 487             switch (c = *p++) {
 488                case 'n': c = '\n'; goto Emit_match_char;  // new line
 489                case 't': c = '\t'; goto Emit_match_char;  // tab
 490                case '0': case '1':
 491                {  const char * q;
 492                   for (q = p, c -= '0'; *p >= '0' && *p <= '7' &&
 493                                         p - q <= 2; p++)
 494                      c = c * 8 + *p - '0';
 495                   goto Emit_match_char;
 496                }
 497                case 'd': set = numeric; goto Emit_match_set;
 498                case 'D': set = ! numeric; goto Emit_match_set;
 499                case 'w': set = word; goto Emit_match_set;
 500                case 'W': set = ! word; goto Emit_match_set;
 501                case 's': set = space; goto Emit_match_set;
 502                case 'S': set = ! space; goto Emit_match_set;
 503                case 'b': if (emit) *PC++ = MATCH_BOUND; else size++; break;
 504                case 'B': if (emit) *PC++ = MATCH_NON_BOUND; else size++; break;
 505                case '<': if (emit) *PC++ = MATCH_WORD_BEGIN; else size++; break;
 506                case '>': if (emit) *PC++ = MATCH_WORD_END; else size++; break;
 507                default: goto Emit_match_char;
 508             } break;
 509          case META_NONE: Emit_match_char:
 510             if (emit) {
 511                if (last && *last == MATCH_LITERAL &&
 512                    M[*p] != META_STAR && M[*p] != META_PLUS &&
 513                    M[*p] != META_OPTIONAL) {
 514                   last[1]++;
 515                   *PC++ = c;
 516                } else {
 517                   last = PC;
 518                   *PC++ = MATCH_LITERAL;
 519                   *PC++ = 1;
 520                   *PC++ = c;
 521                }
 522             } else size += 3;
 523             break;
 524          default: break;
 525       }
 526    }
 527
 528    if (emit) *PC++ = SUCCESS;
 529
 530    return size+1;
 531
 532 Syntax_Error:
 533
 534    return -1;
 535 }
 536
 537 //
 538 // Traverse the opcode and compute the minimal length of the matched text
 539 //
 540
 541 int RegExp::minMatch(register const Opcode * PC)
 542 {  register int len = 0;
 543    for (;;) {
 544       switch (*PC++) {
 545          case SUCCESS: return len;
 546          case WILD_CARD: len++; break;
 547          case MATCH_CHAR_STAR: PC += 2; break;
 548          case MATCH_CHAR_PLUS: len++; PC += 2; break;
 549          case MATCH_SET_STAR: PC += 32; break;
 550          case MATCH_SET_PLUS:
 551          case MATCH_SET: len++; PC += 32; break;
 552          case MATCH_LITERAL: len += *PC; PC += *PC + 1; break;
 553          case OR:
 554          {        int len1 = this->minMatch(PC + PC[0] * 256 + PC[1]);
 555                   int len2 = this->minMatch(PC + 2);
 556                   return len + (len1 < len2 ? len1 : len2);
 557          }
 558          case TIMES: PC += 2; break;
 559          case MATCH_BOUND: case MATCH_NON_BOUND:
 560          case MATCH_WORD_BEGIN: case MATCH_WORD_END:
 561          case BEGIN_OF_LINE: case END_OF_LINE:
 562          case REF+0: case REF+1: case REF+2: case REF+3: case REF+4:
 563          case REF+5: case REF+6: case REF+7: case REF+8: case REF+9:
 564             break;
 565          default: return len;
 566       }
 567    }
 568 }