scintilla/RESearch.cxx

   1 // Scintilla source code edit control
   2 /** @file RESearch.cxx
   3  ** Regular expression search library.
   4  **/
   5
   6 /*
   7  * regex - Regular expression pattern matching  and replacement
   8  *
   9  * By:  Ozan S. Yigit (oz)
  10  *      Dept. of Computer Science
  11  *      York University
  12  *
  13  * Original code available from http://www.cs.yorku.ca/~oz/
  14  * Translation to C++ by Neil Hodgson neilh@scintilla.org
  15  * Removed all use of register.
  16  * Converted to modern function prototypes.
  17  * Put all global/static variables into an object so this code can be
  18  * used from multiple threads etc.
  19  *
  20  * These routines are the PUBLIC DOMAIN equivalents of regex
  21  * routines as found in 4.nBSD UN*X, with minor extensions.
  22  *
  23  * These routines are derived from various implementations found
  24  * in software tools books, and Conroy's grep. They are NOT derived
  25  * from licensed/restricted software.
  26  * For more interesting/academic/complicated implementations,
  27  * see Henry Spencer's regexp routines, or GNU Emacs pattern
  28  * matching module.
  29  *
  30  * Modification history removed.
  31  *
  32  * Interfaces:
  33  *      RESearch::Compile:        compile a regular expression into a NFA.
  34  *
  35  *                      char *RESearch::Compile(s)
  36  *                      char *s;
  37  *
  38  *      RESearch::Execute:        execute the NFA to match a pattern.
  39  *
  40  *                      int RESearch::Execute(s)
  41  *                      char *s;
  42  *
  43  *      RESearch::ModifyWord            change RESearch::Execute's understanding of what a "word"
  44  *                      looks like (for \< and \>) by adding into the
  45  *                      hidden word-syntax table.
  46  *
  47  *                      void RESearch::ModifyWord(s)
  48  *                      char *s;
  49  *
  50  *      RESearch::Substitute:   substitute the matched portions in a new string.
  51  *
  52  *                      int RESearch::Substitute(src, dst)
  53  *                      char *src;
  54  *                      char *dst;
  55  *
  56  *      re_fail:        failure routine for RESearch::Execute.
  57  *
  58  *                      void re_fail(msg, op)
  59  *                      char *msg;
  60  *                      char op;
  61  *
  62  * Regular Expressions:
  63  *
  64  *      [1]     char    matches itself, unless it is a special
  65  *                      character (metachar): . \ [ ] * + ^ $
  66  *
  67  *      [2]     .       matches any character.
  68  *
  69  *      [3]     \       matches the character following it, except
  70  *                      when followed by a left or right round bracket,
  71  *                      a digit 1 to 9 or a left or right angle bracket.
  72  *                      (see [7], [8] and [9])
  73  *                      It is used as an escape character for all
  74  *                      other meta-characters, and itself. When used
  75  *                      in a set ([4]), it is treated as an ordinary
  76  *                      character.
  77  *
  78  *      [4]     [set]   matches one of the characters in the set.
  79  *                      If the first character in the set is "^",
  80  *                      it matches a character NOT in the set, i.e.
  81  *                      complements the set. A shorthand S-E is
  82  *                      used to specify a set of characters S upto
  83  *                      E, inclusive. The special characters "]" and
  84  *                      "-" have no special meaning if they appear
  85  *                      as the first chars in the set.
  86  *                      examples:        match:
  87  *
  88  *                              [a-z]    any lowercase alpha
  89  *
  90  *                              [^]-]    any char except ] and -
  91  *
  92  *                              [^A-Z]   any char except uppercase
  93  *                                       alpha
  94  *
  95  *                              [a-zA-Z] any alpha
  96  *
  97  *      [5]     *       any regular expression form [1] to [4], followed by
  98  *                      closure char (*) matches zero or more matches of
  99  *                      that form.
 100  *
 101  *      [6]     +       same as [5], except it matches one or more.
 102  *
 103  *      [7]             a regular expression in the form [1] to [10], enclosed
 104  *                      as \(form\) matches what form matches. The enclosure
 105  *                      creates a set of tags, used for [8] and for
 106  *                      pattern substution. The tagged forms are numbered
 107  *                      starting from 1.
 108  *
 109  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 110  *                      previously tagged regular expression ([7]) matched.
 111  *
 112  *      [9]     \<      a regular expression starting with a \< construct
 113  *              \>      and/or ending with a \> construct, restricts the
 114  *                      pattern matching to the beginning of a word, and/or
 115  *                      the end of a word. A word is defined to be a character
 116  *                      string beginning and/or ending with the characters
 117  *                      A-Z a-z 0-9 and _. It must also be preceded and/or
 118  *                      followed by any character outside those mentioned.
 119  *
 120  *      [10]            a composite regular expression xy where x and y
 121  *                      are in the form [1] to [10] matches the longest
 122  *                      match of x followed by a match for y.
 123  *
 124  *      [11]    ^       a regular expression starting with a ^ character
 125  *              $       and/or ending with a $ character, restricts the
 126  *                      pattern matching to the beginning of the line,
 127  *                      or the end of line. [anchors] Elsewhere in the
 128  *                      pattern, ^ and $ are treated as ordinary characters.
 129  *
 130  *
 131  * Acknowledgements:
 132  *
 133  *      HCR's Hugh Redelmeier has been most helpful in various
 134  *      stages of development. He convinced me to include BOW
 135  *      and EOW constructs, originally invented by Rob Pike at
 136  *      the University of Toronto.
 137  *
 138  * References:
 139  *              Software tools                  Kernighan & Plauger
 140  *              Software tools in Pascal        Kernighan & Plauger
 141  *              Grep [rsx-11 C dist]            David Conroy
 142  *              ed - text editor                Un*x Programmer's Manual
 143  *              Advanced editing on Un*x        B. W. Kernighan
 144  *              RegExp routines                 Henry Spencer
 145  *
 146  * Notes:
 147  *
 148  *      This implementation uses a bit-set representation for character
 149  *      classes for speed and compactness. Each character is represented
 150  *      by one bit in a 128-bit block. Thus, CCL always takes a
 151  *      constant 16 bytes in the internal nfa, and RESearch::Execute does a single
 152  *      bit comparison to locate the character in the set.
 153  *
 154  * Examples:
 155  *
 156  *      pattern:        foo*.*
 157  *      compile:        CHR f CHR o CLO CHR o END CLO ANY END END
 158  *      matches:        fo foo fooo foobar fobar foxx ...
 159  *
 160  *      pattern:        fo[ob]a[rz]
 161  *      compile:        CHR f CHR o CCL bitset CHR a CCL bitset END
 162  *      matches:        fobar fooar fobaz fooaz
 163  *
 164  *      pattern:        foo\\+
 165  *      compile:        CHR f CHR o CHR o CHR \ CLO CHR \ END END
 166  *      matches:        foo\ foo\\ foo\\\  ...
 167  *
 168  *      pattern:        \(foo\)[1-3]\1  (same as foo[1-3]foo)
 169  *      compile:        BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
 170  *      matches:        foo1foo foo2foo foo3foo
 171  *
 172  *      pattern:        \(fo.*\)-\1
 173  *      compile:        BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
 174  *      matches:        foo-foo fo-fo fob-fob foobar-foobar ...
 175  */
 176
 177 #include "RESearch.h"
 178
 179 #define OKP     1
 180 #define NOP     0
 181
 182 #define CHR     1
 183 #define ANY     2
 184 #define CCL     3
 185 #define BOL     4
 186 #define EOL     5
 187 #define BOT     6
 188 #define EOT     7
 189 #define BOW     8
 190 #define EOW     9
 191 #define REF     10
 192 #define CLO     11
 193
 194 #define END     0
 195
 196 /*
 197  * The following defines are not meant to be changeable.
 198  * They are for readability only.
 199  */
 200 #define BLKIND  0370
 201 #define BITIND  07
 202
 203 #define ASCIIB  0177
 204
 205 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
 206
 207 #define badpat(x)       (*nfa = END, x)
 208
 209 RESearch::RESearch() {
 210         Init();
 211 }
 212
 213 RESearch::~RESearch() {
 214         Clear();
 215 }
 216
 217 void RESearch::Init() {
 218         sta = NOP;                      /* status of lastpat */
 219         bol = 0;
 220         for (int i=0; i<MAXTAG; i++)
 221                 pat[i] = 0;
 222         for (int j=0; j<BITBLK; j++)
 223                 bittab[j] = 0;
 224 }
 225
 226 void RESearch::Clear() {
 227         for (int i=0; i<MAXTAG; i++) {
 228                 delete []pat[i];
 229                 pat[i] = 0;
 230                 bopat[i] = NOTFOUND;
 231                 eopat[i] = NOTFOUND;
 232         }
 233 }
 234
 235 bool RESearch::GrabMatches(CharacterIndexer &ci) {
 236         bool success = true;
 237         for (unsigned int i=0; i<MAXTAG; i++) {
 238                 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
 239                         unsigned int len = eopat[i] - bopat[i];
 240                         pat[i] = new char[len + 1];
 241                         if (pat[i]) {
 242                                 for (unsigned int j=0; j<len; j++)
 243                                         pat[i][j] = ci.CharAt(bopat[i] + j);
 244                                 pat[i][len] = '\0';
 245                         } else {
 246                                 success = false;
 247                         }
 248                 }
 249         }
 250         return success;
 251 }
 252
 253 void RESearch::ChSet(char c) {
 254         bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
 255 }
 256
 257 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
 258         if (caseSensitive) {
 259                 ChSet(c);
 260         } else {
 261                 if ((c >= 'a') && (c <= 'z')) {
 262                         ChSet(c);
 263                         ChSet(static_cast<char>(c - 'a' + 'A'));
 264                 } else if ((c >= 'A') && (c <= 'Z')) {
 265                         ChSet(c);
 266                         ChSet(static_cast<char>(c - 'A' + 'a'));
 267                 } else {
 268                         ChSet(c);
 269                 }
 270         }
 271 }
 272
 273 const char escapeValue(char ch) {
 274         switch (ch) {
 275         case 'a':       return '\a';
 276         case 'b':       return '\b';
 277         case 'f':       return '\f';
 278         case 'n':       return '\n';
 279         case 'r':       return '\r';
 280         case 't':       return '\t';
 281         case 'v':       return '\v';
 282         }
 283         return 0;
 284 }
 285
 286 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
 287         char *mp=nfa;          /* nfa pointer       */
 288         char *lp;              /* saved pointer..   */
 289         char *sp=nfa;          /* another one..     */
 290     char *mpMax = mp + MAXNFA - BITBLK - 10;
 291
 292         int tagi = 0;          /* tag stack index   */
 293         int tagc = 1;          /* actual tag count  */
 294
 295         int n;
 296         char mask;              /* xor mask -CCL/NCL */
 297         int c1, c2;
 298
 299         if (!pat || !length)
 300                 if (sta)
 301                         return 0;
 302                 else
 303                         return badpat("No previous regular expression");
 304         sta = NOP;
 305
 306         const char *p=pat;               /* pattern pointer   */
 307         for (int i=0; i<length; i++, p++) {
 308                 if (mp > mpMax)
 309                         return badpat("Pattern too long");
 310                 lp = mp;
 311                 switch(*p) {
 312
 313                 case '.':               /* match any char..  */
 314                         *mp++ = ANY;
 315                         break;
 316
 317                 case '^':               /* match beginning.. */
 318                         if (p == pat)
 319                                 *mp++ = BOL;
 320                         else {
 321                                 *mp++ = CHR;
 322                                 *mp++ = *p;
 323                         }
 324                         break;
 325
 326                 case '$':               /* match endofline.. */
 327                         if (!*(p+1))
 328                                 *mp++ = EOL;
 329                         else {
 330                                 *mp++ = CHR;
 331                                 *mp++ = *p;
 332                         }
 333                         break;
 334
 335                 case '[':               /* match char class..*/
 336                         *mp++ = CCL;
 337
 338                         i++;
 339                         if (*++p == '^') {
 340                                 mask = '\377';
 341                                 i++;
 342                                 p++;
 343                         } else
 344                                 mask = 0;
 345
 346                         if (*p == '-') {                /* real dash */
 347                                 i++;
 348                                 ChSet(*p++);
 349                         }
 350                         if (*p == ']') {        /* real brace */
 351                                 i++;
 352                                 ChSet(*p++);
 353                         }
 354                         while (*p && *p != ']') {
 355                                 if (*p == '-' && *(p+1) && *(p+1) != ']') {
 356                                         i++;
 357                                         p++;
 358                                         c1 = *(p-2) + 1;
 359                                         i++;
 360                                         c2 = *p++;
 361                                         while (c1 <= c2) {
 362                                                 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
 363                                         }
 364                                 } else if (*p == '\\' && *(p+1)) {
 365                                         i++;
 366                                         p++;
 367                                         char escape = escapeValue(*p);
 368                                         if (escape)
 369                                                 ChSetWithCase(escape, caseSensitive);
 370                                         else
 371                                                 ChSetWithCase(*p, caseSensitive);
 372                                         i++;
 373                                         p++;
 374                                 } else {
 375                                         i++;
 376                                         ChSetWithCase(*p++, caseSensitive);
 377                                 }
 378                         }
 379                         if (!*p)
 380                                 return badpat("Missing ]");
 381
 382                         for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 383                                 *mp++ = static_cast<char>(mask ^ bittab[n]);
 384
 385                         break;
 386
 387                 case '*':               /* match 0 or more.. */
 388                 case '+':               /* match 1 or more.. */
 389                         if (p == pat)
 390                                 return badpat("Empty closure");
 391                         lp = sp;                /* previous opcode */
 392                         if (*lp == CLO)         /* equivalence..   */
 393                                 break;
 394                         switch(*lp) {
 395
 396                         case BOL:
 397                         case BOT:
 398                         case EOT:
 399                         case BOW:
 400                         case EOW:
 401                         case REF:
 402                                 return badpat("Illegal closure");
 403                         default:
 404                                 break;
 405                         }
 406
 407                         if (*p == '+')
 408                                 for (sp = mp; lp < sp; lp++)
 409                                         *mp++ = *lp;
 410
 411                         *mp++ = END;
 412                         *mp++ = END;
 413                         sp = mp;
 414                         while (--mp > lp)
 415                                 *mp = mp[-1];
 416                         *mp = CLO;
 417                         mp = sp;
 418                         break;
 419
 420                 case '\\':              /* tags, backrefs .. */
 421                         i++;
 422                         switch(*++p) {
 423
 424                         case '<':
 425                                 *mp++ = BOW;
 426                                 break;
 427                         case '>':
 428                                 if (*sp == BOW)
 429                                         return badpat("Null pattern inside \\<\\>");
 430                                 *mp++ = EOW;
 431                                 break;
 432                         case '1':
 433                         case '2':
 434                         case '3':
 435                         case '4':
 436                         case '5':
 437                         case '6':
 438                         case '7':
 439                         case '8':
 440                         case '9':
 441                                 n = *p-'0';
 442                                 if (tagi > 0 && tagstk[tagi] == n)
 443                                         return badpat("Cyclical reference");
 444                                 if (tagc > n) {
 445                                         *mp++ = static_cast<char>(REF);
 446                                         *mp++ = static_cast<char>(n);
 447                                 }
 448                                 else
 449                                         return badpat("Undetermined reference");
 450                                 break;
 451                         case 'a':
 452                         case 'b':
 453                         case 'n':
 454                         case 'f':
 455                         case 'r':
 456                         case 't':
 457                         case 'v':
 458                                 *mp++ = CHR;
 459                                 *mp++ = escapeValue(*p);
 460                                 break;
 461                         default:
 462                                 if (!posix && *p == '(') {
 463                                         if (tagc < MAXTAG) {
 464                                                 tagstk[++tagi] = tagc;
 465                                                 *mp++ = BOT;
 466                                                 *mp++ = static_cast<char>(tagc++);
 467                                         }
 468                                         else
 469                                                 return badpat("Too many \\(\\) pairs");
 470                                 } else if (!posix && *p == ')') {
 471                                         if (*sp == BOT)
 472                                                 return badpat("Null pattern inside \\(\\)");
 473                                         if (tagi > 0) {
 474                                                 *mp++ = static_cast<char>(EOT);
 475                                                 *mp++ = static_cast<char>(tagstk[tagi--]);
 476                                         }
 477                                         else
 478                                                 return badpat("Unmatched \\)");
 479                                 } else {
 480                                         *mp++ = CHR;
 481                                         *mp++ = *p;
 482                                 }
 483                         }
 484                         break;
 485
 486                 default :               /* an ordinary char  */
 487                         if (posix && *p == '(') {
 488                                 if (tagc < MAXTAG) {
 489                                         tagstk[++tagi] = tagc;
 490                                         *mp++ = BOT;
 491                                         *mp++ = static_cast<char>(tagc++);
 492                                 }
 493                                 else
 494                                         return badpat("Too many () pairs");
 495                         } else if (posix && *p == ')') {
 496                                 if (*sp == BOT)
 497                                         return badpat("Null pattern inside ()");
 498                                 if (tagi > 0) {
 499                                         *mp++ = static_cast<char>(EOT);
 500                                         *mp++ = static_cast<char>(tagstk[tagi--]);
 501                                 }
 502                                 else
 503                                         return badpat("Unmatched )");
 504                         } else if (caseSensitive) {
 505                                 *mp++ = CHR;
 506                                 *mp++ = *p;
 507                         } else {
 508                                 *mp++ = CCL;
 509                                 mask = 0;
 510                                 ChSetWithCase(*p, false);
 511                                 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 512                                         *mp++ = static_cast<char>(mask ^ bittab[n]);
 513                         }
 514                         break;
 515                 }
 516                 sp = lp;
 517         }
 518         if (tagi > 0)
 519                 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
 520         *mp = END;
 521         sta = OKP;
 522         return 0;
 523 }
 524
 525 /*
 526  * RESearch::Execute:
 527  *      execute nfa to find a match.
 528  *
 529  *      special cases: (nfa[0])
 530  *              BOL
 531  *                      Match only once, starting from the
 532  *                      beginning.
 533  *              CHR
 534  *                      First locate the character without
 535  *                      calling PMatch, and if found, call
 536  *                      PMatch for the remaining string.
 537  *              END
 538  *                      RESearch::Compile failed, poor luser did not
 539  *                      check for it. Fail fast.
 540  *
 541  *      If a match is found, bopat[0] and eopat[0] are set
 542  *      to the beginning and the end of the matched fragment,
 543  *      respectively.
 544  *
 545  */
 546
 547 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 548         char c;
 549         int ep = NOTFOUND;
 550         char *ap = nfa;
 551
 552         bol = lp;
 553         failure = 0;
 554
 555         Clear();
 556
 557         switch(*ap) {
 558
 559         case BOL:                       /* anchored: match from BOL only */
 560                 ep = PMatch(ci, lp, endp, ap);
 561                 break;
 562         case EOL:                       /* just searching for end of line normal path doesn't work */
 563                 if (*(ap+1) == END) {
 564                         lp = endp;
 565                         ep = lp;
 566                         break;
 567                 } else {
 568                         return 0;
 569                 }
 570         case CHR:                       /* ordinary char: locate it fast */
 571                 c = *(ap+1);
 572                 while ((lp < endp) && (ci.CharAt(lp) != c))
 573                         lp++;
 574                 if (lp >= endp)         /* if EOS, fail, else fall thru. */
 575                         return 0;
 576         default:                        /* regular matching all the way. */
 577                 while (lp < endp) {
 578                         ep = PMatch(ci, lp, endp, ap);
 579                         if (ep != NOTFOUND)
 580                                 break;
 581                         lp++;
 582                 }
 583                 break;
 584         case END:                       /* munged automaton. fail always */
 585                 return 0;
 586         }
 587         if (ep == NOTFOUND)
 588                 return 0;
 589
 590         bopat[0] = lp;
 591         eopat[0] = ep;
 592         return 1;
 593 }
 594
 595 /*
 596  * PMatch: internal routine for the hard part
 597  *
 598  *      This code is partly snarfed from an early grep written by
 599  *      David Conroy. The backref and tag stuff, and various other
 600  *      innovations are by oz.
 601  *
 602  *      special case optimizations: (nfa[n], nfa[n+1])
 603  *              CLO ANY
 604  *                      We KNOW .* will match everything upto the
 605  *                      end of line. Thus, directly go to the end of
 606  *                      line, without recursive PMatch calls. As in
 607  *                      the other closure cases, the remaining pattern
 608  *                      must be matched by moving backwards on the
 609  *                      string recursively, to find a match for xy
 610  *                      (x is ".*" and y is the remaining pattern)
 611  *                      where the match satisfies the LONGEST match for
 612  *                      x followed by a match for y.
 613  *              CLO CHR
 614  *                      We can again scan the string forward for the
 615  *                      single char and at the point of failure, we
 616  *                      execute the remaining nfa recursively, same as
 617  *                      above.
 618  *
 619  *      At the end of a successful match, bopat[n] and eopat[n]
 620  *      are set to the beginning and end of subpatterns matched
 621  *      by tagged expressions (n = 1 to 9).
 622  *
 623  */
 624
 625 extern void re_fail(char *,char);
 626
 627 /*
 628  * character classification table for word boundary operators BOW
 629  * and EOW. the reason for not using ctype macros is that we can
 630  * let the user add into our own table. see RESearch::ModifyWord. This table
 631  * is not in the bitset form, since we may wish to extend it in the
 632  * future for other character classifications.
 633  *
 634  *      TRUE for 0-9 A-Z a-z _
 635  */
 636 static char chrtyp[MAXCHR] = {
 637         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 638         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 639         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 640         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 641         0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
 642         1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
 643         0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
 644         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 645         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 646         1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
 647         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 648         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 649         1, 1, 1, 0, 0, 0, 0, 0
 650         };
 651
 652 #define inascii(x)      (0177&(x))
 653 #define iswordc(x)      chrtyp[inascii(x)]
 654 #define isinset(x,y)    ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
 655
 656 /*
 657  * skip values for CLO XXX to skip past the closure
 658  */
 659
 660 #define ANYSKIP 2       /* [CLO] ANY END ...         */
 661 #define CHRSKIP 3       /* [CLO] CHR chr END ...     */
 662 #define CCLSKIP 34      /* [CLO] CCL 32bytes END ... */
 663
 664 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 665         int op, c, n;
 666         int e;          /* extra pointer for CLO */
 667         int bp;         /* beginning of subpat.. */
 668         int ep;         /* ending of subpat..    */
 669         int are;                        /* to save the line ptr. */
 670
 671         while ((op = *ap++) != END)
 672                 switch(op) {
 673
 674                 case CHR:
 675                         if (ci.CharAt(lp++) != *ap++)
 676                                 return NOTFOUND;
 677                         break;
 678                 case ANY:
 679                         if (lp++ >= endp)
 680                                 return NOTFOUND;
 681                         break;
 682                 case CCL:
 683                         c = ci.CharAt(lp++);
 684                         if (!isinset(ap,c))
 685                                 return NOTFOUND;
 686                         ap += BITBLK;
 687                         break;
 688                 case BOL:
 689                         if (lp != bol)
 690                                 return NOTFOUND;
 691                         break;
 692                 case EOL:
 693                         if (lp < endp)
 694                                 return NOTFOUND;
 695                         break;
 696                 case BOT:
 697                         bopat[*ap++] = lp;
 698                         break;
 699                 case EOT:
 700                         eopat[*ap++] = lp;
 701                         break;
 702                 case BOW:
 703                         if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
 704                                 return NOTFOUND;
 705                         break;
 706                 case EOW:
 707                         if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
 708                                 return NOTFOUND;
 709                         break;
 710                 case REF:
 711                         n = *ap++;
 712                         bp = bopat[n];
 713                         ep = eopat[n];
 714                         while (bp < ep)
 715                                 if (ci.CharAt(bp++) != ci.CharAt(lp++))
 716                                         return NOTFOUND;
 717                         break;
 718                 case CLO:
 719                         are = lp;
 720                         switch(*ap) {
 721
 722                         case ANY:
 723                                 while (lp < endp)
 724                                         lp++;
 725                                 n = ANYSKIP;
 726                                 break;
 727                         case CHR:
 728                                 c = *(ap+1);
 729                                 while ((lp < endp) && (c == ci.CharAt(lp)))
 730                                         lp++;
 731                                 n = CHRSKIP;
 732                                 break;
 733                         case CCL:
 734                                 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
 735                                         lp++;
 736                                 n = CCLSKIP;
 737                                 break;
 738                         default:
 739                                 failure = true;
 740                                 //re_fail("closure: bad nfa.", *ap);
 741                                 return NOTFOUND;
 742                         }
 743
 744                         ap += n;
 745
 746                         while (lp >= are) {
 747                                 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
 748                                         return e;
 749                                 --lp;
 750                         }
 751                         return NOTFOUND;
 752                 default:
 753                         //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
 754                         return NOTFOUND;
 755                 }
 756         return lp;
 757 }
 758
 759 /*
 760  * RESearch::ModifyWord:
 761  *      add new characters into the word table to change RESearch::Execute's
 762  *      understanding of what a word should look like. Note that we
 763  *      only accept additions into the word definition.
 764  *
 765  *      If the string parameter is 0 or null string, the table is
 766  *      reset back to the default containing A-Z a-z 0-9 _. [We use
 767  *      the compact bitset representation for the default table]
 768  */
 769
 770 static char deftab[16] = {
 771         0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
 772         '\376', '\377', '\377', 007
 773 };
 774
 775 void RESearch::ModifyWord(char *s) {
 776         int i;
 777
 778         if (!s || !*s) {
 779                 for (i = 0; i < MAXCHR; i++)
 780                         if (!isinset(deftab,i))
 781                                 iswordc(i) = 0;
 782         }
 783         else
 784                 while(*s)
 785                         iswordc(*s++) = 1;
 786 }
 787
 788 /*
 789  * RESearch::Substitute:
 790  *      substitute the matched portions of the src in dst.
 791  *
 792  *      &       substitute the entire matched pattern.
 793  *
 794  *      \digit  substitute a subpattern, with the given tag number.
 795  *              Tags are numbered from 1 to 9. If the particular
 796  *              tagged subpattern does not exist, null is substituted.
 797  */
 798 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
 799         char c;
 800         int  pin;
 801         int bp;
 802         int ep;
 803
 804         if (!*src || !bopat[0])
 805                 return 0;
 806
 807         while ((c = *src++) != 0) {
 808                 switch(c) {
 809
 810                 case '&':
 811                         pin = 0;
 812                         break;
 813
 814                 case '\\':
 815                         c = *src++;
 816                         if (c >= '0' && c <= '9') {
 817                                 pin = c - '0';
 818                                 break;
 819                         }
 820
 821                 default:
 822                         *dst++ = c;
 823                         continue;
 824                 }
 825
 826                 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
 827                         while (ci.CharAt(bp) && bp < ep)
 828                                 *dst++ = ci.CharAt(bp++);
 829                         if (bp < ep)
 830                                 return 0;
 831                 }
 832         }
 833         *dst = (char) 0;
 834         return 1;
 835 }