*** empty log message ***
[anjuta-git-plugin.git] / scintilla / RESearch.cxx
blobb7ea71bfb9631d81d3b92743c6626b2dfda35cee
1 // Scintilla source code edit control
2 /** @file RESearch.cxx
3 ** Regular expression search library.
4 **/
6 /*
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
11 * York University
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
28 * matching module.
30 * Modification history removed.
32 * Interfaces:
33 * RESearch::Compile: compile a regular expression into a NFA.
35 * char *RESearch::Compile(s)
36 * char *s;
38 * RESearch::Execute: execute the NFA to match a pattern.
40 * int RESearch::Execute(s)
41 * char *s;
43 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
44 * looks like (for \< and \>) by adding into the
45 * hidden word-syntax table.
47 * void RESearch::ModifyWord(s)
48 * char *s;
50 * RESearch::Substitute: substitute the matched portions in a new string.
52 * int RESearch::Substitute(src, dst)
53 * char *src;
54 * char *dst;
56 * re_fail: failure routine for RESearch::Execute.
58 * void re_fail(msg, op)
59 * char *msg;
60 * char op;
62 * Regular Expressions:
64 * [1] char matches itself, unless it is a special
65 * character (metachar): . \ [ ] * + ^ $
67 * [2] . matches any character.
69 * [3] \ matches the character following it, except
70 * when followed by a left or right round bracket,
71 * a digit 1 to 9 or a left or right angle bracket.
72 * (see [7], [8] and [9])
73 * It is used as an escape character for all
74 * other meta-characters, and itself. When used
75 * in a set ([4]), it is treated as an ordinary
76 * character.
78 * [4] [set] matches one of the characters in the set.
79 * If the first character in the set is "^",
80 * it matches a character NOT in the set, i.e.
81 * complements the set. A shorthand S-E is
82 * used to specify a set of characters S upto
83 * E, inclusive. The special characters "]" and
84 * "-" have no special meaning if they appear
85 * as the first chars in the set.
86 * examples: match:
88 * [a-z] any lowercase alpha
90 * [^]-] any char except ] and -
92 * [^A-Z] any char except uppercase
93 * alpha
95 * [a-zA-Z] any alpha
97 * [5] * any regular expression form [1] to [4], followed by
98 * closure char (*) matches zero or more matches of
99 * that form.
101 * [6] + same as [5], except it matches one or more.
103 * [7] a regular expression in the form [1] to [10], enclosed
104 * as \(form\) matches what form matches. The enclosure
105 * creates a set of tags, used for [8] and for
106 * pattern substution. The tagged forms are numbered
107 * starting from 1.
109 * [8] a \ followed by a digit 1 to 9 matches whatever a
110 * previously tagged regular expression ([7]) matched.
112 * [9] \< a regular expression starting with a \< construct
113 * \> and/or ending with a \> construct, restricts the
114 * pattern matching to the beginning of a word, and/or
115 * the end of a word. A word is defined to be a character
116 * string beginning and/or ending with the characters
117 * A-Z a-z 0-9 and _. It must also be preceded and/or
118 * followed by any character outside those mentioned.
120 * [10] a composite regular expression xy where x and y
121 * are in the form [1] to [10] matches the longest
122 * match of x followed by a match for y.
124 * [11] ^ a regular expression starting with a ^ character
125 * $ and/or ending with a $ character, restricts the
126 * pattern matching to the beginning of the line,
127 * or the end of line. [anchors] Elsewhere in the
128 * pattern, ^ and $ are treated as ordinary characters.
131 * Acknowledgements:
133 * HCR's Hugh Redelmeier has been most helpful in various
134 * stages of development. He convinced me to include BOW
135 * and EOW constructs, originally invented by Rob Pike at
136 * the University of Toronto.
138 * References:
139 * Software tools Kernighan & Plauger
140 * Software tools in Pascal Kernighan & Plauger
141 * Grep [rsx-11 C dist] David Conroy
142 * ed - text editor Un*x Programmer's Manual
143 * Advanced editing on Un*x B. W. Kernighan
144 * RegExp routines Henry Spencer
146 * Notes:
148 * This implementation uses a bit-set representation for character
149 * classes for speed and compactness. Each character is represented
150 * by one bit in a 128-bit block. Thus, CCL always takes a
151 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
152 * bit comparison to locate the character in the set.
154 * Examples:
156 * pattern: foo*.*
157 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
158 * matches: fo foo fooo foobar fobar foxx ...
160 * pattern: fo[ob]a[rz]
161 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
162 * matches: fobar fooar fobaz fooaz
164 * pattern: foo\\+
165 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
166 * matches: foo\ foo\\ foo\\\ ...
168 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
169 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
170 * matches: foo1foo foo2foo foo3foo
172 * pattern: \(fo.*\)-\1
173 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
174 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
177 #include "RESearch.h"
179 #define OKP 1
180 #define NOP 0
182 #define CHR 1
183 #define ANY 2
184 #define CCL 3
185 #define BOL 4
186 #define EOL 5
187 #define BOT 6
188 #define EOT 7
189 #define BOW 8
190 #define EOW 9
191 #define REF 10
192 #define CLO 11
194 #define END 0
197 * The following defines are not meant to be changeable.
198 * They are for readability only.
200 #define BLKIND 0370
201 #define BITIND 07
203 #define ASCIIB 0177
205 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
207 #define badpat(x) (*nfa = END, x)
209 RESearch::RESearch() {
210 Init();
213 RESearch::~RESearch() {
214 Clear();
217 void RESearch::Init() {
218 sta = NOP; /* status of lastpat */
219 bol = 0;
220 for (int i=0; i<MAXTAG; i++)
221 pat[i] = 0;
222 for (int j=0; j<BITBLK; j++)
223 bittab[j] = 0;
226 void RESearch::Clear() {
227 for (int i=0; i<MAXTAG; i++) {
228 delete []pat[i];
229 pat[i] = 0;
230 bopat[i] = NOTFOUND;
231 eopat[i] = NOTFOUND;
235 bool RESearch::GrabMatches(CharacterIndexer &ci) {
236 bool success = true;
237 for (unsigned int i=0; i<MAXTAG; i++) {
238 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
239 unsigned int len = eopat[i] - bopat[i];
240 pat[i] = new char[len + 1];
241 if (pat[i]) {
242 for (unsigned int j=0; j<len; j++)
243 pat[i][j] = ci.CharAt(bopat[i] + j);
244 pat[i][len] = '\0';
245 } else {
246 success = false;
250 return success;
253 void RESearch::ChSet(char c) {
254 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
257 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
258 if (caseSensitive) {
259 ChSet(c);
260 } else {
261 if ((c >= 'a') && (c <= 'z')) {
262 ChSet(c);
263 ChSet(static_cast<char>(c - 'a' + 'A'));
264 } else if ((c >= 'A') && (c <= 'Z')) {
265 ChSet(c);
266 ChSet(static_cast<char>(c - 'A' + 'a'));
267 } else {
268 ChSet(c);
273 const char escapeValue(char ch) {
274 switch (ch) {
275 case 'a': return '\a';
276 case 'b': return '\b';
277 case 'f': return '\f';
278 case 'n': return '\n';
279 case 'r': return '\r';
280 case 't': return '\t';
281 case 'v': return '\v';
283 return 0;
286 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
287 char *mp=nfa; /* nfa pointer */
288 char *lp; /* saved pointer.. */
289 char *sp=nfa; /* another one.. */
290 char *mpMax = mp + MAXNFA - BITBLK - 10;
292 int tagi = 0; /* tag stack index */
293 int tagc = 1; /* actual tag count */
295 int n;
296 char mask; /* xor mask -CCL/NCL */
297 int c1, c2;
299 if (!pat || !length)
300 if (sta)
301 return 0;
302 else
303 return badpat("No previous regular expression");
304 sta = NOP;
306 const char *p=pat; /* pattern pointer */
307 for (int i=0; i<length; i++, p++) {
308 if (mp > mpMax)
309 return badpat("Pattern too long");
310 lp = mp;
311 switch(*p) {
313 case '.': /* match any char.. */
314 *mp++ = ANY;
315 break;
317 case '^': /* match beginning.. */
318 if (p == pat)
319 *mp++ = BOL;
320 else {
321 *mp++ = CHR;
322 *mp++ = *p;
324 break;
326 case '$': /* match endofline.. */
327 if (!*(p+1))
328 *mp++ = EOL;
329 else {
330 *mp++ = CHR;
331 *mp++ = *p;
333 break;
335 case '[': /* match char class..*/
336 *mp++ = CCL;
338 i++;
339 if (*++p == '^') {
340 mask = '\377';
341 i++;
342 p++;
343 } else
344 mask = 0;
346 if (*p == '-') { /* real dash */
347 i++;
348 ChSet(*p++);
350 if (*p == ']') { /* real brace */
351 i++;
352 ChSet(*p++);
354 while (*p && *p != ']') {
355 if (*p == '-' && *(p+1) && *(p+1) != ']') {
356 i++;
357 p++;
358 c1 = *(p-2) + 1;
359 i++;
360 c2 = *p++;
361 while (c1 <= c2) {
362 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
364 } else if (*p == '\\' && *(p+1)) {
365 i++;
366 p++;
367 char escape = escapeValue(*p);
368 if (escape)
369 ChSetWithCase(escape, caseSensitive);
370 else
371 ChSetWithCase(*p, caseSensitive);
372 i++;
373 p++;
374 } else {
375 i++;
376 ChSetWithCase(*p++, caseSensitive);
379 if (!*p)
380 return badpat("Missing ]");
382 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
383 *mp++ = static_cast<char>(mask ^ bittab[n]);
385 break;
387 case '*': /* match 0 or more.. */
388 case '+': /* match 1 or more.. */
389 if (p == pat)
390 return badpat("Empty closure");
391 lp = sp; /* previous opcode */
392 if (*lp == CLO) /* equivalence.. */
393 break;
394 switch(*lp) {
396 case BOL:
397 case BOT:
398 case EOT:
399 case BOW:
400 case EOW:
401 case REF:
402 return badpat("Illegal closure");
403 default:
404 break;
407 if (*p == '+')
408 for (sp = mp; lp < sp; lp++)
409 *mp++ = *lp;
411 *mp++ = END;
412 *mp++ = END;
413 sp = mp;
414 while (--mp > lp)
415 *mp = mp[-1];
416 *mp = CLO;
417 mp = sp;
418 break;
420 case '\\': /* tags, backrefs .. */
421 i++;
422 switch(*++p) {
424 case '<':
425 *mp++ = BOW;
426 break;
427 case '>':
428 if (*sp == BOW)
429 return badpat("Null pattern inside \\<\\>");
430 *mp++ = EOW;
431 break;
432 case '1':
433 case '2':
434 case '3':
435 case '4':
436 case '5':
437 case '6':
438 case '7':
439 case '8':
440 case '9':
441 n = *p-'0';
442 if (tagi > 0 && tagstk[tagi] == n)
443 return badpat("Cyclical reference");
444 if (tagc > n) {
445 *mp++ = static_cast<char>(REF);
446 *mp++ = static_cast<char>(n);
448 else
449 return badpat("Undetermined reference");
450 break;
451 case 'a':
452 case 'b':
453 case 'n':
454 case 'f':
455 case 'r':
456 case 't':
457 case 'v':
458 *mp++ = CHR;
459 *mp++ = escapeValue(*p);
460 break;
461 default:
462 if (!posix && *p == '(') {
463 if (tagc < MAXTAG) {
464 tagstk[++tagi] = tagc;
465 *mp++ = BOT;
466 *mp++ = static_cast<char>(tagc++);
468 else
469 return badpat("Too many \\(\\) pairs");
470 } else if (!posix && *p == ')') {
471 if (*sp == BOT)
472 return badpat("Null pattern inside \\(\\)");
473 if (tagi > 0) {
474 *mp++ = static_cast<char>(EOT);
475 *mp++ = static_cast<char>(tagstk[tagi--]);
477 else
478 return badpat("Unmatched \\)");
479 } else {
480 *mp++ = CHR;
481 *mp++ = *p;
484 break;
486 default : /* an ordinary char */
487 if (posix && *p == '(') {
488 if (tagc < MAXTAG) {
489 tagstk[++tagi] = tagc;
490 *mp++ = BOT;
491 *mp++ = static_cast<char>(tagc++);
493 else
494 return badpat("Too many () pairs");
495 } else if (posix && *p == ')') {
496 if (*sp == BOT)
497 return badpat("Null pattern inside ()");
498 if (tagi > 0) {
499 *mp++ = static_cast<char>(EOT);
500 *mp++ = static_cast<char>(tagstk[tagi--]);
502 else
503 return badpat("Unmatched )");
504 } else if (caseSensitive) {
505 *mp++ = CHR;
506 *mp++ = *p;
507 } else {
508 *mp++ = CCL;
509 mask = 0;
510 ChSetWithCase(*p, false);
511 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
512 *mp++ = static_cast<char>(mask ^ bittab[n]);
514 break;
516 sp = lp;
518 if (tagi > 0)
519 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
520 *mp = END;
521 sta = OKP;
522 return 0;
526 * RESearch::Execute:
527 * execute nfa to find a match.
529 * special cases: (nfa[0])
530 * BOL
531 * Match only once, starting from the
532 * beginning.
533 * CHR
534 * First locate the character without
535 * calling PMatch, and if found, call
536 * PMatch for the remaining string.
537 * END
538 * RESearch::Compile failed, poor luser did not
539 * check for it. Fail fast.
541 * If a match is found, bopat[0] and eopat[0] are set
542 * to the beginning and the end of the matched fragment,
543 * respectively.
547 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
548 char c;
549 int ep = NOTFOUND;
550 char *ap = nfa;
552 bol = lp;
553 failure = 0;
555 Clear();
557 switch(*ap) {
559 case BOL: /* anchored: match from BOL only */
560 ep = PMatch(ci, lp, endp, ap);
561 break;
562 case EOL: /* just searching for end of line normal path doesn't work */
563 if (*(ap+1) == END) {
564 lp = endp;
565 ep = lp;
566 break;
567 } else {
568 return 0;
570 case CHR: /* ordinary char: locate it fast */
571 c = *(ap+1);
572 while ((lp < endp) && (ci.CharAt(lp) != c))
573 lp++;
574 if (lp >= endp) /* if EOS, fail, else fall thru. */
575 return 0;
576 default: /* regular matching all the way. */
577 while (lp < endp) {
578 ep = PMatch(ci, lp, endp, ap);
579 if (ep != NOTFOUND)
580 break;
581 lp++;
583 break;
584 case END: /* munged automaton. fail always */
585 return 0;
587 if (ep == NOTFOUND)
588 return 0;
590 bopat[0] = lp;
591 eopat[0] = ep;
592 return 1;
596 * PMatch: internal routine for the hard part
598 * This code is partly snarfed from an early grep written by
599 * David Conroy. The backref and tag stuff, and various other
600 * innovations are by oz.
602 * special case optimizations: (nfa[n], nfa[n+1])
603 * CLO ANY
604 * We KNOW .* will match everything upto the
605 * end of line. Thus, directly go to the end of
606 * line, without recursive PMatch calls. As in
607 * the other closure cases, the remaining pattern
608 * must be matched by moving backwards on the
609 * string recursively, to find a match for xy
610 * (x is ".*" and y is the remaining pattern)
611 * where the match satisfies the LONGEST match for
612 * x followed by a match for y.
613 * CLO CHR
614 * We can again scan the string forward for the
615 * single char and at the point of failure, we
616 * execute the remaining nfa recursively, same as
617 * above.
619 * At the end of a successful match, bopat[n] and eopat[n]
620 * are set to the beginning and end of subpatterns matched
621 * by tagged expressions (n = 1 to 9).
625 extern void re_fail(char *,char);
628 * character classification table for word boundary operators BOW
629 * and EOW. the reason for not using ctype macros is that we can
630 * let the user add into our own table. see RESearch::ModifyWord. This table
631 * is not in the bitset form, since we may wish to extend it in the
632 * future for other character classifications.
634 * TRUE for 0-9 A-Z a-z _
636 static char chrtyp[MAXCHR] = {
637 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
641 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
642 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
643 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
644 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
645 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
646 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
647 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
648 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
649 1, 1, 1, 0, 0, 0, 0, 0
652 #define inascii(x) (0177&(x))
653 #define iswordc(x) chrtyp[inascii(x)]
654 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
657 * skip values for CLO XXX to skip past the closure
660 #define ANYSKIP 2 /* [CLO] ANY END ... */
661 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
662 #define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
664 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
665 int op, c, n;
666 int e; /* extra pointer for CLO */
667 int bp; /* beginning of subpat.. */
668 int ep; /* ending of subpat.. */
669 int are; /* to save the line ptr. */
671 while ((op = *ap++) != END)
672 switch(op) {
674 case CHR:
675 if (ci.CharAt(lp++) != *ap++)
676 return NOTFOUND;
677 break;
678 case ANY:
679 if (lp++ >= endp)
680 return NOTFOUND;
681 break;
682 case CCL:
683 c = ci.CharAt(lp++);
684 if (!isinset(ap,c))
685 return NOTFOUND;
686 ap += BITBLK;
687 break;
688 case BOL:
689 if (lp != bol)
690 return NOTFOUND;
691 break;
692 case EOL:
693 if (lp < endp)
694 return NOTFOUND;
695 break;
696 case BOT:
697 bopat[*ap++] = lp;
698 break;
699 case EOT:
700 eopat[*ap++] = lp;
701 break;
702 case BOW:
703 if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
704 return NOTFOUND;
705 break;
706 case EOW:
707 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
708 return NOTFOUND;
709 break;
710 case REF:
711 n = *ap++;
712 bp = bopat[n];
713 ep = eopat[n];
714 while (bp < ep)
715 if (ci.CharAt(bp++) != ci.CharAt(lp++))
716 return NOTFOUND;
717 break;
718 case CLO:
719 are = lp;
720 switch(*ap) {
722 case ANY:
723 while (lp < endp)
724 lp++;
725 n = ANYSKIP;
726 break;
727 case CHR:
728 c = *(ap+1);
729 while ((lp < endp) && (c == ci.CharAt(lp)))
730 lp++;
731 n = CHRSKIP;
732 break;
733 case CCL:
734 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
735 lp++;
736 n = CCLSKIP;
737 break;
738 default:
739 failure = true;
740 //re_fail("closure: bad nfa.", *ap);
741 return NOTFOUND;
744 ap += n;
746 while (lp >= are) {
747 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
748 return e;
749 --lp;
751 return NOTFOUND;
752 default:
753 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
754 return NOTFOUND;
756 return lp;
760 * RESearch::ModifyWord:
761 * add new characters into the word table to change RESearch::Execute's
762 * understanding of what a word should look like. Note that we
763 * only accept additions into the word definition.
765 * If the string parameter is 0 or null string, the table is
766 * reset back to the default containing A-Z a-z 0-9 _. [We use
767 * the compact bitset representation for the default table]
770 static char deftab[16] = {
771 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
772 '\376', '\377', '\377', 007
775 void RESearch::ModifyWord(char *s) {
776 int i;
778 if (!s || !*s) {
779 for (i = 0; i < MAXCHR; i++)
780 if (!isinset(deftab,i))
781 iswordc(i) = 0;
783 else
784 while(*s)
785 iswordc(*s++) = 1;
789 * RESearch::Substitute:
790 * substitute the matched portions of the src in dst.
792 * & substitute the entire matched pattern.
794 * \digit substitute a subpattern, with the given tag number.
795 * Tags are numbered from 1 to 9. If the particular
796 * tagged subpattern does not exist, null is substituted.
798 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
799 char c;
800 int pin;
801 int bp;
802 int ep;
804 if (!*src || !bopat[0])
805 return 0;
807 while ((c = *src++) != 0) {
808 switch(c) {
810 case '&':
811 pin = 0;
812 break;
814 case '\\':
815 c = *src++;
816 if (c >= '0' && c <= '9') {
817 pin = c - '0';
818 break;
821 default:
822 *dst++ = c;
823 continue;
826 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
827 while (ci.CharAt(bp) && bp < ep)
828 *dst++ = ci.CharAt(bp++);
829 if (bp < ep)
830 return 0;
833 *dst = (char) 0;
834 return 1;