gcc config
[prop.git] / lib-src / strings / regexp.cc
blobdb82c6629245e2759c3479bdb3315a2d8a4fa295
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #include <stddef.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <AD/generic/generic.h> // Generic definitions
30 #include <AD/contain/charset.h> // Character set
31 #include <AD/strings/regexp.h> // Regular expression search
33 // Local variables
34 static CharSet alpha; // set of alpha characters
35 static CharSet numeric; // set of numeric characters
36 static CharSet space; // set of space characters
37 static CharSet word; // set of word characters
39 static void init()
40 { for (register int c = 0; c < 256; c++) {
41 if (isalpha(c)) alpha += c;
42 if (isdigit(c)) numeric += c;
43 if (isspace(c)) space += c;
44 if (isalnum(c)) word += c;
46 word += '_';
50 // The set of opcodes used by the NFA matching machine
53 enum { SUCCESS = 0, // accept state
54 BEGIN_OF_LINE = 1, // match the beginning of a line
55 END_OF_LINE = 2, // match the end of a line
56 WILD_CARD = 3, // matches anything besides '\n'
57 MATCH_LITERAL = 4, // matches a literal string
58 MATCH_SET = 5, // matches any character in a set
59 MATCH_CHAR_STAR = 6, // matches a character zero or more times
60 MATCH_CHAR_PLUS = 7, // matches a character one or more times
61 MATCH_SET_STAR = 8, // matches a set zero or more times
62 MATCH_SET_PLUS = 9, // matches a set one or more times
63 OR = 10, // disjunction, branch unlikely
64 PLUS = 11, // disjunction, branch likely
65 JUMP = 12, // unconditional jump
66 TIMES = 13, // match a subexpression m to n times.
67 MATCH_BOUND = 14, // match word boundary
68 MATCH_NON_BOUND = 15, // match non-word boundary
69 MATCH_WORD_BEGIN = 16, // match beginning of a word
70 MATCH_WORD_END = 17, // match end of a word
71 REF = 20, // matches a backward reference
72 OPEN = 30, // beginning of a reference
73 CLOSE = 40 // end of a reference
76 RegExp::RegExp(const RegExp& re) : nfa(0) { *this = re; }
78 RegExp& RegExp::operator = (const RegExp& re)
79 { if (this != &re) {
80 delete [] nfa; nfa = NULL;
81 if (re.nfa) {
82 nfa = new Opcode [size = re.size];
83 memcpy(nfa,re.nfa,sizeof(Opcode) * size);
84 minMatchLen = re.minMatchLen;
87 return *this;
91 // The main pattern matcher, returns the character following
92 // the entire matched text if successful; otherwise, returns NULL
95 const char * RegExp::grep( register const Opcode * PC, // current opcode
96 register const char * text, // current character
97 register const char * textEnd // end of text string
99 { register char c = *text++; // cache the first character
100 int n, len, minTimes, times;
102 for (;;) {
103 switch (*PC++) {
104 case SUCCESS: return text-1;
105 case BEGIN_OF_LINE: if (text - 1 != myText) return NULL; break;
106 case END_OF_LINE:
107 if (text > textEnd) break;
108 if (c == '\n') { c = *text++; break; }
109 return NULL;
110 case WILD_CARD:
111 if (text > textEnd || c == '\n') return NULL;
112 c = *text++; break;
113 case OPEN+0: case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4:
114 case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9:
115 left[PC[-1] - OPEN] = text - myText - 1; break;
116 case CLOSE+0: case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4:
117 case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9:
118 right[PC[-1] - CLOSE] = text - myText - 2; break;
119 case REF+0: case REF+1: case REF+2: case REF+3: case REF+4:
120 case REF+5: case REF+6: case REF+7: case REF+8: case REF+9:
121 n = PC[-1] - REF; len = right[n] - left[n];
122 if (text+len >= textEnd || len < 0 ||
123 memcmp(myText+left[n],text-1,len))
124 return NULL;
125 text += len - 1; c = *text++;
126 break;
128 case MATCH_LITERAL:
129 len = *PC++ - 1;
130 if (c != *PC++ || text + len > textEnd || memcmp(text,PC,len))
131 return NULL;
132 PC += len; text += len; c = *text++;
133 break;
135 case MATCH_SET:
136 if (text > textEnd ||
137 bitOf((Byte *)PC,(unsigned char)c) == 0) return NULL;
138 PC += 32; c = *text++; break;
140 case MATCH_CHAR_PLUS: minTimes = 1; goto Match_char;
141 case MATCH_CHAR_STAR: minTimes = 0;
142 Match_char:
143 { c = PC[1]; PC += 2;
144 for (times = 0, text--; text < textEnd; text++, times++)
145 if (*text != c) break;
146 for ( ;times >= minTimes; times--, text--) {
147 const char * matchEnd = this->grep(PC,text,textEnd);
148 if (matchEnd) return matchEnd;
150 return NULL;
152 case MATCH_SET_PLUS: minTimes = 1; goto Match_set;
153 case MATCH_SET_STAR: minTimes = 0;
154 Match_set:
155 { for (times = 0, text--; text < textEnd; text++, times++)
156 if (bitOf((Byte *)PC,(unsigned char)*text) == 0) break;
157 for ( PC += 32; times >= minTimes; times--, text--) {
158 const char * matchEnd = this->grep(PC,text,textEnd);
159 if (matchEnd) return matchEnd;
161 return NULL;
164 case TIMES: // Try subpattern m to n times
165 return this->times(PC[0], PC[1], text-1, textEnd, PC+2);
166 case MATCH_WORD_BEGIN: // Beginning of a word??
167 if ( text > textEnd || ! word[c] ||
168 text - 1 > myText && word[text[-2]])
169 return NULL;
170 break;
171 case MATCH_WORD_END: // End of a word??
172 if ( text <= textEnd && ! word[c] ||
173 text < textEnd && word[c]) return NULL;
174 break;
175 case MATCH_BOUND: // A word boundary??
176 if ( (text > textEnd || ! word[c]) &&
177 (text - 2 < myText || ! word[text[-2]]) ||
178 text <= textEnd && word[c] &&
179 text - 2 >= myText && word[text[-2]] )
180 return NULL;
181 break;
182 case MATCH_NON_BOUND: // Not a word boundary??
183 if ( text <= textEnd && word[c] &&
184 (text - 2 < myText || ! word[text[-2]]) ||
185 (text > textEnd || ! word[c]) &&
186 text - 2 >= myText && word[text[-2]] )
187 return NULL;
188 break;
189 case OR: // Try one alternate, then another
190 { const char * success = this->grep(PC+2,text-1,textEnd);
191 if (success) return success; else PC += PC[0] * 256 + PC[1];
192 } break;
193 case PLUS: // Try one alternate, then another
194 { const char * good =
195 this->grep(PC+PC[0] * 256 + PC[1],text-1,textEnd);
196 if (good) return good; else PC += 2;
197 } break;
198 case JUMP: // Unconditional jump
199 PC += PC[0] * 256 + PC[1]; break;
200 default: return NULL;
203 return NULL;
207 // Call and matches a subpattern |minTimes| to |maxTimes| times.
208 // Notice that |maxTimes < 0| means that the maximum times is infinite.
211 const char * RegExp::times( int minTimes,
212 int maxTimes,
213 const Opcode * PC,
214 const char * text,
215 const char * textEnd
217 { const char * matched;
218 while (minTimes >= 0 && maxTimes >= 0) {
219 if (maxTimes == 1) return this->grep(PC,text,textEnd);
220 else if (maxTimes < 0)
221 matched = this->times(minTimes-1,-1,PC,text,textEnd);
222 else
223 matched = this->times(minTimes-1,maxTimes-1,PC,text,textEnd);
224 if (matched == NULL) return NULL;
225 if (minTimes < 0) return matched;
226 text = matched;
228 return NULL;
232 // Main entry point of the pattern matcher
235 int RegExp::Match(register const char * text, int length)
237 if (nfa == NULL) return -1;
238 if (length < 0) length = strlen(text);
239 register int c;
241 myText = text;
242 const char * textEnd = text + length;
243 register const char * limit;
244 if (minMatchLen > 0) limit = textEnd - minMatchLen + 1;
245 else limit = textEnd;
247 switch (*nfa) {
248 case BEGIN_OF_LINE: // Anchored match
249 if ((matchedTextEnd = this->grep(nfa+1,text,textEnd))) {
250 matchedText = text;
251 return matchedText - myText;
253 return -1;
254 case MATCH_LITERAL: // Starts with a literal
255 { int len = nfa[1];
256 for ( c = nfa[2]; text < limit; text++)
257 if (c == *text && text + len <= textEnd &&
258 memcmp(text,nfa+2,len) == 0)
259 if ((matchedTextEnd = this->grep(nfa+len+2,text+len,textEnd))) {
260 matchedText = text;
261 return matchedText - myText;
263 return -1;
265 case MATCH_CHAR_PLUS: // Starts with a character
266 for ( c = nfa[2]; text < limit; text++)
267 if (c == *text)
268 if ((matchedTextEnd = this->grep(nfa,text,textEnd))) {
269 matchedText = text;
270 return matchedText - myText;
272 return -1;
273 case MATCH_SET: case MATCH_SET_PLUS: // Starts with a set
274 { register Byte * set = (Byte*)(nfa+1);
275 for ( ; text < limit; text++)
276 if (bitOf(set,(unsigned char)*text))
277 if ((matchedTextEnd = this->grep(nfa,text,textEnd))) {
278 matchedText = text;
279 return matchedText - myText;
281 return -1;
283 default: // Can start with anything
284 for ( ;text < limit; text++)
285 if ((matchedTextEnd = this->grep(nfa,text,textEnd))) {
286 matchedText = text;
287 return matchedText - myText;
289 return -1;
294 // Compile a new pattern
297 RegExp& RegExp::compile(const char * pattern, const char * meta)
299 delete [] nfa; // delete last compiled pattern, if any
300 nfa = NULL;
301 if (meta == NULL) // set default meta characters
302 meta = "*+()[]?^$-.|\\\0";
304 MetaChar M[256]; // meta character assignment
306 if (! space[' ']) init(); // initialize, if necessary
309 // Compute the meta character assignment
311 for (int i = 0; i < 256; i++) M[i] = META_NONE;
312 for (int j = META_STAR; j <= META_EOP; j++) M[ meta[j] ] = (MetaChar)j;
314 size = this->comp(pattern, M, false); // compile it once to get the size
316 if (size > 0) {
317 nfa = new Opcode [size]; // allocate new storage
318 this->comp(pattern, M, true); // compile it again to emit opcodes
319 minMatchLen = this->minMatch(nfa); // compute the minimal matching len
322 return *this;
325 inline void shift(register RegExp::Opcode * start,
326 register RegExp::Opcode * end,
327 register int size)
329 while (end > start) { end--; end[size] = end[0]; }
333 // ``Parse'' pattern and compile the NFA
336 int RegExp::comp(const char * pattern, const MetaChar M[], Bool emit)
338 register const char * p; // pointer to pattern
339 register char c; // cached pattern character
340 register int size = 0; // size of the nfa
341 register Opcode * PC = nfa; // nfa opcodes
342 Opcode * last; // last opcode
343 CharSet set; // a character set
344 int level = 0; // nesting level
345 Opcode * start[10]; // starting point for subpattern
346 int parenCount = 0; // parenthesis count
347 int paren[10]; // closing parenthesis number
348 Bool complement = false; // set complement??
350 start[0] = PC; // starting point of first pattern
352 for (last = NULL, p = pattern; M[c = *p++] != META_EOP; ) {
353 switch (M[c]) {
354 case META_STAR:
355 if (emit) {
356 if (last) {
357 switch (last[0]) {
358 case MATCH_LITERAL:
359 if (last[1] == 1) last[0] = MATCH_CHAR_STAR;
360 else goto Emit_star; break;
361 case MATCH_CHAR_PLUS: last[0] = MATCH_CHAR_STAR; break;
362 case MATCH_CHAR_STAR: break;
363 case MATCH_SET:
364 case MATCH_SET_PLUS: last[0] = MATCH_SET_STAR; break;
365 case MATCH_SET_STAR: break;
366 default: Emit_star:
367 *PC++ = JUMP;
368 *PC++ = (last - PC - 2) / 256;
369 *PC++ = (last - PC - 2) % 256;
370 shift(last,PC,3);
371 last[0] = OR;
372 last[1] = (PC - last + 2) / 256;
373 last[2] = (PC - last + 2) % 256;
374 PC += 3;
375 break;
377 } else goto Syntax_Error;
378 } else size += 6; break;
379 case META_PLUS:
380 if (emit) {
381 if (last) {
382 switch (last[0]) {
383 case MATCH_LITERAL:
384 if (last[1] == 1) last[0] = MATCH_CHAR_PLUS;
385 else goto Emit_plus; break;
386 case MATCH_CHAR_PLUS:
387 case MATCH_CHAR_STAR:
388 case MATCH_SET_PLUS:
389 case MATCH_SET_STAR: break;
390 case MATCH_SET: last[0] = MATCH_SET_PLUS; break;
391 default: Emit_plus:
392 *PC++ = PLUS;
393 *PC++ = (last - PC + 1) / 256;
394 *PC++ = (last - PC + 1) % 256;
395 break;
397 } else goto Syntax_Error;
398 } else size += 3; break;
399 case META_OPTIONAL:
400 if (emit) {
401 if (last == NULL) goto Syntax_Error;
402 shift(last,PC,3);
403 last[0] = OR;
404 last[1] = (PC - last + 2) / 256;
405 last[2] = (PC - last + 2) % 256;
406 PC += 3;
407 } else size += 3;
408 break;
409 case META_OR:
410 if (emit) {
411 *PC++ = SUCCESS;
412 shift(start[level],PC,3);
413 start[level][0] = OR;
414 start[level][1] = (PC - start[level] + 2) / 256;
415 start[level][2] = (PC - start[level] + 2) % 256;
416 PC += 3;
417 } else size += 4;
418 break;
419 case META_OPEN:
420 if (emit) { last = PC; *PC = OPEN + parenCount; } else size++;
421 paren[level] = parenCount;
422 if (++parenCount >= 10) goto Syntax_Error;
423 if (++level >= 10) goto Syntax_Error;
424 start[level] = PC++;
425 break;
426 case META_CLOSE:
427 last = start[level];
428 if (--level < 0) goto Syntax_Error;
429 if (emit) { *PC++ = CLOSE + paren[level]; }
430 else size++;
431 break;
432 case META_BEGIN:
433 last = PC; if (emit) *PC++ = BEGIN_OF_LINE; else size++; break;
434 case META_END:
435 last = PC; if (emit) *PC++ = END_OF_LINE; else size++; break;
436 case META_WILD:
437 last = PC; if (emit) *PC++ = WILD_CARD; else size++; break;
438 case META_SET_OPEN:
439 { set.clear();
440 const char * start = p;
441 int lastChar = -1;
442 while (M[c = *p++] != META_SET_CLOSE) {
443 if (c == '^' && p-1 == start) { complement = true; continue; }
444 switch (M[c]) {
445 case META_EOP: goto Syntax_Error;
446 case META_RANGE: lastChar = p[-2]; break;
447 case META_ESCAPE:
448 switch (c = *p++) {
449 case 'n': set += '\n'; break;
450 case 't': set += '\t'; break;
451 case '0': case '1':
452 const char * q;
453 for (q = p, c -= 0;
454 *p >= '0' && *p <= '7' && p - q <= 2; p++)
455 c = c * 8 + *p - '0';
456 set += c; break;
457 case 'd': set += numeric; break;
458 case 'D': set += ! numeric; break;
459 case 's': set += space; break;
460 case 'S': set += ! space; break;
461 case 'w': set += word; break;
462 case 'W': set += ! word; break;
463 default: goto Emit_set_char;
464 } break;
465 default: Emit_set_char:
466 if (lastChar >= 0)
467 for ( ;lastChar <= c; lastChar++) set += lastChar;
468 else
469 set += c;
470 lastChar = -1;
471 break;
475 Emit_match_set:
476 { if (emit) {
477 if (complement) set = ! set;
478 last = PC;
479 *PC++ = MATCH_SET;
480 memcpy(PC,(const Byte *)set,32);
481 PC += 32;
482 complement = false;
483 } else size += 33;
485 break;
486 case META_ESCAPE:
487 switch (c = *p++) {
488 case 'n': c = '\n'; goto Emit_match_char; // new line
489 case 't': c = '\t'; goto Emit_match_char; // tab
490 case '0': case '1':
491 { const char * q;
492 for (q = p, c -= '0'; *p >= '0' && *p <= '7' &&
493 p - q <= 2; p++)
494 c = c * 8 + *p - '0';
495 goto Emit_match_char;
497 case 'd': set = numeric; goto Emit_match_set;
498 case 'D': set = ! numeric; goto Emit_match_set;
499 case 'w': set = word; goto Emit_match_set;
500 case 'W': set = ! word; goto Emit_match_set;
501 case 's': set = space; goto Emit_match_set;
502 case 'S': set = ! space; goto Emit_match_set;
503 case 'b': if (emit) *PC++ = MATCH_BOUND; else size++; break;
504 case 'B': if (emit) *PC++ = MATCH_NON_BOUND; else size++; break;
505 case '<': if (emit) *PC++ = MATCH_WORD_BEGIN; else size++; break;
506 case '>': if (emit) *PC++ = MATCH_WORD_END; else size++; break;
507 default: goto Emit_match_char;
508 } break;
509 case META_NONE: Emit_match_char:
510 if (emit) {
511 if (last && *last == MATCH_LITERAL &&
512 M[*p] != META_STAR && M[*p] != META_PLUS &&
513 M[*p] != META_OPTIONAL) {
514 last[1]++;
515 *PC++ = c;
516 } else {
517 last = PC;
518 *PC++ = MATCH_LITERAL;
519 *PC++ = 1;
520 *PC++ = c;
522 } else size += 3;
523 break;
524 default: break;
528 if (emit) *PC++ = SUCCESS;
530 return size+1;
532 Syntax_Error:
534 return -1;
538 // Traverse the opcode and compute the minimal length of the matched text
541 int RegExp::minMatch(register const Opcode * PC)
542 { register int len = 0;
543 for (;;) {
544 switch (*PC++) {
545 case SUCCESS: return len;
546 case WILD_CARD: len++; break;
547 case MATCH_CHAR_STAR: PC += 2; break;
548 case MATCH_CHAR_PLUS: len++; PC += 2; break;
549 case MATCH_SET_STAR: PC += 32; break;
550 case MATCH_SET_PLUS:
551 case MATCH_SET: len++; PC += 32; break;
552 case MATCH_LITERAL: len += *PC; PC += *PC + 1; break;
553 case OR:
554 { int len1 = this->minMatch(PC + PC[0] * 256 + PC[1]);
555 int len2 = this->minMatch(PC + 2);
556 return len + (len1 < len2 ? len1 : len2);
558 case TIMES: PC += 2; break;
559 case MATCH_BOUND: case MATCH_NON_BOUND:
560 case MATCH_WORD_BEGIN: case MATCH_WORD_END:
561 case BEGIN_OF_LINE: case END_OF_LINE:
562 case REF+0: case REF+1: case REF+2: case REF+3: case REF+4:
563 case REF+5: case REF+6: case REF+7: case REF+8: case REF+9:
564 break;
565 default: return len;