libc/regex: Fix a reference of an uninitialized variable.
[dragonfly.git] / contrib / tre / lib / tre-match-backtrack.c
blob24ed48cfdaf1cce35b0c35de8a66f9b07e93a578
1 /*
2 tre-match-backtrack.c - TRE backtracking regex matching engine
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
7 */
9 /*
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
37 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
38 info while running */
39 #undef TRE_USE_ALLOCA
41 #ifdef TRE_USE_ALLOCA
42 /* AIX requires this to be the first thing in the file. */
43 #ifndef __GNUC__
44 # if HAVE_ALLOCA_H
45 # include <alloca.h>
46 # else
47 # ifdef _AIX
48 #pragma alloca
49 # else
50 # ifndef alloca /* predefined by HP cc +Olibcalls */
51 char *alloca ();
52 # endif
53 # endif
54 # endif
55 #endif
56 #endif /* TRE_USE_ALLOCA */
58 #include <assert.h>
59 #include <stdlib.h>
60 #include <string.h>
61 #ifdef HAVE_WCHAR_H
62 #include <wchar.h>
63 #endif /* HAVE_WCHAR_H */
64 #ifdef HAVE_WCTYPE_H
65 #include <wctype.h>
66 #endif /* HAVE_WCTYPE_H */
67 #ifndef TRE_WCHAR
68 #include <ctype.h>
69 #endif /* !TRE_WCHAR */
70 #ifdef HAVE_MALLOC_H
71 #include <malloc.h>
72 #endif /* HAVE_MALLOC_H */
74 #include "tre-internal.h"
75 #include "tre-mem.h"
76 #include "tre-match-utils.h"
77 #include "tre.h"
78 #include "xmalloc.h"
80 typedef struct {
81 int pos;
82 unsigned int pos_add_next;
83 const char *str_byte;
84 #ifdef TRE_WCHAR
85 const wchar_t *str_wide;
86 #endif /* TRE_WCHAR */
87 tre_tnfa_transition_t *state;
88 int state_id;
89 int next_c;
90 tre_tag_t *tags;
91 #ifdef TRE_MBSTATE
92 mbstate_t mbstate;
93 #endif /* TRE_MBSTATE */
94 } tre_backtrack_item_t;
96 typedef struct tre_backtrack_struct {
97 tre_backtrack_item_t item;
98 struct tre_backtrack_struct *prev;
99 struct tre_backtrack_struct *next;
100 } *tre_backtrack_t;
102 #ifdef TRE_WCHAR
103 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
104 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
105 #else /* !TRE_WCHAR */
106 #define BT_STACK_WIDE_IN(_str_wide)
107 #define BT_STACK_WIDE_OUT
108 #endif /* !TRE_WCHAR */
110 #ifdef TRE_MBSTATE
111 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
112 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
113 #else /* !TRE_MBSTATE */
114 #define BT_STACK_MBSTATE_IN
115 #define BT_STACK_MBSTATE_OUT
116 #endif /* !TRE_MBSTATE */
119 #ifdef TRE_USE_ALLOCA
120 #define tre_bt_mem_new tre_mem_newa
121 #define tre_bt_mem_alloc tre_mem_alloca
122 #define tre_bt_mem_destroy(obj) do { } while (0)
123 #else /* !TRE_USE_ALLOCA */
124 #define tre_bt_mem_new tre_mem_new
125 #define tre_bt_mem_alloc tre_mem_alloc
126 #define tre_bt_mem_destroy tre_mem_destroy
127 #endif /* !TRE_USE_ALLOCA */
130 #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
131 do \
133 if (!stack->next) \
135 tre_backtrack_t s; \
136 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
137 if (!s) \
139 tre_bt_mem_destroy(mem); \
140 if (tags) \
141 xfree(tags); \
142 if (pmatch) \
143 xfree(pmatch); \
144 if (states_seen) \
145 xfree(states_seen); \
146 return REG_ESPACE; \
148 s->prev = stack; \
149 s->next = NULL; \
150 s->item.tags = tre_bt_mem_alloc(mem, \
151 num_tags * sizeof(*tags)); \
152 if (!s->item.tags) \
154 tre_bt_mem_destroy(mem); \
155 if (tags) \
156 xfree(tags); \
157 if (pmatch) \
158 xfree(pmatch); \
159 if (states_seen) \
160 xfree(states_seen); \
161 return REG_ESPACE; \
163 stack->next = s; \
164 stack = s; \
166 else \
167 stack = stack->next; \
168 stack->item.pos = (_pos); \
169 stack->item.pos_add_next = (_pos_add_next); \
170 stack->item.str_byte = (_str_byte); \
171 BT_STACK_WIDE_IN(_str_wide); \
172 stack->item.state = (_state); \
173 stack->item.state_id = (_state_id); \
174 stack->item.next_c = (_next_c); \
175 memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \
176 BT_STACK_MBSTATE_IN; \
178 while (/*CONSTCOND*/0)
180 #define BT_STACK_POP() \
181 do \
183 assert(stack->prev); \
184 pos = stack->item.pos; \
185 pos_add_next = stack->item.pos_add_next; \
186 str_byte = stack->item.str_byte; \
187 BT_STACK_WIDE_OUT; \
188 state = stack->item.state; \
189 next_c = stack->item.next_c; \
190 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
191 BT_STACK_MBSTATE_OUT; \
192 stack = stack->prev; \
194 while (/*CONSTCOND*/0)
196 #undef MIN
197 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
199 reg_errcode_t
200 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
201 int len, tre_str_type_t type, tre_tag_t *match_tags,
202 int eflags, int *match_end_ofs)
204 /* State variables required by GET_NEXT_WCHAR. */
205 tre_char_t prev_c = 0, next_c = 0;
206 const char *str_byte = string;
207 int pos = 0;
208 unsigned int pos_add_next = 1;
209 #ifdef TRE_WCHAR
210 const wchar_t *str_wide = string;
211 #ifdef TRE_MBSTATE
212 mbstate_t mbstate;
213 #endif /* TRE_MBSTATE */
214 #endif /* TRE_WCHAR */
215 int reg_notbol = eflags & REG_NOTBOL;
216 int reg_noteol = eflags & REG_NOTEOL;
217 int reg_newline = tnfa->cflags & REG_NEWLINE;
218 int i;
220 /* These are used to remember the necessary values of the above
221 variables to return to the position where the current search
222 started from. */
223 int next_c_start;
224 const char *str_byte_start;
225 int pos_start = -1;
226 #ifdef TRE_WCHAR
227 const wchar_t *str_wide_start;
228 #endif /* TRE_WCHAR */
229 #ifdef TRE_MBSTATE
230 mbstate_t mbstate_start;
231 #endif /* TRE_MBSTATE */
233 /* End offset of best match so far, or -1 if no match found yet. */
234 int match_eo = -1;
235 /* Tag arrays. */
236 int *next_tags;
237 tre_tag_t *tags = NULL;
238 /* Current TNFA state. */
239 tre_tnfa_transition_t *state;
240 int *states_seen = NULL;
242 /* Memory allocator to for allocating the backtracking stack. */
243 tre_mem_t mem = tre_bt_mem_new();
245 /* The backtracking stack. */
246 tre_backtrack_t stack;
248 tre_tnfa_transition_t *trans_i;
249 regmatch_t *pmatch = NULL;
250 reg_errcode_t ret;
252 int num_tags = tnfa->num_tags;
253 int touch = 1;
254 char *buf;
255 int tbytes;
257 #ifdef TRE_MBSTATE
258 memset(&mbstate, '\0', sizeof(mbstate));
259 #endif /* TRE_MBSTATE */
260 #ifndef TRE_USE_ALLOCA
261 buf = NULL;
262 #endif
264 if (!mem)
265 return REG_ESPACE;
266 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
267 if (!stack)
269 ret = REG_ESPACE;
270 goto error_exit;
272 stack->prev = NULL;
273 stack->next = NULL;
275 DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
276 DPRINT(("len = %d\n", len));
279 int pbytes, sbytes, total_bytes;
280 char *tmp_buf;
281 /* Compute the length of the block we need. */
282 tbytes = sizeof(*tags) * num_tags;
283 pbytes = sizeof(*pmatch) * tnfa->num_submatches;
284 sbytes = sizeof(*states_seen) * tnfa->num_states;
285 total_bytes =
286 (sizeof(long) - 1) * 2 /* for alignment paddings */
287 + tbytes + pbytes + sbytes;
289 DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes));
290 /* Allocate the memory. */
291 #ifdef TRE_USE_ALLOCA
292 buf = alloca(total_bytes);
293 #else /* !TRE_USE_ALLOCA */
294 buf = xmalloc((unsigned)total_bytes);
295 #endif /* !TRE_USE_ALLOCA */
296 if (buf == NULL)
297 return REG_ESPACE;
299 /* Get the various pointers within tmp_buf (properly aligned). */
300 tags = (void *)buf;
301 tmp_buf = buf + tbytes;
302 tmp_buf += ALIGN(tmp_buf, long);
303 pmatch = (void *)tmp_buf;
304 tmp_buf += pbytes;
305 tmp_buf += ALIGN(tmp_buf, long);
306 states_seen = (void *)tmp_buf;
309 retry:
311 memset(tags, 0, num_tags * sizeof(*tags));
312 if (match_tags)
313 memset(match_tags, 0, num_tags * sizeof(*match_tags));
314 for (i = 0; i < tnfa->num_states; i++)
315 states_seen[i] = 0;
318 state = NULL;
319 pos = pos_start;
320 GET_NEXT_WCHAR();
321 pos_start = pos;
322 next_c_start = next_c;
323 str_byte_start = str_byte;
324 #ifdef TRE_WCHAR
325 str_wide_start = str_wide;
326 #endif /* TRE_WCHAR */
327 #ifdef TRE_MBSTATE
328 mbstate_start = mbstate;
329 #endif /* TRE_MBSTATE */
331 /* Handle initial states. */
332 next_tags = NULL;
333 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
335 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
336 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
338 DPRINT(("assert failed\n"));
339 continue;
341 if (state == NULL)
343 /* Start from this state. */
344 state = trans_i->state;
345 next_tags = trans_i->tags;
347 else
349 /* Backtrack to this state. */
350 DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
351 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state,
352 trans_i->state_id, next_c, tags, mbstate);
354 int *tmp = trans_i->tags;
355 if (tmp)
357 while (*tmp >= 0)
358 tre_tag_set(stack->item.tags, *tmp++, pos, touch);
359 touch++;
365 if (next_tags)
367 for (; *next_tags >= 0; next_tags++)
368 tre_tag_set(tags, *next_tags, pos, touch);
369 touch++;
373 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
374 DPRINT(("pos:chr/code | state and tags\n"));
375 DPRINT(("-------------+------------------------------------------------\n"));
377 if (state == NULL)
378 goto backtrack;
380 while (/*CONSTCOND*/1)
382 tre_tnfa_transition_t *next_state;
383 int empty_br_match;
385 DPRINT(("start loop\n"));
387 if (match_eo >= 0 && tnfa->num_minimals)
389 int skip = 0;
390 #ifdef TRE_DEBUG
391 DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
392 match_eo));
393 tre_print_tags(match_tags, tnfa->num_tags);
394 DPRINT(("\n"));
395 #endif /* TRE_DEBUG */
396 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
398 int end = tnfa->minimal_tags[i];
399 int start = tnfa->minimal_tags[i + 1];
400 DPRINT((" Minimal start %d, end %d\n", start, end));
401 if (tre_minimal_tag_order(start, end, match_tags, tags) > 0)
403 skip = 1;
404 break;
407 if (!skip)
409 #ifdef TRE_DEBUG
410 DPRINT((" Keeping tags="));
411 tre_print_tags(tags, tnfa->num_tags);
412 DPRINT(("\n"));
413 #endif /* TRE_DEBUG */
415 else
417 #ifdef TRE_DEBUG
418 DPRINT((" Throwing out tags="));
419 tre_print_tags(tags, tnfa->num_tags);
420 DPRINT(("\n"));
421 #endif /* TRE_DEBUG */
422 goto backtrack;
426 if (state == tnfa->final)
428 DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos));
430 if (match_eo >= 0 && tnfa->num_minimals)
432 int compare = 0;
433 #ifdef TRE_DEBUG
434 DPRINT(("Checking minimal conditions: match_eo=%d "
435 "match_tags=", match_eo));
436 tre_print_tags(match_tags, tnfa->num_tags);
437 DPRINT(("\n"));
438 #endif /* TRE_DEBUG */
439 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
441 int end = tnfa->minimal_tags[i];
442 int start = tnfa->minimal_tags[i + 1];
443 DPRINT((" Minimal start %d, end %d\n", start, end));
444 if ((compare = tre_minimal_tag_order(start, end,
445 match_tags, tags)) != 0)
446 break;
448 if (compare > 0)
450 #ifdef TRE_DEBUG
451 DPRINT((" Throwing out new match, tags="));
452 tre_print_tags(tags, tnfa->num_tags);
453 DPRINT(("\n"));
454 #endif /* TRE_DEBUG */
455 goto backtrack;
457 else if (compare < 0)
459 #ifdef TRE_DEBUG
460 DPRINT((" Throwing out old match, tags="));
461 tre_print_tags(match_tags, tnfa->num_tags);
462 DPRINT(("\n"));
463 #endif /* TRE_DEBUG */
464 match_eo = -1;
468 if (match_eo < pos
469 || (match_eo == pos
470 && match_tags
471 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
472 tags, match_tags)))
474 /* This match wins the previous match. */
475 #ifdef TRE_DEBUG
476 DPRINT((" win previous tags="));
477 tre_print_tags(tags, tnfa->num_tags);
478 DPRINT(("\n"));
479 #endif /* TRE_DEBUG */
480 match_eo = pos;
481 if (match_tags)
482 memcpy(match_tags, tags, num_tags * sizeof(*tags));
484 /* Our TNFAs never have transitions leaving from the final state,
485 so we jump right to backtracking. */
486 goto backtrack;
489 #ifdef TRE_DEBUG
490 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
491 state));
492 tre_print_tags(tags, tnfa->num_tags);
493 DPRINT(("\n"));
494 #endif /* TRE_DEBUG */
496 /* Go to the next character in the input string. */
497 empty_br_match = 0;
498 trans_i = state;
499 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
501 /* This is a back reference state. All transitions leaving from
502 this state have the same back reference "assertion". Instead
503 of reading the next character, we match the back reference. */
504 int so, eo, bt = trans_i->u.backref;
505 int bt_len;
506 int result;
508 DPRINT((" should match back reference %d\n", bt));
509 /* Get the substring we need to match against. Remember to
510 turn off REG_NOSUB temporarily. */
511 ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
512 tnfa, tags, pos);
513 if (ret != REG_OK) goto error_exit;
514 so = pmatch[bt].rm_so;
515 eo = pmatch[bt].rm_eo;
516 bt_len = eo - so;
518 #ifdef TRE_DEBUG
520 int slen;
521 if (len < 0)
522 slen = bt_len;
523 else
524 slen = MIN(bt_len, len - pos);
526 if (type == STR_BYTE)
528 DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n",
529 bt_len, so, eo, bt_len, (char*)string + so));
530 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
532 #ifdef TRE_WCHAR
533 else if (type == STR_WIDE)
535 DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n",
536 bt_len, so, eo, bt_len, (wchar_t*)string + so));
537 DPRINT((" current string is '%.*" STRF "'\n",
538 slen, str_wide - 1));
540 #endif /* TRE_WCHAR */
542 #endif
544 if (so < 0)
546 result = 1; /* Back reference of nomatch doesn't match */
548 else if (len < 0)
550 #ifdef TRE_WCHAR
551 if (type == STR_WIDE)
552 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
553 (size_t)bt_len);
554 else
555 #endif /* TRE_WCHAR */
556 result = strncmp((const char*)string + so, str_byte - 1,
557 (size_t)bt_len);
559 else if (len - pos < bt_len)
560 result = 1;
561 #ifdef TRE_WCHAR
562 else if (type == STR_WIDE)
563 result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
564 (size_t)bt_len);
565 #endif /* TRE_WCHAR */
566 else
567 result = memcmp((const char*)string + so, str_byte - 1,
568 (size_t)bt_len);
570 if (result == 0)
572 /* Back reference matched. Check for infinite loop. */
573 if (bt_len == 0)
574 empty_br_match = 1;
575 if (empty_br_match && states_seen[trans_i->state_id])
577 DPRINT((" avoid loop\n"));
578 goto backtrack;
581 states_seen[trans_i->state_id] = empty_br_match;
583 /* Advance in input string and resync `prev_c', `next_c'
584 and pos. */
585 DPRINT((" back reference matched\n"));
586 str_byte += bt_len - 1;
587 #ifdef TRE_WCHAR
588 str_wide += bt_len - 1;
589 #endif /* TRE_WCHAR */
590 pos += bt_len - 1;
591 GET_NEXT_WCHAR();
592 DPRINT((" pos now %d\n", pos));
594 else
596 DPRINT((" back reference did not match\n"));
597 goto backtrack;
600 else
602 /* Check for end of string. */
603 if (len < 0)
605 if (next_c == L'\0')
606 goto backtrack;
608 else
610 if (pos >= len)
611 goto backtrack;
614 /* Read the next character. */
615 GET_NEXT_WCHAR();
618 next_state = NULL;
619 for (trans_i = state; trans_i->state; trans_i++)
621 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
622 trans_i->code_min, trans_i->code_max,
623 trans_i->code_min, trans_i->code_max,
624 trans_i->assertions, trans_i->state_id));
625 if (trans_i->code_min <= (tre_cint_t)prev_c
626 && trans_i->code_max >= (tre_cint_t)prev_c)
628 if (trans_i->assertions
629 && (CHECK_ASSERTIONS(trans_i->assertions)
630 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
632 DPRINT((" assertion failed\n"));
633 continue;
636 if (next_state == NULL)
638 /* First matching transition. */
639 DPRINT((" Next state is %d\n", trans_i->state_id));
640 next_state = trans_i->state;
641 next_tags = trans_i->tags;
643 else
645 /* Second matching transition. We may need to backtrack here
646 to take this transition instead of the first one, so we
647 push this transition in the backtracking stack so we can
648 jump back here if needed. */
649 DPRINT((" saving state %d for backtracking\n",
650 trans_i->state_id));
651 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide,
652 trans_i->state, trans_i->state_id, next_c,
653 tags, mbstate);
655 int *tmp;
656 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
657 tre_tag_set(stack->item.tags, *tmp, pos, touch);
658 touch++;
660 #if 0 /* XXX - it's important not to look at all transitions here to keep
661 the stack small! */
662 break;
663 #endif
668 if (next_state != NULL)
670 /* Matching transitions were found. Take the first one. */
671 state = next_state;
673 /* Update the tag values. */
674 if (next_tags)
676 while (*next_tags >= 0)
677 tre_tag_set(tags, *next_tags++, pos, touch);
678 touch++;
681 else
683 backtrack:
684 /* A matching transition was not found. Try to backtrack. */
685 if (stack->prev)
687 DPRINT((" backtracking\n"));
688 if (stack->item.state->assertions & ASSERT_BACKREF)
690 DPRINT((" states_seen[%d] = 0\n",
691 stack->item.state_id));
692 states_seen[stack->item.state_id] = 0;
695 BT_STACK_POP();
697 else if (match_eo < 0)
699 /* Try starting from a later position in the input string. */
700 /* Check for end of string. */
701 if (pos == pos_start)
703 if (len < 0)
705 if (next_c == L'\0')
707 DPRINT(("end of string.\n"));
708 break;
711 else
713 if (pos >= len)
715 DPRINT(("end of string.\n"));
716 break;
720 DPRINT(("restarting from next start position\n"));
721 next_c = next_c_start;
722 #ifdef TRE_MBSTATE
723 mbstate = mbstate_start;
724 #endif /* TRE_MBSTATE */
725 str_byte = str_byte_start;
726 #ifdef TRE_WCHAR
727 str_wide = str_wide_start;
728 #endif /* TRE_WCHAR */
729 goto retry;
731 else
733 DPRINT(("finished\n"));
734 break;
739 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
740 *match_end_ofs = match_eo;
742 error_exit:
743 tre_bt_mem_destroy(mem);
744 #ifndef TRE_USE_ALLOCA
745 if (buf)
746 xfree(buf);
747 #endif /* !TRE_USE_ALLOCA */
749 return ret;