2 tre-match-backtrack.c - TRE backtracking regex matching engine
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
35 #endif /* HAVE_CONFIG_H */
37 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
42 /* AIX requires this to be the first thing in the file. */
50 # ifndef alloca /* predefined by HP cc +Olibcalls */
56 #endif /* TRE_USE_ALLOCA */
63 #endif /* HAVE_WCHAR_H */
66 #endif /* HAVE_WCTYPE_H */
69 #endif /* !TRE_WCHAR */
72 #endif /* HAVE_MALLOC_H */
74 #include "tre-internal.h"
76 #include "tre-match-utils.h"
82 unsigned int pos_add_next
;
85 const wchar_t *str_wide
;
86 #endif /* TRE_WCHAR */
87 tre_tnfa_transition_t
*state
;
93 #endif /* TRE_MBSTATE */
94 } tre_backtrack_item_t
;
96 typedef struct tre_backtrack_struct
{
97 tre_backtrack_item_t item
;
98 struct tre_backtrack_struct
*prev
;
99 struct tre_backtrack_struct
*next
;
103 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
104 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
105 #else /* !TRE_WCHAR */
106 #define BT_STACK_WIDE_IN(_str_wide)
107 #define BT_STACK_WIDE_OUT
108 #endif /* !TRE_WCHAR */
111 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
112 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
113 #else /* !TRE_MBSTATE */
114 #define BT_STACK_MBSTATE_IN
115 #define BT_STACK_MBSTATE_OUT
116 #endif /* !TRE_MBSTATE */
119 #ifdef TRE_USE_ALLOCA
120 #define tre_bt_mem_new tre_mem_newa
121 #define tre_bt_mem_alloc tre_mem_alloca
122 #define tre_bt_mem_destroy(obj) do { } while (0)
123 #else /* !TRE_USE_ALLOCA */
124 #define tre_bt_mem_new tre_mem_new
125 #define tre_bt_mem_alloc tre_mem_alloc
126 #define tre_bt_mem_destroy tre_mem_destroy
127 #endif /* !TRE_USE_ALLOCA */
130 #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
136 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
139 tre_bt_mem_destroy(mem); \
145 xfree(states_seen); \
150 s->item.tags = tre_bt_mem_alloc(mem, \
151 num_tags * sizeof(*tags)); \
154 tre_bt_mem_destroy(mem); \
160 xfree(states_seen); \
167 stack = stack->next; \
168 stack->item.pos = (_pos); \
169 stack->item.pos_add_next = (_pos_add_next); \
170 stack->item.str_byte = (_str_byte); \
171 BT_STACK_WIDE_IN(_str_wide); \
172 stack->item.state = (_state); \
173 stack->item.state_id = (_state_id); \
174 stack->item.next_c = (_next_c); \
175 memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \
176 BT_STACK_MBSTATE_IN; \
178 while (/*CONSTCOND*/0)
180 #define BT_STACK_POP() \
183 assert(stack->prev); \
184 pos = stack->item.pos; \
185 pos_add_next = stack->item.pos_add_next; \
186 str_byte = stack->item.str_byte; \
188 state = stack->item.state; \
189 next_c = stack->item.next_c; \
190 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
191 BT_STACK_MBSTATE_OUT; \
192 stack = stack->prev; \
194 while (/*CONSTCOND*/0)
197 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
200 tre_tnfa_run_backtrack(const tre_tnfa_t
*tnfa
, const void *string
,
201 int len
, tre_str_type_t type
, tre_tag_t
*match_tags
,
202 int eflags
, int *match_end_ofs
)
204 /* State variables required by GET_NEXT_WCHAR. */
205 tre_char_t prev_c
= 0, next_c
= 0;
206 const char *str_byte
= string
;
208 unsigned int pos_add_next
= 1;
210 const wchar_t *str_wide
= string
;
213 #endif /* TRE_MBSTATE */
214 #endif /* TRE_WCHAR */
215 int reg_notbol
= eflags
& REG_NOTBOL
;
216 int reg_noteol
= eflags
& REG_NOTEOL
;
217 int reg_newline
= tnfa
->cflags
& REG_NEWLINE
;
220 /* These are used to remember the necessary values of the above
221 variables to return to the position where the current search
224 const char *str_byte_start
;
227 const wchar_t *str_wide_start
;
228 #endif /* TRE_WCHAR */
230 mbstate_t mbstate_start
;
231 #endif /* TRE_MBSTATE */
233 /* End offset of best match so far, or -1 if no match found yet. */
237 tre_tag_t
*tags
= NULL
;
238 /* Current TNFA state. */
239 tre_tnfa_transition_t
*state
;
240 int *states_seen
= NULL
;
242 /* Memory allocator to for allocating the backtracking stack. */
243 tre_mem_t mem
= tre_bt_mem_new();
245 /* The backtracking stack. */
246 tre_backtrack_t stack
;
248 tre_tnfa_transition_t
*trans_i
;
249 regmatch_t
*pmatch
= NULL
;
252 int num_tags
= tnfa
->num_tags
;
258 memset(&mbstate
, '\0', sizeof(mbstate
));
259 #endif /* TRE_MBSTATE */
260 #ifndef TRE_USE_ALLOCA
266 stack
= tre_bt_mem_alloc(mem
, sizeof(*stack
));
275 DPRINT(("tnfa_execute_backtrack, input type %d\n", type
));
276 DPRINT(("len = %d\n", len
));
279 int pbytes
, sbytes
, total_bytes
;
281 /* Compute the length of the block we need. */
282 tbytes
= sizeof(*tags
) * num_tags
;
283 pbytes
= sizeof(*pmatch
) * tnfa
->num_submatches
;
284 sbytes
= sizeof(*states_seen
) * tnfa
->num_states
;
286 (sizeof(long) - 1) * 2 /* for alignment paddings */
287 + tbytes
+ pbytes
+ sbytes
;
289 DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes
));
290 /* Allocate the memory. */
291 #ifdef TRE_USE_ALLOCA
292 buf
= alloca(total_bytes
);
293 #else /* !TRE_USE_ALLOCA */
294 buf
= xmalloc((unsigned)total_bytes
);
295 #endif /* !TRE_USE_ALLOCA */
299 /* Get the various pointers within tmp_buf (properly aligned). */
301 tmp_buf
= buf
+ tbytes
;
302 tmp_buf
+= ALIGN(tmp_buf
, long);
303 pmatch
= (void *)tmp_buf
;
305 tmp_buf
+= ALIGN(tmp_buf
, long);
306 states_seen
= (void *)tmp_buf
;
311 memset(tags
, 0, num_tags
* sizeof(*tags
));
313 memset(match_tags
, 0, num_tags
* sizeof(*match_tags
));
314 for (i
= 0; i
< tnfa
->num_states
; i
++)
322 next_c_start
= next_c
;
323 str_byte_start
= str_byte
;
325 str_wide_start
= str_wide
;
326 #endif /* TRE_WCHAR */
328 mbstate_start
= mbstate
;
329 #endif /* TRE_MBSTATE */
331 /* Handle initial states. */
333 for (trans_i
= tnfa
->initial
; trans_i
->state
; trans_i
++)
335 DPRINT(("> init %p, prev_c %lc\n", trans_i
->state
, (tre_cint_t
)prev_c
));
336 if (trans_i
->assertions
&& CHECK_ASSERTIONS(trans_i
->assertions
))
338 DPRINT(("assert failed\n"));
343 /* Start from this state. */
344 state
= trans_i
->state
;
345 next_tags
= trans_i
->tags
;
349 /* Backtrack to this state. */
350 DPRINT(("saving state %d for backtracking\n", trans_i
->state_id
));
351 BT_STACK_PUSH(pos
, pos_add_next
, str_byte
, str_wide
, trans_i
->state
,
352 trans_i
->state_id
, next_c
, tags
, mbstate
);
354 int *tmp
= trans_i
->tags
;
358 tre_tag_set(stack
->item
.tags
, *tmp
++, pos
, touch
);
367 for (; *next_tags
>= 0; next_tags
++)
368 tre_tag_set(tags
, *next_tags
, pos
, touch
);
373 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos
, str_byte
));
374 DPRINT(("pos:chr/code | state and tags\n"));
375 DPRINT(("-------------+------------------------------------------------\n"));
380 while (/*CONSTCOND*/1)
382 tre_tnfa_transition_t
*next_state
;
385 DPRINT(("start loop\n"));
387 if (match_eo
>= 0 && tnfa
->num_minimals
)
391 DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
393 tre_print_tags(match_tags
, tnfa
->num_tags
);
395 #endif /* TRE_DEBUG */
396 for (i
= 0; tnfa
->minimal_tags
[i
] >= 0; i
+= 2)
398 int end
= tnfa
->minimal_tags
[i
];
399 int start
= tnfa
->minimal_tags
[i
+ 1];
400 DPRINT((" Minimal start %d, end %d\n", start
, end
));
401 if (tre_minimal_tag_order(start
, end
, match_tags
, tags
) > 0)
410 DPRINT((" Keeping tags="));
411 tre_print_tags(tags
, tnfa
->num_tags
);
413 #endif /* TRE_DEBUG */
418 DPRINT((" Throwing out tags="));
419 tre_print_tags(tags
, tnfa
->num_tags
);
421 #endif /* TRE_DEBUG */
426 if (state
== tnfa
->final
)
428 DPRINT((" match found, match_eo=%d pos=%d\n", match_eo
, pos
));
430 if (match_eo
>= 0 && tnfa
->num_minimals
)
434 DPRINT(("Checking minimal conditions: match_eo=%d "
435 "match_tags=", match_eo
));
436 tre_print_tags(match_tags
, tnfa
->num_tags
);
438 #endif /* TRE_DEBUG */
439 for (i
= 0; tnfa
->minimal_tags
[i
] >= 0; i
+= 2)
441 int end
= tnfa
->minimal_tags
[i
];
442 int start
= tnfa
->minimal_tags
[i
+ 1];
443 DPRINT((" Minimal start %d, end %d\n", start
, end
));
444 if ((compare
= tre_minimal_tag_order(start
, end
,
445 match_tags
, tags
)) != 0)
451 DPRINT((" Throwing out new match, tags="));
452 tre_print_tags(tags
, tnfa
->num_tags
);
454 #endif /* TRE_DEBUG */
457 else if (compare
< 0)
460 DPRINT((" Throwing out old match, tags="));
461 tre_print_tags(match_tags
, tnfa
->num_tags
);
463 #endif /* TRE_DEBUG */
471 && tre_tag_order(tnfa
->num_tags
, tnfa
->tag_directions
,
474 /* This match wins the previous match. */
476 DPRINT((" win previous tags="));
477 tre_print_tags(tags
, tnfa
->num_tags
);
479 #endif /* TRE_DEBUG */
482 memcpy(match_tags
, tags
, num_tags
* sizeof(*tags
));
484 /* Our TNFAs never have transitions leaving from the final state,
485 so we jump right to backtracking. */
490 DPRINT(("%3d:%2lc/%05d | %p ", pos
, (tre_cint_t
)next_c
, (int)next_c
,
492 tre_print_tags(tags
, tnfa
->num_tags
);
494 #endif /* TRE_DEBUG */
496 /* Go to the next character in the input string. */
499 if (trans_i
->state
&& trans_i
->assertions
& ASSERT_BACKREF
)
501 /* This is a back reference state. All transitions leaving from
502 this state have the same back reference "assertion". Instead
503 of reading the next character, we match the back reference. */
504 int so
, eo
, bt
= trans_i
->u
.backref
;
508 DPRINT((" should match back reference %d\n", bt
));
509 /* Get the substring we need to match against. Remember to
510 turn off REG_NOSUB temporarily. */
511 ret
= tre_fill_pmatch(bt
+ 1, pmatch
, tnfa
->cflags
& ~REG_NOSUB
,
513 if (ret
!= REG_OK
) goto error_exit
;
514 so
= pmatch
[bt
].rm_so
;
515 eo
= pmatch
[bt
].rm_eo
;
524 slen
= MIN(bt_len
, len
- pos
);
526 if (type
== STR_BYTE
)
528 DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n",
529 bt_len
, so
, eo
, bt_len
, (char*)string
+ so
));
530 DPRINT((" current string is '%.*s'\n", slen
, str_byte
- 1));
533 else if (type
== STR_WIDE
)
535 DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF
"'\n",
536 bt_len
, so
, eo
, bt_len
, (wchar_t*)string
+ so
));
537 DPRINT((" current string is '%.*" STRF
"'\n",
538 slen
, str_wide
- 1));
540 #endif /* TRE_WCHAR */
546 result
= 1; /* Back reference of nomatch doesn't match */
551 if (type
== STR_WIDE
)
552 result
= wcsncmp((const wchar_t*)string
+ so
, str_wide
- 1,
555 #endif /* TRE_WCHAR */
556 result
= strncmp((const char*)string
+ so
, str_byte
- 1,
559 else if (len
- pos
< bt_len
)
562 else if (type
== STR_WIDE
)
563 result
= wmemcmp((const wchar_t*)string
+ so
, str_wide
- 1,
565 #endif /* TRE_WCHAR */
567 result
= memcmp((const char*)string
+ so
, str_byte
- 1,
572 /* Back reference matched. Check for infinite loop. */
575 if (empty_br_match
&& states_seen
[trans_i
->state_id
])
577 DPRINT((" avoid loop\n"));
581 states_seen
[trans_i
->state_id
] = empty_br_match
;
583 /* Advance in input string and resync `prev_c', `next_c'
585 DPRINT((" back reference matched\n"));
586 str_byte
+= bt_len
- 1;
588 str_wide
+= bt_len
- 1;
589 #endif /* TRE_WCHAR */
592 DPRINT((" pos now %d\n", pos
));
596 DPRINT((" back reference did not match\n"));
602 /* Check for end of string. */
614 /* Read the next character. */
619 for (trans_i
= state
; trans_i
->state
; trans_i
++)
621 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
622 trans_i
->code_min
, trans_i
->code_max
,
623 trans_i
->code_min
, trans_i
->code_max
,
624 trans_i
->assertions
, trans_i
->state_id
));
625 if (trans_i
->code_min
<= (tre_cint_t
)prev_c
626 && trans_i
->code_max
>= (tre_cint_t
)prev_c
)
628 if (trans_i
->assertions
629 && (CHECK_ASSERTIONS(trans_i
->assertions
)
630 || CHECK_CHAR_CLASSES(trans_i
, tnfa
, eflags
)))
632 DPRINT((" assertion failed\n"));
636 if (next_state
== NULL
)
638 /* First matching transition. */
639 DPRINT((" Next state is %d\n", trans_i
->state_id
));
640 next_state
= trans_i
->state
;
641 next_tags
= trans_i
->tags
;
645 /* Second matching transition. We may need to backtrack here
646 to take this transition instead of the first one, so we
647 push this transition in the backtracking stack so we can
648 jump back here if needed. */
649 DPRINT((" saving state %d for backtracking\n",
651 BT_STACK_PUSH(pos
, pos_add_next
, str_byte
, str_wide
,
652 trans_i
->state
, trans_i
->state_id
, next_c
,
656 for (tmp
= trans_i
->tags
; tmp
&& *tmp
>= 0; tmp
++)
657 tre_tag_set(stack
->item
.tags
, *tmp
, pos
, touch
);
660 #if 0 /* XXX - it's important not to look at all transitions here to keep
668 if (next_state
!= NULL
)
670 /* Matching transitions were found. Take the first one. */
673 /* Update the tag values. */
676 while (*next_tags
>= 0)
677 tre_tag_set(tags
, *next_tags
++, pos
, touch
);
684 /* A matching transition was not found. Try to backtrack. */
687 DPRINT((" backtracking\n"));
688 if (stack
->item
.state
->assertions
& ASSERT_BACKREF
)
690 DPRINT((" states_seen[%d] = 0\n",
691 stack
->item
.state_id
));
692 states_seen
[stack
->item
.state_id
] = 0;
697 else if (match_eo
< 0)
699 /* Try starting from a later position in the input string. */
700 /* Check for end of string. */
701 if (pos
== pos_start
)
707 DPRINT(("end of string.\n"));
715 DPRINT(("end of string.\n"));
720 DPRINT(("restarting from next start position\n"));
721 next_c
= next_c_start
;
723 mbstate
= mbstate_start
;
724 #endif /* TRE_MBSTATE */
725 str_byte
= str_byte_start
;
727 str_wide
= str_wide_start
;
728 #endif /* TRE_WCHAR */
733 DPRINT(("finished\n"));
739 ret
= match_eo
>= 0 ? REG_OK
: REG_NOMATCH
;
740 *match_end_ofs
= match_eo
;
743 tre_bt_mem_destroy(mem
);
744 #ifndef TRE_USE_ALLOCA
747 #endif /* !TRE_USE_ALLOCA */