mfiutil(8): Use MAN instead of MAN8.
[dragonfly.git] / contrib / tre / lib / tre-compile.c
blob6c4187e0423d402d92d78dbcc2de9d82494c44a6
1 /*
2 tre-compile.c - TRE regex compiler
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
7 */
9 /*
10 TODO:
11 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
12 function calls.
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <stdio.h>
20 #include <assert.h>
21 #include <string.h>
22 #include <limits.h>
24 #include "tre-internal.h"
25 #include "tre-mem.h"
26 #include "tre-stack.h"
27 #include "tre-ast.h"
28 #include "tre-parse.h"
29 #include "tre-compile.h"
30 #include "tre.h"
31 #include "xmalloc.h"
34 The bit_ffs() macro in bitstring.h is flawed. Replace it with a working one.
36 #undef bit_ffs
37 #define bit_ffs(name, nbits, value) { \
38 register bitstr_t *_name = name; \
39 register int _byte, _nbits = nbits; \
40 register int _stopbyte = _bit_byte(_nbits), _value = -1; \
41 for (_byte = 0; _byte <= _stopbyte; ++_byte) \
42 if (_name[_byte]) { \
43 _value = _byte << 3; \
44 for (_stopbyte = _name[_byte]; !(_stopbyte&0x1); \
45 ++_value, _stopbyte >>= 1); \
46 break; \
47 } \
48 *(value) = _value; \
52 Algorithms to setup tags so that submatch addressing can be done.
56 #ifdef TRE_DEBUG
57 static const char *tag_dir_str[] = {
58 "minimize",
59 "maximize",
60 "left-maximize"
63 static const char _indent[] = " ";
65 static void
66 print_indent(int indent)
68 while (indent-- > 0)
69 DPRINT((_indent));
72 static void print_last_matched_pre(tre_last_matched_pre_t *lm, int indent,
73 int num_tags);
74 static void
75 print_last_match_branch_pre(tre_last_matched_branch_pre_t *branch, int indent,
76 int num_tags)
78 tre_last_matched_pre_t *u = branch->last_matched;
79 int n_last_matched = 0;
81 while (u)
83 n_last_matched++;
84 u = u->next;
87 print_indent(indent);
88 DPRINT(("BRANCH: tot_branches=%d tot_last_matched=%d tot_tags=%d\n",
89 branch->tot_branches, branch->tot_last_matched, branch->tot_tags));
90 print_indent(indent);
91 DPRINT(("..n_last_matched=%d last_matched=%d\n", branch->n_last_matched,
92 n_last_matched));
93 if (branch->n_last_matched != n_last_matched)
94 DPRINT(("*** mismatch between n_last_matched and unions ***\n"));
95 if (branch->cmp_tag > 0)
97 int i;
98 const char *sep = " tags=";
99 print_indent(indent);
100 DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags));
101 for (i = 0; i < num_tags; i++)
102 if (bit_test(branch->tags, i))
104 DPRINT(("%s%d", sep, i));
105 sep = ",";
107 DPRINT(("\n"));
110 u = branch->last_matched;
111 indent++;
112 while (u)
114 print_last_matched_pre(u, indent, num_tags);
115 u = u->next;
119 static void
120 print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, int num_tags)
122 tre_last_matched_branch_pre_t *b = lm->branches;
123 int n_branches = 0;
125 while (b)
127 n_branches++;
128 b = b->next;
131 print_indent(indent);
132 DPRINT(("LAST_MATCHED: tot_branches=%d tot_last_matched=%d tot_tags=%d\n",
133 lm->tot_branches, lm->tot_last_matched, lm->tot_tags));
134 print_indent(indent);
135 DPRINT(("..start_tag=%d n_branches=%d branches=%d\n", lm->start_tag,
136 lm->n_branches, n_branches));
137 if (lm->n_branches != n_branches)
138 DPRINT(("*** mismatch between n and branches ***\n"));
140 b = lm->branches;
141 indent++;
142 while (b)
144 print_last_match_branch_pre(b, indent, num_tags);
145 b = b->next;
149 static void print_last_matched(tre_last_matched_t *lm, int indent);
150 static void
151 print_last_match_branch(tre_last_matched_branch_t *branch, int indent)
153 tre_last_matched_t *u;
154 int i;
156 print_indent(indent);
157 DPRINT(("BRANCH: n_last_matched=%d\n", branch->n_last_matched));
158 if (branch->cmp_tag > 0)
160 print_indent(indent);
161 DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags));
162 if (branch->n_tags > 0)
164 const char *sep = " tags=";
165 for (i = 0; i < branch->n_tags; i++)
167 DPRINT(("%s%d", sep, branch->tags[i]));
168 sep = ",";
171 DPRINT(("\n"));
174 u = branch->last_matched;
175 indent++;
176 for (i = branch->n_last_matched; i > 0; i--, u++)
177 print_last_matched(u, indent);
180 static void
181 print_last_matched(tre_last_matched_t *lm, int indent)
183 int i;
184 tre_last_matched_branch_t *b;
186 print_indent(indent);
187 DPRINT(("LAST_MATCHED: n_branches=%d start_tag=%d\n", lm->n_branches,
188 lm->start_tag));
190 b = lm->branches;
191 indent++;
192 for (i = lm->n_branches; i > 0; i--, b++)
193 print_last_match_branch(b, indent);
195 #endif /* TRE_DEBUG */
198 /* Merge the tre_last_matched_branch_pre_t of src into dst, creating a new
199 one if needed. If tag_id > 0, add that tag as well (a negative tag_id will
200 create an unset tre_last_matched_branch_pre_t. */
201 static reg_errcode_t
202 tre_merge_branches(tre_mem_t mem, tre_ast_node_t *dst, tre_ast_node_t *src,
203 int tag_id, int num_tags)
205 tre_last_matched_branch_pre_t *db = dst->last_matched_branch;
206 tre_last_matched_branch_pre_t *sb = (src ? src->last_matched_branch : NULL);
208 if (db)
210 if (sb)
212 bitstr_t *l = db->tags;
213 bitstr_t *r = sb->tags;
214 int i = bitstr_size(num_tags);
216 while(i-- > 0)
217 *l++ |= *r++;
218 /* db and sb are the info from two parallel sub-trees, so the tags
219 must be mutually exclusive, and we can just add their numbers */
220 db->n_tags += sb->n_tags;
221 db->tot_tags += sb->tot_tags;
222 if (db->last_matched)
224 if (sb->last_matched)
226 tre_last_matched_pre_t *u = db->last_matched;
228 while(u->next)
229 u = u->next;
230 u->next = sb->last_matched;
231 db->n_last_matched += sb->n_last_matched;
232 db->tot_branches += sb->tot_branches;
233 db->tot_last_matched += sb->tot_last_matched;
236 else if (sb->last_matched)
238 db->last_matched = sb->last_matched;
239 db->n_last_matched = sb->n_last_matched;
240 db->tot_branches = sb->tot_branches;
241 db->tot_last_matched = sb->tot_last_matched;
245 else
246 db = sb;
248 if (tag_id != 0)
250 if (!db)
252 db = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t)
253 + bitstr_size(num_tags));
254 if (db == NULL)
255 return REG_ESPACE;
256 db->tot_branches = 1;
258 if (tag_id > 0)
260 /* tag_id is a new tag, and shouldn't exist in db's tags,
261 so we can always increment n_tags */
262 bit_set(db->tags, tag_id);
263 db->n_tags++;
264 db->tot_tags++;
267 dst->last_matched_branch = db;
268 return REG_OK;
272 /* Inserts a catenation node to the root of the tree given in `node'.
273 As the left child a new tag with number `tag_id' to `node' is added,
274 and the right child is the old root. */
275 static reg_errcode_t
276 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
278 tre_catenation_t *c;
280 DPRINT(("add_tag_left: tag %d\n", tag_id));
282 c = tre_mem_alloc(mem, sizeof(*c));
283 if (c == NULL)
284 return REG_ESPACE;
285 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
286 if (c->left == NULL)
287 return REG_ESPACE;
288 c->right = tre_mem_calloc(mem, sizeof(tre_ast_node_t));
289 if (c->right == NULL)
290 return REG_ESPACE;
292 c->right->obj = node->obj;
293 c->right->type = node->type;
294 c->right->last_matched_branch = node->last_matched_branch;
295 c->right->nullable = -1;
296 c->right->submatch_id = -1;
297 node->obj = c;
298 node->type = CATENATION;
299 node->original = c->right;
300 return REG_OK;
303 /* Inserts a catenation node to the root of the tree given in `node'.
304 As the right child a new tag with number `tag_id' to `node' is added,
305 and the left child is the old root. */
306 static reg_errcode_t
307 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
309 tre_catenation_t *c;
311 DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
313 c = tre_mem_alloc(mem, sizeof(*c));
314 if (c == NULL)
315 return REG_ESPACE;
316 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
317 if (c->right == NULL)
318 return REG_ESPACE;
319 c->left = tre_mem_calloc(mem, sizeof(tre_ast_node_t));
320 if (c->left == NULL)
321 return REG_ESPACE;
323 c->left->obj = node->obj;
324 c->left->type = node->type;
325 c->left->last_matched_branch = node->last_matched_branch;
326 c->left->nullable = -1;
327 c->left->submatch_id = -1;
328 node->obj = c;
329 node->type = CATENATION;
330 node->original = c->left;
331 return REG_OK;
334 typedef enum {
335 ADDTAGS_RECURSE,
336 ADDTAGS_RECURSE_NOT_TOP_UNION,
337 ADDTAGS_AFTER_ITERATION,
338 ADDTAGS_AFTER_UNION_LEFT,
339 ADDTAGS_AFTER_UNION_RIGHT,
340 ADDTAGS_AFTER_CAT_LEFT,
341 ADDTAGS_AFTER_CAT_RIGHT,
342 ADDTAGS_SET_SUBMATCH_END,
343 ADDTAGS_UNION_RECURSE,
344 ADDTAGS_UNION_RIGHT_RECURSE,
345 ADDTAGS_AFTER_UNION_TOP,
346 } tre_addtags_symbol_t;
348 enum {
349 COPY_LAST_MATCHED_BRANCH,
350 COPY_LAST_MATCHED_BRANCH_NEXT,
351 COPY_LAST_MATCHED,
352 COPY_LAST_MATCHED_NEXT,
356 #define REGSET_UNSET ((unsigned)-1)
358 /* Go through `regset' and set submatch data for submatches that are
359 using this tag. */
360 static void
361 tre_purge_regset(unsigned *regset, tre_tnfa_t *tnfa, int tag)
363 int i;
365 for (i = 0; regset[i] != REGSET_UNSET; i++)
367 int id = regset[i] / 2;
368 int start = !(regset[i] % 2);
369 if (id >= SUBMATCH_ID_INVISIBLE_START)
370 continue;
371 DPRINT((" Using tag %d for %s offset of "
372 "submatch %d\n", tag,
373 start ? "start" : "end", id));
374 if (start)
375 tnfa->submatch_data[id].so_tag = tag;
376 else
377 tnfa->submatch_data[id].eo_tag = tag;
379 regset[0] = -1;
383 #define REGSET_HAS_STARTS 0x1
384 #define REGSET_HAS_ENDS 0x2
387 /* Adds tags to appropriate locations in the parse tree in `tree', so that
388 subexpressions marked for submatch addressing can be traced. */
389 static reg_errcode_t
390 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
391 tre_tnfa_t *tnfa)
393 reg_errcode_t status = REG_OK;
394 tre_addtags_symbol_t symbol;
395 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
396 int bottom = tre_stack_num_objects(stack);
397 /* True for first pass (counting number of needed tags) */
398 int first_pass = (mem == NULL || tnfa == NULL);
399 unsigned *regset, *orig_regset;
400 int regset_contains = 0;
401 int num_tags = 0; /* Total number of tags. */
402 int num_minimals = 0; /* Number of special minimal tags. */
403 int tag = 0; /* The tag that is to be added next. */
404 int next_tag = 1; /* Next tag to use after this one. */
405 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
406 int *reorder_tags = NULL; /* Tag reorder array: a pair for each reorder,
407 * the first is the tag to reorder, the second
408 * is the tag after which the first is reordered */
409 int *rtp; /* Pointer used to fill in reorder_tags and
410 * tag_order */
411 int *to_reorder; /* Transform array converting sequential order to
412 * that specified by reorder_tags */
413 int id;
415 tre_tag_direction_t direction = TRE_TAG_LEFT_MAXIMIZE;
416 if (!first_pass)
418 DPRINT(("Initializing direction to %s\n", tag_dir_str[direction]));
419 tnfa->end_tag = 0;
420 tnfa->minimal_tags[0] = -1;
423 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches
424 + tnfa->num_submatches_invisible + 1) * 2));
425 if (regset == NULL)
427 status = REG_ESPACE;
428 goto error_regset;
430 regset[0] = REGSET_UNSET;
431 orig_regset = regset;
433 if (!first_pass)
435 /* Allocate all memory for reorder_tags, tag_order, to_seq_order and
436 * to_reorder in one batch (assuming all are the same type) */
437 rtp = reorder_tags = xmalloc(sizeof(*reorder_tags) *
438 ((2 * tnfa->num_reorder_tags + 1) +
439 tnfa->num_tags));
440 if (reorder_tags == NULL)
442 status = REG_ESPACE;
443 goto error_reorder_tags;
445 to_reorder = reorder_tags + (2 * tnfa->num_reorder_tags + 1);
448 STACK_PUSH(stack, voidptr, node);
449 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
451 while (tre_stack_num_objects(stack) > bottom)
453 if (status != REG_OK)
454 break;
456 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
457 switch (symbol)
459 int top_union;
461 case ADDTAGS_SET_SUBMATCH_END:
463 int i;
465 id = tre_stack_pop_int(stack);
466 node = tre_stack_pop_voidptr(stack);
467 /* Add end of this submatch to regset. */
468 for (i = 0; regset[i] != REGSET_UNSET; i++);
469 regset[i] = id * 2 + 1;
470 regset[i + 1] = -1;
471 regset_contains |= REGSET_HAS_ENDS;
473 /* Always put a tag after a minimal iterator. */
474 if (minimal_tag >= 0)
476 if (first_pass)
478 node->num_tags++;
479 DPRINT((" ADDTAGS_SET_SUBMATCH_END: node->num_tags = %d\n",
480 node->num_tags));
482 else
484 int i;
485 status = tre_merge_branches(mem, node, NULL, tag,
486 tnfa->num_tags);
487 if (status != REG_OK)
488 break;
489 status = tre_add_tag_right(mem, node, tag);
490 if (status != REG_OK)
491 break;
492 tnfa->tag_directions[tag] = TRE_TAG_MINIMIZE;
493 DPRINT(("Setting t%d direction to %s\n", tag,
494 tag_dir_str[tnfa->tag_directions[tag]]));
495 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
496 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
497 tnfa->minimal_tags[i] = tag;
498 tnfa->minimal_tags[i + 1] = minimal_tag;
499 tnfa->minimal_tags[i + 2] = -1;
501 DPRINT((" Minimal end: t%d reordered to "
502 "after t%d\n", tag, minimal_tag));
503 /* Append to tag_order, move "tag" after
504 * "minimal_tag" */
505 *rtp++ = tag;
506 *rtp++ = minimal_tag;
508 num_minimals++;
509 tre_purge_regset(regset, tnfa, tag);
512 minimal_tag = -1;
513 DPRINT((" ADDTAGS_SET_SUBMATCH_END num_tags++ tag=%d\n", tag));
514 regset[0] = REGSET_UNSET;
515 regset_contains = 0;
516 tag = next_tag;
517 num_tags++;
518 next_tag++;
520 break;
523 case ADDTAGS_RECURSE_NOT_TOP_UNION:
524 /* Like ADDTAGS_RECURSE, except that top_union is set to zero,
525 * indicating that if a union is being processed, it is not the
526 * top-most of a series */
527 top_union = 0;
528 goto do_addtags_recurse;
530 case ADDTAGS_RECURSE:
531 /* Setting top_union to 1 means that if a union is begin processed,
532 * it is the top-most of a series, and should recurse through the
533 * series to set the left_tag and right_tag values */
534 top_union = 1;
536 do_addtags_recurse:
537 node = tre_stack_pop_voidptr(stack);
539 id = node->submatch_id;
540 if (id >= 0)
542 int i;
545 /* Add start of this submatch to regset. */
546 for (i = 0; regset[i] != REGSET_UNSET; i++);
547 regset[i] = id * 2;
548 regset[i + 1] = -1;
549 regset_contains |= REGSET_HAS_STARTS;
551 /* Add end of this submatch to regset after processing this
552 node. */
553 STACK_PUSH(stack, voidptr, node);
554 STACK_PUSHX(stack, int, id);
555 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
558 switch (node->type)
560 case LITERAL:
562 tre_literal_t *lit = node->obj;
564 if (!IS_SPECIAL(lit) || IS_BACKREF(lit) || IS_EMPTY(lit) || IS_ASSERTION(lit))
566 DPRINT(("Literal %d-%d\n",
567 (int)lit->code_min, (int)lit->code_max));
568 if (regset_contains)
570 /* Regset is not empty, so add a tag before the
571 literal or backref. */
572 if (first_pass)
574 DPRINT((" ADDTAGS_RECURSE:LITERAL node->num_tags = 1\n"));
575 node->num_tags = 1;
577 else
579 status = tre_merge_branches(mem, node, NULL, tag,
580 tnfa->num_tags);
581 if (status != REG_OK)
582 break;
583 status = tre_add_tag_left(mem, node, tag);
584 if (status != REG_OK)
585 break;
586 if (regset_contains == REGSET_HAS_STARTS)
587 tnfa->tag_directions[tag] = TRE_TAG_LEFT_MAXIMIZE;
588 else
589 tnfa->tag_directions[tag] = direction;
590 DPRINT(("Setting t%d direction to %s\n", tag,
591 tag_dir_str[tnfa->tag_directions[tag]]));
592 tre_purge_regset(regset, tnfa, tag);
594 if (IS_BACKREF(lit))
596 int b = lit->code_max;
597 int t = tnfa->submatch_data[b].so_tag;
598 /* Fail if the referenced submatch hasn't been
599 * completed yet */
600 if (tnfa->submatch_data[b].eo_tag < 0)
602 status = REG_ESUBREG;
603 break;
605 if (t < tag)
607 DPRINT((" Backref %d start: "
608 "t%d reordered to before t%d\n",
609 b, tag, t));
610 if(t > 0)
611 t--;
612 /* Append to tag_order, move "tag" after
613 * "t" */
614 *rtp++ = tag;
615 *rtp++ = t;
617 #if TRE_DEBUG
618 else
619 DPRINT((" Backref %d start: "
620 "(t%d already before t%d)\n",
621 b, tag, t));
622 #endif /* TRE_DEBUG */
626 DPRINT((" ADDTAGS_RECURSE:LITERAL num_tags++ tag=%d\n",
627 tag));
628 regset[0] = REGSET_UNSET;
629 regset_contains = 0;
630 tag = next_tag;
631 num_tags++;
632 next_tag++;
635 else
637 assert(!IS_TAG(lit));
639 break;
641 case CATENATION:
643 tre_catenation_t *cat = node->obj;
644 tre_ast_node_t *left = cat->left;
645 tre_ast_node_t *right = cat->right;
646 int reserved_tag = -1;
647 DPRINT(("Catenation, next_tag = %d\n", next_tag));
650 /* After processing right child. */
651 STACK_PUSHX(stack, voidptr, node);
652 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
654 /* Process right child. */
655 STACK_PUSHX(stack, voidptr, right);
656 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
658 /* After processing left child. */
659 STACK_PUSHX(stack, int, next_tag + left->num_tags);
660 DPRINT((" Pushing %d for after left\n",
661 next_tag + left->num_tags));
662 if (left->num_tags > 0 && right->num_tags > 0)
664 /* Reserve the next tag to the right child. */
665 DPRINT((" ADDTAGS_RECURSE:CATENATION num_tags++ "
666 "Reserving next_tag %d to right child\n",
667 next_tag));
668 reserved_tag = next_tag;
669 next_tag++;
671 STACK_PUSHX(stack, int, reserved_tag);
672 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
674 /* Process left child. */
675 STACK_PUSHX(stack, voidptr, left);
676 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
679 break;
680 case ITERATION:
682 tre_iteration_t *iter = node->obj;
683 DPRINT(("Iteration\n"));
685 if (first_pass)
686 STACK_PUSHX(stack, int, regset_contains != 0);
687 STACK_PUSHX(stack, int, tag);
688 STACK_PUSHX(stack, voidptr, node);
689 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
691 STACK_PUSHX(stack, voidptr, iter->arg);
692 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
694 /* Regset is not empty, so add a tag here (this always happens
695 because iterators always get submatch id, even if in the
696 invisible range) */
697 if (regset_contains)
699 if (!first_pass)
701 status = tre_merge_branches(mem, node, NULL, tag,
702 tnfa->num_tags);
703 if (status != REG_OK)
704 break;
705 status = tre_add_tag_left(mem, node, tag);
706 if (status != REG_OK)
707 break;
708 if (regset_contains == REGSET_HAS_STARTS && tag != 0)
709 tnfa->tag_directions[tag] = iter->minimal ?
710 TRE_TAG_MINIMIZE :
711 TRE_TAG_LEFT_MAXIMIZE;
712 else
713 tnfa->tag_directions[tag] = direction;
714 DPRINT(("Setting t%d direction to %s\n", tag,
715 tag_dir_str[tnfa->tag_directions[tag]]));
716 tre_purge_regset(regset, tnfa, tag);
719 DPRINT((" ADDTAGS_RECURSE:ITERATION num_tags++ tag=%d\n",
720 tag));
721 regset[0] = REGSET_UNSET;
722 regset_contains = 0;
723 tag = next_tag;
724 num_tags++;
725 next_tag++;
727 direction = TRE_TAG_LEFT_MAXIMIZE;
728 DPRINT((" Setting direction to %s\n", tag_dir_str[direction]));
730 break;
731 case UNION:
733 tre_union_t *uni;
734 tre_ast_node_t *left;
735 tre_ast_node_t *right;
736 int front_tag = -1;
738 DPRINT(("Union\n"));
740 if (regset_contains)
742 DPRINT((" UNION num_tags++ tag=%d\n", tag));
743 front_tag = tag;
744 tag = next_tag;
745 num_tags++;
746 next_tag++;
749 /* For the top union, walk the tree of consecutive unions,
750 * setting the left_tag and right_tag values in increasing
751 * order (left to right priority) */
752 if (top_union &&
753 (node->num_submatches -
754 (node->submatch_id >= 0 &&
755 node->submatch_id < SUBMATCH_ID_INVISIBLE_START)) > 0)
757 tre_ast_node_t *n;
758 int last = tre_stack_num_objects(stack);
760 STACK_PUSH(stack, voidptr, node);
761 STACK_PUSH(stack, int, ADDTAGS_UNION_RECURSE);
763 while (tre_stack_num_objects(stack) > last)
765 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
766 switch (symbol)
768 case ADDTAGS_UNION_RECURSE:
769 n = tre_stack_pop_voidptr(stack);
770 uni = n->obj;
771 left = uni->left;
773 /* Since the top union has num_submatches > 0,
774 * we set all the consecutive union's
775 * make_branches to 1 to force the generation
776 * of end tags for each union branch. */
777 n->make_branches = 1;
779 STACK_PUSH(stack, voidptr, n);
780 STACK_PUSH(stack, int,
781 ADDTAGS_UNION_RIGHT_RECURSE);
783 if (left->type == UNION)
785 STACK_PUSH(stack, voidptr, left);
786 STACK_PUSH(stack, int,
787 ADDTAGS_UNION_RECURSE);
789 else
791 DPRINT((" ADDTAGS_UNION_RECURSE "
792 "num_tags++ tag=%d\n", tag));
793 uni->left_tag = tag;
794 tag = next_tag;
795 num_tags++;
796 next_tag++;
798 break;
800 case ADDTAGS_UNION_RIGHT_RECURSE:
801 n = tre_stack_pop_voidptr(stack);
802 uni = n->obj;
803 right = uni->right;
805 if (right->type == UNION)
807 STACK_PUSH(stack, voidptr, right);
808 STACK_PUSH(stack, int,
809 ADDTAGS_UNION_RECURSE);
811 else
813 DPRINT((" ADDTAGS_UNION_RIGHT_RECURSE "
814 "num_tags++ tag=%d\n", tag));
815 uni->right_tag = tag;
816 tag = next_tag;
817 num_tags++;
818 next_tag++;
821 break;
823 default:
824 assert(0);
825 break;
827 } /* end switch(symbol) */
828 } /* end while(tre_stack_num_objects(stack) > last */
829 if (!first_pass)
831 STACK_PUSHX(stack, int, front_tag);
832 STACK_PUSHX(stack, voidptr, node);
833 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_TOP);
835 } /* end if (top_union && ...) */
837 uni = node->obj;
838 left = uni->left;
839 right = uni->right;
841 /* After processing right child. */
842 STACK_PUSHX(stack, voidptr, regset);
843 STACK_PUSHX(stack, int, regset_contains != 0);
844 STACK_PUSHX(stack, voidptr, node);
845 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
847 /* Process right child. */
848 STACK_PUSHX(stack, voidptr, right);
849 STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION);
851 /* After processing left child. */
852 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
854 /* Process left child. */
855 STACK_PUSHX(stack, voidptr, left);
856 STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION);
858 /* Regset is not empty, so add a tag here. */
859 if (regset_contains)
861 if (!first_pass)
863 status = tre_merge_branches(mem, node, NULL, front_tag,
864 tnfa->num_tags);
865 if (status != REG_OK)
866 break;
867 status = tre_add_tag_left(mem, node, front_tag);
868 if (status != REG_OK)
869 break;
870 if (regset_contains == REGSET_HAS_STARTS)
871 tnfa->tag_directions[front_tag] = TRE_TAG_LEFT_MAXIMIZE;
872 else
873 tnfa->tag_directions[front_tag] = direction;
874 DPRINT(("Setting t%d direction to %s\n", front_tag,
875 tag_dir_str[tnfa->tag_directions[front_tag]]));
876 tre_purge_regset(regset, tnfa, front_tag);
879 regset[0] = REGSET_UNSET;
880 regset_contains = 0;
883 break;
885 } /* end switch (node->type) */
887 break; /* end case: ADDTAGS_RECURSE */
889 case ADDTAGS_AFTER_ITERATION:
891 tre_iteration_t *iter;
892 tre_ast_node_t *orig;
893 int enter_tag;
895 node = tre_stack_pop_voidptr(stack);
896 orig = node->original ? node->original : node;
897 iter = (tre_iteration_t *)orig->obj;
898 enter_tag = tre_stack_pop_int(stack);
899 if (iter->minimal)
900 minimal_tag = enter_tag;
902 DPRINT(("After iteration\n"));
903 if (first_pass)
905 node->num_tags = iter->arg->num_tags + tre_stack_pop_int(stack);
906 DPRINT((" ADDTAGS_AFTER_ITERATION: node->num_tags = %d\n",
907 node->num_tags));
909 else
911 /* node->last_matched_branch will have the start tag (the tag
912 just *before* the iteration). iter->arg->last_matched_branch
913 will have the tag(s) inside the iteration, the ones that
914 may need to be reset if the iteration doesn't match. So
915 before we merge iter->arg into node, we need to set up
916 a new tre_last_matched_t and tre_last_matched_branch_t,
917 using any of the inside tags as cmp_tag (we choose the first
918 tag found by bit_ffs). If there are no inside tags, we
919 don't bother creating the extra structures. */
920 tre_last_matched_branch_pre_t *b =
921 iter->arg->last_matched_branch;
923 if (b && b->n_tags > 0)
925 tre_last_matched_pre_t *u;
927 bit_ffs(b->tags, num_tags, &b->cmp_tag);
928 DPRINT((" ADDTAGS_AFTER_ITERATION: n_tags=%d "
929 "cmp_tag = %d\n", b->n_tags, b->cmp_tag));
931 u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t) +
932 sizeof(tre_last_matched_branch_pre_t)
933 + bitstr_size(tnfa->num_tags));
934 if (!u)
936 status = REG_ESPACE;
937 break;
939 u->branches = b;
940 u->n_branches = 1;
941 u->start_tag = b->cmp_tag;
942 u->tot_branches = b->tot_branches;
943 u->tot_last_matched = 1 + b->tot_last_matched;
944 u->tot_tags = b->tot_tags;
946 b = (tre_last_matched_branch_pre_t *)(u + 1);
947 b->last_matched = u;
948 b->n_last_matched = 1;
949 b->tot_branches = 1 + u->tot_branches;
950 b->tot_last_matched = u->tot_last_matched;
951 b->tot_tags = u->tot_tags;
953 iter->arg->last_matched_branch = b;
955 status = tre_merge_branches(mem, node, iter->arg, 0,
956 tnfa->num_tags);
957 if (status != REG_OK)
958 break;
960 if (iter->minimal)
962 /* Add a union with a left EMPTY literal and the right
963 being iter->arg. This should force the tags inside
964 the minimal iteration to prefer being unset */
965 if (iter->min == 0 && iter->max <= 1)
967 tre_ast_node_t *u, *e;
969 e = tre_ast_new_literal(mem, EMPTY, -1, -1);
970 if (e == NULL)
972 status = REG_ESPACE;
973 break;
975 u = tre_ast_new_union(mem, e, iter->arg);
976 if (u == NULL)
978 status = REG_ESPACE;
979 break;
981 iter->arg = u;
984 direction = TRE_TAG_MINIMIZE;
986 else
987 direction = TRE_TAG_MAXIMIZE;
988 DPRINT((" Setting direction to %s\n", tag_dir_str[direction]));
990 break;
993 case ADDTAGS_AFTER_CAT_LEFT:
995 int new_tag = tre_stack_pop_int(stack);
996 next_tag = tre_stack_pop_int(stack);
997 DPRINT(("After cat left, tag = %d, next_tag = %d\n",
998 tag, next_tag));
999 if (new_tag >= 0)
1001 DPRINT((" Setting tag to %d\n", new_tag));
1002 tag = new_tag;
1004 break;
1007 case ADDTAGS_AFTER_CAT_RIGHT:
1009 tre_catenation_t *cat;
1011 DPRINT(("After cat right\n"));
1012 node = tre_stack_pop_voidptr(stack);
1013 cat = node->obj;
1014 if (first_pass)
1016 node->num_tags = cat->left->num_tags + cat->right->num_tags;
1017 DPRINT((" ADDTAGS_AFTER_CAT_RIGHT: node->num_tags = %d\n",
1018 node->num_tags));
1020 else
1022 status = tre_merge_branches(mem, cat->left, cat->right, 0,
1023 tnfa->num_tags);
1024 if (status != REG_OK)
1025 break;
1026 status = tre_merge_branches(mem, node, cat->left, 0,
1027 tnfa->num_tags);
1029 break;
1032 case ADDTAGS_AFTER_UNION_LEFT:
1033 DPRINT(("After union left\n"));
1034 /* Lift the bottom of the `regset' array so that when processing
1035 the right operand the items currently in the array are
1036 invisible. The original bottom was saved at ADDTAGS_UNION and
1037 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1038 while (*regset != REGSET_UNSET)
1039 regset++;
1040 regset_contains = 0;
1041 break;
1043 case ADDTAGS_AFTER_UNION_RIGHT:
1045 int added_tags;
1046 tre_ast_node_t *orig;
1047 tre_union_t *uni;
1048 /* Note: node may not be a UNION, but a CATENATION with a left
1049 * tag. So that is why we pass the original node->obj on the
1050 * stack, to get the union's true values. */
1052 DPRINT(("After union right\n"));
1053 node = tre_stack_pop_voidptr(stack);
1054 orig = node->original ? node->original : node;
1055 uni = (tre_union_t *)orig->obj;
1056 added_tags = tre_stack_pop_int(stack);
1057 if (first_pass)
1059 node->num_tags = uni->left->num_tags + uni->right->num_tags
1060 + added_tags;
1061 if (uni->left_tag > 0)
1062 node->num_tags++;
1063 if (uni->right_tag > 0)
1064 node->num_tags++;
1065 DPRINT((" ADDTAGS_AFTER_UNION_RIGHT: node->num_tags = %d\n",
1066 node->num_tags));
1068 regset = tre_stack_pop_voidptr(stack);
1070 /* Add tags after both children, the left child gets a smaller
1071 tag than the right child. This guarantees that we prefer
1072 the left child over the right child. */
1073 /* XXX - This is not always necessary (if the children have
1074 tags which must be seen for every match of that child). */
1075 if (!first_pass && node->make_branches)
1077 tre_last_matched_branch_pre_t *lb =
1078 uni->left->last_matched_branch;
1079 tre_last_matched_branch_pre_t *rb =
1080 uni->right->last_matched_branch;
1081 tre_last_matched_pre_t *lu =
1082 uni->left->last_matched_in_progress;
1083 tre_last_matched_pre_t *ru =
1084 uni->right->last_matched_in_progress;
1085 tre_last_matched_pre_t *u;
1086 /* We don't need to call tre_merge_branches because these
1087 * tags don't participate in submatch ranges, so don't need
1088 * to be recorded. But we do set the cmp_tag entry of the
1089 * tre_last_matched_branch_pre_t, so we might call
1090 * tre_merge_branches if we need to create an empty
1091 * tre_last_matched_branch_pre_t. */
1092 if (uni->left_tag > 0)
1094 DPRINT(("Setting t%d direction to maximize\n",
1095 uni->left_tag));
1096 status = tre_add_tag_right(mem, uni->left, uni->left_tag);
1097 if (status != REG_OK)
1098 break;
1099 tnfa->tag_directions[uni->left_tag] = TRE_TAG_MAXIMIZE;
1100 if (!lb)
1102 status = tre_merge_branches(mem, uni->left, NULL, -1,
1103 tnfa->num_tags);
1104 if (status != REG_OK)
1105 break;
1106 lb = uni->left->last_matched_branch;
1108 lb->cmp_tag = uni->left_tag;
1110 if (uni->right_tag > 0)
1112 DPRINT(("Setting t%d direction to maximize\n",
1113 uni->right_tag));
1114 status = tre_add_tag_right(mem, uni->right, uni->right_tag);
1115 if (status != REG_OK)
1116 break;
1117 tnfa->tag_directions[uni->right_tag] = TRE_TAG_MAXIMIZE;
1118 if (!rb)
1120 status = tre_merge_branches(mem, uni->right, NULL, -1,
1121 tnfa->num_tags);
1122 if (status != REG_OK)
1123 break;
1124 rb = uni->right->last_matched_branch;
1126 rb->cmp_tag = uni->right_tag;
1128 /* Now merge the tre_last_matched_branch_pre_t into a
1129 tre_last_matched_pre_t */
1130 if (lu == NULL)
1132 if (ru == NULL)
1134 /* Create a new tre_last_matched_pre_t */
1135 u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t));
1136 if (!u)
1138 status = REG_ESPACE;
1139 break;
1141 u->tot_last_matched = 1;
1143 if (lb)
1145 u->branches = lb;
1146 u->n_branches = 1;
1147 u->tot_branches += lb->tot_branches;
1148 u->tot_last_matched += lb->tot_last_matched;
1149 u->tot_tags += lb->tot_tags;
1150 if (rb)
1152 lb->next = rb;
1153 u->n_branches++;
1154 u->tot_branches += rb->tot_branches;
1155 u->tot_last_matched += rb->tot_last_matched;
1156 u->tot_tags += rb->tot_tags;
1159 else if (rb)
1161 u->branches = rb;
1162 u->n_branches = 1;
1163 u->tot_branches += rb->tot_branches;
1164 u->tot_last_matched += rb->tot_last_matched;
1165 u->tot_tags += rb->tot_tags;
1168 else
1170 /* Use ru, and add lb */
1171 u = ru;
1172 if (lb)
1174 lb->next = u->branches;
1175 u->branches = lb;
1176 u->n_branches++;
1177 u->tot_branches += lb->tot_branches;
1178 u->tot_last_matched += lb->tot_last_matched;
1179 u->tot_tags += lb->tot_tags;
1183 else if (ru == NULL)
1185 /* Use lu, and add rb */
1186 u = lu;
1187 if (rb)
1189 rb->next = u->branches;
1190 u->branches = rb;
1191 u->n_branches++;
1192 u->tot_branches += rb->tot_branches;
1193 u->tot_last_matched += rb->tot_last_matched;
1194 u->tot_tags += rb->tot_tags;
1197 else
1199 /* Merge lu and ru into lu */
1200 if (lu->branches)
1202 if (ru->branches)
1204 tre_last_matched_branch_pre_t *b = lu->branches;
1205 while (b->next) b = b->next;
1206 b->next = ru->branches;
1207 lu->n_branches += ru->n_branches;
1210 else if (ru->branches)
1212 lu->branches = ru->branches;
1213 lu->n_branches = ru->n_branches;
1215 lu->tot_branches += ru->tot_branches;
1216 lu->tot_last_matched += ru->tot_last_matched - 1;
1217 lu->tot_tags += ru->tot_tags;
1218 u = lu;
1220 node->last_matched_in_progress = u;
1222 direction = TRE_TAG_MAXIMIZE;
1223 break;
1226 case ADDTAGS_AFTER_UNION_TOP: /* only called when not first_pass */
1228 tre_last_matched_branch_pre_t *b;
1229 tre_last_matched_pre_t *u;
1230 int start_tag;
1232 DPRINT(("After union top\n"));
1233 node = tre_stack_pop_voidptr(stack);
1234 start_tag = tre_stack_pop_int(stack);
1235 b = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t)
1236 + bitstr_size(tnfa->num_tags));
1237 if (!b)
1239 status = REG_ESPACE;
1240 break;
1243 u = node->last_matched_in_progress;
1244 u->start_tag = start_tag;
1245 b->tot_branches = 1 + u->tot_branches;
1246 b->tot_last_matched = u->tot_last_matched;
1247 b->tot_tags = u->tot_tags;
1248 b->last_matched = u;
1249 b->n_last_matched = 1;
1250 node->last_matched_branch = b;
1251 node->last_matched_in_progress = NULL;
1252 break;
1255 default:
1256 assert(0);
1257 break;
1259 } /* end switch(symbol) */
1260 } /* end while(tre_stack_num_objects(stack) > bottom) */
1262 if (status != REG_OK)
1264 DPRINT(("Error during %s pass\n", first_pass ? "first" : "second"));
1265 goto error_post_compile;
1268 if (!first_pass)
1270 int i;
1271 if (num_tags != tnfa->num_tags)
1273 DPRINT(("num_tags(%d) != tnfa->num_tags(%d)\n", num_tags,
1274 tnfa->num_tags));
1275 status = REG_BADPAT;
1276 goto error_post_compile;
1279 tre_purge_regset(regset, tnfa, tag);
1280 DPRINT(("Setting t%d to %s\n", num_tags,
1281 tag_dir_str[direction]));
1282 tnfa->tag_directions[num_tags] = direction;
1284 if (rtp > reorder_tags + 2 * tnfa->num_reorder_tags)
1286 DPRINT(("Processed %d reorder tags instead of %d\n",
1287 (int)(rtp - reorder_tags) / 2, tnfa->num_reorder_tags));
1288 status = REG_BADPAT;
1289 goto error_post_compile;
1291 *rtp = -1;
1292 #if TRE_DEBUG
1293 if (reorder_tags[0] >= 0)
1295 DPRINT(("reorder_tags:\n"));
1296 for (rtp = reorder_tags; *rtp >= 0;)
1298 DPRINT(("%d after ", *rtp++));
1299 DPRINT(("%d\n", *rtp++));
1302 else
1303 DPRINT(("No reorder_tags\n"));
1304 #endif /* TRE_DEBUG */
1306 /* Initialize to_reorder */
1307 for (i = 0; i < num_tags; i++)
1308 to_reorder[i] = i;
1309 /* Use to_seq_order to convert reorder_tags values, and use those to
1310 * reorder to_reorder */
1311 for (rtp = reorder_tags; *rtp >= 0;)
1313 int j, high, low;
1314 int ti = *rtp++;
1316 /* Skip reordering the final tag */
1317 if (ti >= num_tags)
1319 DPRINT(("Skipping reorder of %d\n", ti));
1320 rtp++;
1321 continue;
1323 /* The number of the tag to reorder */
1324 high = to_reorder[ti];
1325 /* Reorder after this tag */
1326 low = to_reorder[*rtp++];
1328 DPRINT(("ti=%d high=%d low=%d\n", ti, high, low));
1329 if (low > high)
1331 DPRINT(("Tag %d already before %d\n", high, low));
1332 continue;
1334 for (j = 0; j < num_tags; j++)
1335 if (to_reorder[j] > low && to_reorder[j] < high)
1336 to_reorder[j]++;
1337 to_reorder[ti] = low + 1;
1338 #ifdef TRE_DEBUG
1339 DPRINT(("to_reorder=("));
1340 for (j = 0; j < num_tags; j++)
1342 DPRINT(("%d", to_reorder[j]));
1343 if (j < num_tags - 1)
1344 DPRINT((","));
1346 DPRINT((")\n"));
1347 #endif /* TRE_DEBUG */
1349 /* Determine if reordering in really necessary */
1351 int need_reorder = 0;
1352 for (i = 0; i < num_tags; i++)
1353 if(to_reorder[i] != i)
1355 need_reorder = 1;
1356 break;
1358 /* If need_reorder is not set, free reorder_tags, and set to NULL,
1359 * indicating no reordering is needed */
1360 if (!need_reorder)
1362 DPRINT(("Don't need to reorder\n"));
1363 xfree(reorder_tags);
1364 reorder_tags = NULL;
1369 if (reorder_tags)
1371 int i;
1372 tre_tag_direction_t *new_tag_directions;
1373 #if TRE_DEBUG
1374 DPRINT(("to_reorder:"));
1375 for (i = 0; i < num_tags; i++)
1376 DPRINT((" %d->%d", i, to_reorder[i]));
1377 DPRINT(("\n"));
1378 #endif /* TRE_DEBUG */
1380 DPRINT(("Reordering submatch_data\n"));
1381 for (i = 0; i < tnfa->num_submatches; i++)
1383 #if TRE_DEBUG
1384 int so = tnfa->submatch_data[i].so_tag;
1385 int eo = tnfa->submatch_data[i].eo_tag;
1386 #endif /* TRE_DEBUG */
1387 tnfa->submatch_data[i].so_tag =
1388 to_reorder[tnfa->submatch_data[i].so_tag];
1389 tnfa->submatch_data[i].eo_tag =
1390 tnfa->submatch_data[i].eo_tag < num_tags ?
1391 to_reorder[tnfa->submatch_data[i].eo_tag] :
1392 tnfa->submatch_data[i].eo_tag;
1393 DPRINT(("pmatch[%d]: {%d, %d}->{%d, %d}\n", i, so, eo,
1394 tnfa->submatch_data[i].so_tag,
1395 tnfa->submatch_data[i].eo_tag));
1398 DPRINT(("Reordering tag_directions\n"));
1399 /* We only allocate num_tags directions and reorder them. The
1400 * num_tags-th direction (end tag) is left unchanged. */
1401 new_tag_directions = xmalloc(sizeof(*new_tag_directions) * num_tags);
1402 if (new_tag_directions == NULL)
1404 status = REG_ESPACE;
1405 goto error_post_compile;
1407 for (i = 0; i < num_tags; i++)
1409 new_tag_directions[to_reorder[i]] = tnfa->tag_directions[i];
1411 #if TRE_DEBUG
1412 for (i = 0; i < num_tags; i++)
1414 DPRINT(("t%d %s->%s\n", i,
1415 tag_dir_str[tnfa->tag_directions[i]],
1416 tag_dir_str[new_tag_directions[i]]));
1418 DPRINT(("t%d %s->%s\n", num_tags,
1419 tag_dir_str[tnfa->tag_directions[num_tags]],
1420 tag_dir_str[tnfa->tag_directions[num_tags]]));
1421 #endif /* TRE_DEBUG */
1422 memcpy(tnfa->tag_directions, new_tag_directions, sizeof(*new_tag_directions) * num_tags);
1423 xfree(new_tag_directions);
1425 DPRINT(("Reordering minimal_tags\n"));
1426 for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
1427 tnfa->minimal_tags[i] = tnfa->minimal_tags[i] < num_tags ?
1428 to_reorder[tnfa->minimal_tags[i]] :
1429 tnfa->minimal_tags[i];
1431 DPRINT(("Reordering AST tags\n"));
1432 STACK_PUSH(stack, voidptr, tree);
1433 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1435 node = tre_stack_pop_voidptr(stack);
1437 switch (node->type)
1439 case LITERAL:
1441 tre_literal_t *lit = (tre_literal_t *)node->obj;
1442 if (IS_TAG(lit))
1443 lit->code_max = to_reorder[lit->code_max];
1444 break;
1447 case UNION:
1449 tre_union_t *uni = (tre_union_t *)node->obj;
1450 STACK_PUSHX(stack, voidptr, uni->right);
1451 STACK_PUSHX(stack, voidptr, uni->left);
1452 break;
1455 case CATENATION:
1457 tre_catenation_t *cat = (tre_catenation_t *)node->obj;
1458 STACK_PUSHX(stack, voidptr, cat->right);
1459 STACK_PUSHX(stack, voidptr, cat->left);
1460 break;
1463 case ITERATION:
1465 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1466 STACK_PUSHX(stack, voidptr, iter->arg);
1467 break;
1470 default:
1471 assert(0);
1472 break;
1475 if (status != REG_OK)
1477 DPRINT(("Error while reordering tags\n"));
1478 goto error_post_compile;
1483 if (!first_pass)
1485 if (tree->last_matched_branch)
1487 tre_last_matched_branch_t *buf, *b, *bb;
1488 tre_last_matched_branch_pre_t *bp;
1489 tre_last_matched_t *u, *uu;
1490 tre_last_matched_pre_t *up;
1491 int *t;
1492 int i;
1493 #ifdef TRE_DEBUG
1494 tre_last_matched_branch_t *_b;
1495 tre_last_matched_t *_u;
1496 int *_t;
1498 DPRINT(("last_match_branch_pre:\n"));
1499 print_last_match_branch_pre(tree->last_matched_branch, 0, num_tags);
1500 #endif /* TRE_DEBUG */
1501 buf = (tre_last_matched_branch_t *)xcalloc(1,
1502 tree->last_matched_branch->tot_branches
1503 * sizeof(tre_last_matched_branch_t) +
1504 tree->last_matched_branch->tot_last_matched
1505 * sizeof(tre_last_matched_t) +
1506 tree->last_matched_branch->tot_tags *
1507 sizeof(int));
1508 if (!buf)
1510 status = REG_ESPACE;
1511 goto error_post_compile;
1514 b = buf;
1515 u = (tre_last_matched_t *)(b +
1516 tree->last_matched_branch->tot_branches);
1517 t = (int *)(u + tree->last_matched_branch->tot_last_matched);
1518 #ifdef TRE_DEBUG
1519 _b = b;
1520 _u = u;
1521 _t = t;
1522 #endif /* TRE_DEBUG */
1523 DPRINT(("Copying info_pre to info\n"));
1524 STACK_PUSH(stack, voidptr, tree->last_matched_branch);
1525 STACK_PUSH(stack, int, 1);
1526 STACK_PUSH(stack, int, COPY_LAST_MATCHED_BRANCH);
1528 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1530 switch (tre_stack_pop_int(stack))
1532 case COPY_LAST_MATCHED_BRANCH:
1533 i = tre_stack_pop_int(stack);
1534 /* The tre_last_matched_branch_pre_t * is still on the
1535 stack */
1536 STACK_PUSHX(stack, voidptr, b);
1537 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT);
1538 b += i;
1539 break;
1541 case COPY_LAST_MATCHED_BRANCH_NEXT:
1542 bb = tre_stack_pop_voidptr(stack);
1543 bp = tre_stack_pop_voidptr(stack);
1544 bb->n_last_matched = bp->n_last_matched;
1545 bb->cmp_tag = bp->cmp_tag;
1546 if (bp->n_tags > 0)
1548 int n;
1549 n = bb->n_tags = bp->n_tags;
1550 bb->tags = t;
1551 for (i = 0; i < num_tags; i++)
1552 if (bit_test(bp->tags, i))
1554 *t++ = i;
1555 if (--n <= 0)
1556 break;
1559 if (bp->next)
1561 STACK_PUSHX(stack, voidptr, bp->next);
1562 STACK_PUSHX(stack, voidptr, bb + 1);
1563 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT);
1565 if (bp->n_last_matched > 0)
1567 bb->last_matched = u;
1568 STACK_PUSHX(stack, voidptr, bp->last_matched);
1569 STACK_PUSHX(stack, int, bp->n_last_matched);
1570 STACK_PUSHX(stack, int, COPY_LAST_MATCHED);
1572 break;
1574 case COPY_LAST_MATCHED:
1575 i = tre_stack_pop_int(stack);
1576 /* The tre_last_matched_pre_t * is still on the stack */
1577 STACK_PUSHX(stack, voidptr, u);
1578 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT);
1579 u += i;
1580 break;
1582 case COPY_LAST_MATCHED_NEXT:
1583 uu = tre_stack_pop_voidptr(stack);
1584 up = tre_stack_pop_voidptr(stack);
1585 uu->n_branches = up->n_branches;
1586 uu->branches = b;
1587 uu->start_tag = up->start_tag;
1588 if (up->next)
1590 STACK_PUSHX(stack, voidptr, up->next);
1591 STACK_PUSHX(stack, voidptr, uu + 1);
1592 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT);
1594 STACK_PUSHX(stack, voidptr, up->branches);
1595 STACK_PUSHX(stack, int, up->n_branches);
1596 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH);
1597 break;
1600 if (status != REG_OK)
1601 goto error_post_compile;
1602 #ifdef TRE_DEBUG
1603 DPRINT(("last_matched_branch:\n"));
1604 print_last_match_branch(buf, 0);
1605 if (b != _b + tree->last_matched_branch->tot_branches)
1606 DPRINT(("b/%p != _b + tree->last_matched_branch->tot_branches/%p\n",
1607 b, _b + tree->last_matched_branch->tot_branches));
1608 if (u != _u + tree->last_matched_branch->tot_last_matched)
1609 DPRINT(("u/%p != _u + "
1610 "tree->last_matched_branch->tot_last_matched/%p\n",
1611 u, _u + tree->last_matched_branch->tot_last_matched));
1612 if (t != _t + tree->last_matched_branch->tot_tags)
1613 DPRINT(("t/%p != _t + tree->last_matched_branch->tot_tags/%p\n",
1614 t, _t + tree->last_matched_branch->tot_tags));
1615 #endif /* TRE_DEBUG */
1616 tnfa->last_matched_branch = buf;
1618 #ifdef TRE_DEBUG
1619 else
1620 DPRINT(("No last_match_branch_pre\n"));
1621 #endif /* TRE_DEBUG */
1624 DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
1625 first_pass? "First pass" : "Second pass", num_tags));
1626 #ifdef TRE_DEBUG
1627 tre_ast_print(tree);
1628 #endif /* TRE_DEBUG */
1629 DPRINT(("tre_add_tags: tree->num_tags=%d num_tags=%d\n", tree->num_tags,
1630 num_tags));
1631 assert(tree->num_tags == num_tags);
1632 tnfa->end_tag = num_tags;
1633 tnfa->num_tags = num_tags;
1634 tnfa->num_minimals = num_minimals;
1635 error_post_compile:
1636 xfree(reorder_tags);
1637 error_reorder_tags:
1638 xfree(orig_regset);
1639 error_regset:
1640 return status;
1646 AST to TNFA compilation routines.
1649 typedef enum {
1650 COPY_RECURSE,
1651 COPY_SET_RESULT_PTR
1652 } tre_copyast_symbol_t;
1654 /* Flags for tre_copy_ast(). */
1655 #define COPY_REMOVE_TAGS 1
1656 #define COPY_MAXIMIZE_FIRST_TAG 2
1658 static reg_errcode_t
1659 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1660 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1661 tre_ast_node_t **copy, int *max_pos)
1663 reg_errcode_t status = REG_OK;
1664 int bottom = tre_stack_num_objects(stack);
1665 int num_copied = 0;
1666 int first_tag = 1;
1667 tre_ast_node_t **result = copy;
1668 tre_copyast_symbol_t symbol;
1670 STACK_PUSH(stack, voidptr, ast);
1671 STACK_PUSH(stack, int, COPY_RECURSE);
1673 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1675 tre_ast_node_t *node;
1676 if (status != REG_OK)
1677 break;
1679 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1680 switch (symbol)
1682 case COPY_SET_RESULT_PTR:
1683 result = tre_stack_pop_voidptr(stack);
1684 break;
1685 case COPY_RECURSE:
1686 node = tre_stack_pop_voidptr(stack);
1687 switch (node->type)
1689 case LITERAL:
1691 tre_literal_t *lit = node->obj;
1692 int pos = lit->position;
1693 int min = lit->code_min;
1694 int max = lit->code_max;
1695 tre_bracket_match_list_t *list = !IS_SPECIAL(lit) ?
1696 lit->u.bracket_match_list :
1697 NULL;
1698 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1700 /* XXX - e.g. [ab] has only one position but two
1701 nodes, so we are creating holes in the state space
1702 here. Not fatal, just wastes memory. */
1703 pos += *pos_add;
1704 num_copied++;
1706 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1708 /* Change this tag to empty. */
1709 min = EMPTY;
1710 max = pos = -1;
1712 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1713 && first_tag)
1715 /* Maximize the first tag. */
1716 if (tag_directions[max] == TRE_TAG_LEFT_MAXIMIZE)
1717 tag_directions[max] = TRE_TAG_MAXIMIZE;
1718 first_tag = 0;
1720 *result = tre_ast_new_literal(mem, min, max, pos);
1721 if (*result == NULL)
1722 status = REG_ESPACE;
1724 if (pos > *max_pos)
1725 *max_pos = pos;
1727 if (!IS_SPECIAL(lit))
1728 ((tre_literal_t *)(*result)->obj)->u.bracket_match_list
1729 = list;
1730 break;
1732 case UNION:
1734 tre_union_t *uni = node->obj;
1735 tre_union_t *tmp;
1736 *result = tre_ast_new_union(mem, uni->left, uni->right);
1737 if (*result == NULL)
1739 status = REG_ESPACE;
1740 break;
1742 tmp = (*result)->obj;
1743 result = &tmp->left;
1744 STACK_PUSHX(stack, voidptr, uni->right);
1745 STACK_PUSHX(stack, int, COPY_RECURSE);
1746 STACK_PUSHX(stack, voidptr, &tmp->right);
1747 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1748 STACK_PUSHX(stack, voidptr, uni->left);
1749 STACK_PUSHX(stack, int, COPY_RECURSE);
1750 break;
1752 case CATENATION:
1754 tre_catenation_t *cat = node->obj;
1755 tre_catenation_t *tmp;
1756 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1757 if (*result == NULL)
1759 status = REG_ESPACE;
1760 break;
1762 tmp = (*result)->obj;
1763 tmp->left = NULL;
1764 tmp->right = NULL;
1765 result = &tmp->left;
1767 STACK_PUSHX(stack, voidptr, cat->right);
1768 STACK_PUSHX(stack, int, COPY_RECURSE);
1769 STACK_PUSHX(stack, voidptr, &tmp->right);
1770 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1771 STACK_PUSHX(stack, voidptr, cat->left);
1772 STACK_PUSHX(stack, int, COPY_RECURSE);
1773 break;
1775 case ITERATION:
1777 tre_iteration_t *iter = node->obj;
1778 STACK_PUSHX(stack, voidptr, iter->arg);
1779 STACK_PUSHX(stack, int, COPY_RECURSE);
1780 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1781 iter->max, iter->minimal);
1782 if (*result == NULL)
1784 status = REG_ESPACE;
1785 break;
1787 iter = (*result)->obj;
1788 result = &iter->arg;
1789 break;
1791 default:
1792 assert(0);
1793 break;
1795 break;
1798 *pos_add += num_copied;
1799 return status;
1802 typedef enum {
1803 EXPAND_RECURSE,
1804 EXPAND_AFTER_ITER
1805 } tre_expand_ast_symbol_t;
1807 /* Expands each iteration node that has a finite nonzero minimum or maximum
1808 iteration count to a catenated sequence of copies of the node. */
1809 static reg_errcode_t
1810 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1811 int *position, tre_tag_direction_t *tag_directions,
1812 int *max_depth)
1814 reg_errcode_t status = REG_OK;
1815 int bottom = tre_stack_num_objects(stack);
1816 int pos_add = 0;
1817 int pos_add_total = 0;
1818 int max_pos = 0;
1819 #ifdef TRE_APPROX
1820 /* Current approximate matching parameters. */
1821 int params[TRE_PARAM_LAST];
1822 /* Approximate parameter nesting level. */
1823 int params_depth = 0;
1824 #endif /* TRE_APPROX */
1825 int iter_depth = 0;
1826 #ifdef TRE_APPROX
1827 int i;
1828 #endif /* TRE_APPROX */
1830 #ifdef TRE_APPROX
1831 for (i = 0; i < TRE_PARAM_LAST; i++)
1832 params[i] = TRE_PARAM_DEFAULT;
1833 #endif /* TRE_APPROX */
1835 STACK_PUSHR(stack, voidptr, ast);
1836 STACK_PUSHR(stack, int, EXPAND_RECURSE);
1837 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1839 tre_ast_node_t *node;
1840 tre_expand_ast_symbol_t symbol;
1842 if (status != REG_OK)
1843 break;
1845 DPRINT(("pos_add %d\n", pos_add));
1847 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1848 node = tre_stack_pop_voidptr(stack);
1849 switch (symbol)
1851 case EXPAND_RECURSE:
1852 switch (node->type)
1854 case LITERAL:
1856 tre_literal_t *lit= node->obj;
1857 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1859 lit->position += pos_add;
1860 if (lit->position > max_pos)
1861 max_pos = lit->position;
1863 break;
1865 case UNION:
1867 tre_union_t *uni = node->obj;
1868 STACK_PUSHX(stack, voidptr, uni->right);
1869 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1870 STACK_PUSHX(stack, voidptr, uni->left);
1871 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1872 break;
1874 case CATENATION:
1876 tre_catenation_t *cat = node->obj;
1877 STACK_PUSHX(stack, voidptr, cat->right);
1878 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1879 STACK_PUSHX(stack, voidptr, cat->left);
1880 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1881 break;
1883 case ITERATION:
1885 tre_iteration_t *iter = node->obj;
1886 STACK_PUSHX(stack, int, pos_add);
1887 STACK_PUSHX(stack, voidptr, node);
1888 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1889 STACK_PUSHX(stack, voidptr, iter->arg);
1890 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1891 /* If we are going to expand this node at EXPAND_AFTER_ITER
1892 then don't increase the `pos' fields of the nodes now, it
1893 will get done when expanding. */
1894 if (iter->min > 1 || iter->max > 1)
1895 pos_add = 0;
1896 iter_depth++;
1897 DPRINT(("iter\n"));
1898 break;
1900 default:
1901 assert(0);
1902 break;
1904 break;
1905 case EXPAND_AFTER_ITER:
1907 tre_iteration_t *iter = node->obj;
1908 int pos_add_last;
1909 pos_add = tre_stack_pop_int(stack);
1910 pos_add_last = pos_add;
1911 /* Originally (in tre_parse_bound), if min == 0 && max == 0, we
1912 immediate replace the whole iteration with EMPTY. This
1913 unfortunately drops any submatches, and messes up setting the
1914 pmatch values (we can get tags of -1, and tag values in the
1915 billions). So we left it there and replace with EMPTY here. */
1916 if (iter->min == 0 && iter->max == 0)
1918 tre_ast_node_t *empty = tre_ast_new_literal(mem, EMPTY, -1, -1);
1919 if (empty == NULL)
1920 return REG_ESPACE;
1921 node->obj = empty->obj;
1922 node->type = empty->type;
1924 else if (iter->min > 1 || iter->max > 1)
1926 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1927 int j;
1928 int pos_add_save = pos_add;
1930 /* Create a catenated sequence of copies of the node. */
1931 for (j = 0; j < iter->min; j++)
1933 tre_ast_node_t *copy;
1934 /* Remove tags from all but the last copy. */
1935 int flags = ((j + 1 < iter->min)
1936 ? COPY_REMOVE_TAGS
1937 : COPY_MAXIMIZE_FIRST_TAG);
1938 DPRINT((" pos_add %d\n", pos_add));
1939 pos_add_save = pos_add;
1940 status = tre_copy_ast(mem, stack, iter->arg, flags,
1941 &pos_add, tag_directions, &copy,
1942 &max_pos);
1943 if (status != REG_OK)
1944 return status;
1945 if (seq1 != NULL)
1946 seq1 = tre_ast_new_catenation(mem, seq1, copy);
1947 else
1948 seq1 = copy;
1949 if (seq1 == NULL)
1950 return REG_ESPACE;
1953 if (iter->max == -1)
1955 /* No upper limit. */
1956 pos_add_save = pos_add;
1957 status = tre_copy_ast(mem, stack, iter->arg, 0,
1958 &pos_add, NULL, &seq2, &max_pos);
1959 if (status != REG_OK)
1960 return status;
1961 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1962 if (seq2 == NULL)
1963 return REG_ESPACE;
1965 else
1967 for (j = iter->min; j < iter->max; j++)
1969 tre_ast_node_t *tmp, *copy;
1970 pos_add_save = pos_add;
1971 status = tre_copy_ast(mem, stack, iter->arg, 0,
1972 &pos_add, NULL, &copy, &max_pos);
1973 if (status != REG_OK)
1974 return status;
1975 if (seq2 != NULL)
1976 seq2 = tre_ast_new_catenation(mem, copy, seq2);
1977 else
1978 seq2 = copy;
1979 if (seq2 == NULL)
1980 return REG_ESPACE;
1981 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1982 if (tmp == NULL)
1983 return REG_ESPACE;
1984 seq2 = tre_ast_new_union(mem, tmp, seq2);
1985 if (seq2 == NULL)
1986 return REG_ESPACE;
1990 pos_add = pos_add_save;
1991 if (seq1 == NULL)
1992 seq1 = seq2;
1993 else if (seq2 != NULL)
1994 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1995 if (seq1 == NULL)
1996 return REG_ESPACE;
1997 node->obj = seq1->obj;
1998 node->type = seq1->type;
2001 iter_depth--;
2002 pos_add_total += pos_add - pos_add_last;
2003 if (iter_depth == 0)
2004 pos_add = pos_add_total;
2006 #ifdef TRE_APPROX
2007 /* If approximate parameters are specified, surround the result
2008 with two parameter setting nodes. The one on the left sets
2009 the specified parameters, and the one on the right restores
2010 the old parameters. */
2011 if (iter->params)
2013 tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
2014 int *old_params;
2016 tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
2017 if (!tmp_l)
2018 return REG_ESPACE;
2019 ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
2020 iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
2021 tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
2022 if (!tmp_r)
2023 return REG_ESPACE;
2024 old_params = tre_mem_alloc(mem, sizeof(*old_params)
2025 * TRE_PARAM_LAST);
2026 if (!old_params)
2027 return REG_ESPACE;
2028 for (i = 0; i < TRE_PARAM_LAST; i++)
2029 old_params[i] = params[i];
2030 ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
2031 old_params[TRE_PARAM_DEPTH] = params_depth;
2032 /* XXX - this is the only place where ast_new_node is
2033 needed -- should be moved inside AST module. */
2034 node_copy = tre_ast_new_node(mem, ITERATION,
2035 sizeof(tre_iteration_t));
2036 if (!node_copy)
2037 return REG_ESPACE;
2038 node_copy->obj = node->obj;
2039 tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
2040 if (!tmp_node)
2041 return REG_ESPACE;
2042 tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
2043 if (!tmp_node)
2044 return REG_ESPACE;
2045 /* Replace the contents of `node' with `tmp_node'. */
2046 memcpy(node, tmp_node, sizeof(*node));
2047 node->obj = tmp_node->obj;
2048 node->type = tmp_node->type;
2049 params_depth++;
2050 if (params_depth > *max_depth)
2051 *max_depth = params_depth;
2053 #endif /* TRE_APPROX */
2054 break;
2056 default:
2057 assert(0);
2058 break;
2062 *position += pos_add_total;
2064 /* `max_pos' should never be larger than `*position' if the above
2065 code works, but just an extra safeguard let's make sure
2066 `*position' is set large enough so enough memory will be
2067 allocated for the transition table. */
2068 if (max_pos > *position)
2069 *position = max_pos;
2071 #ifdef TRE_DEBUG
2072 DPRINT(("Expanded AST:\n"));
2073 tre_ast_print(ast);
2074 DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
2075 #endif
2077 return status;
2080 static tre_pos_and_tags_t *
2081 tre_set_empty(tre_mem_t mem)
2083 tre_pos_and_tags_t *new_set;
2085 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2086 if (new_set == NULL)
2087 return NULL;
2089 new_set[0].position = -1;
2090 new_set[0].code_min = -1;
2091 new_set[0].code_max = -1;
2093 return new_set;
2096 static tre_pos_and_tags_t *
2097 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2098 tre_bracket_match_list_t *bracket_match_list, int backref)
2100 tre_pos_and_tags_t *new_set;
2102 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2103 if (new_set == NULL)
2104 return NULL;
2106 new_set[0].position = position;
2107 new_set[0].code_min = code_min;
2108 new_set[0].code_max = code_max;
2109 new_set[0].bracket_match_list = bracket_match_list;
2110 new_set[0].backref = backref;
2111 new_set[1].position = -1;
2112 new_set[1].code_min = -1;
2113 new_set[1].code_max = -1;
2115 return new_set;
2118 static tre_pos_and_tags_t *
2119 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2120 int *tags, int assertions, int *params)
2122 int s1, s2, i, j;
2123 tre_pos_and_tags_t *new_set;
2124 int *new_tags;
2125 int num_tags;
2127 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2128 for (s1 = 0; set1[s1].position >= 0; s1++);
2129 for (s2 = 0; set2[s2].position >= 0; s2++);
2130 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2131 if (!new_set )
2132 return NULL;
2134 for (s1 = 0; set1[s1].position >= 0; s1++)
2136 new_set[s1].position = set1[s1].position;
2137 new_set[s1].code_min = set1[s1].code_min;
2138 new_set[s1].code_max = set1[s1].code_max;
2139 new_set[s1].assertions = set1[s1].assertions | assertions;
2140 new_set[s1].bracket_match_list = set1[s1].bracket_match_list;
2141 new_set[s1].backref = set1[s1].backref;
2142 if (set1[s1].tags == NULL && tags == NULL)
2143 new_set[s1].tags = NULL;
2144 else
2146 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2147 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2148 * (i + num_tags + 1)));
2149 if (new_tags == NULL)
2150 return NULL;
2151 for (j = 0; j < i; j++)
2152 new_tags[j] = set1[s1].tags[j];
2153 for (i = 0; i < num_tags; i++)
2154 new_tags[j + i] = tags[i];
2155 new_tags[j + i] = -1;
2156 new_set[s1].tags = new_tags;
2158 if (set1[s1].params)
2159 new_set[s1].params = set1[s1].params;
2160 if (params)
2162 if (!new_set[s1].params)
2163 new_set[s1].params = params;
2164 else
2166 new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
2167 TRE_PARAM_LAST);
2168 if (!new_set[s1].params)
2169 return NULL;
2170 for (i = 0; i < TRE_PARAM_LAST; i++)
2171 if (params[i] != TRE_PARAM_UNSET)
2172 new_set[s1].params[i] = params[i];
2177 for (s2 = 0; set2[s2].position >= 0; s2++)
2179 new_set[s1 + s2].position = set2[s2].position;
2180 new_set[s1 + s2].code_min = set2[s2].code_min;
2181 new_set[s1 + s2].code_max = set2[s2].code_max;
2182 /* XXX - why not | assertions here as well? */
2183 new_set[s1 + s2].assertions = set2[s2].assertions;
2184 new_set[s1 + s2].bracket_match_list = set2[s2].bracket_match_list;
2185 new_set[s1 + s2].backref = set2[s2].backref;
2186 if (set2[s2].tags == NULL)
2187 new_set[s1 + s2].tags = NULL;
2188 else
2190 for (i = 0; set2[s2].tags[i] >= 0; i++);
2191 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2192 if (new_tags == NULL)
2193 return NULL;
2194 for (j = 0; j < i; j++)
2195 new_tags[j] = set2[s2].tags[j];
2196 new_tags[j] = -1;
2197 new_set[s1 + s2].tags = new_tags;
2199 if (set2[s2].params)
2200 new_set[s1 + s2].params = set2[s2].params;
2201 if (params)
2203 if (!new_set[s1 + s2].params)
2204 new_set[s1 + s2].params = params;
2205 else
2207 new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
2208 TRE_PARAM_LAST);
2209 if (!new_set[s1 + s2].params)
2210 return NULL;
2211 for (i = 0; i < TRE_PARAM_LAST; i++)
2212 if (params[i] != TRE_PARAM_UNSET)
2213 new_set[s1 + s2].params[i] = params[i];
2217 new_set[s1 + s2].position = -1;
2218 return new_set;
2221 /* Finds the empty path through `node' which is the one that should be
2222 taken according to POSIX.2 rules, and adds the tags on that path to
2223 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2224 set to the number of tags seen on the path. */
2225 static reg_errcode_t
2226 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2227 int *assertions, int *params, int *num_tags_seen,
2228 int *params_seen)
2230 tre_literal_t *lit;
2231 tre_union_t *uni;
2232 tre_catenation_t *cat;
2233 tre_iteration_t *iter;
2234 int i;
2235 int bottom = tre_stack_num_objects(stack);
2236 reg_errcode_t status = REG_OK;
2237 if (num_tags_seen)
2238 *num_tags_seen = 0;
2239 if (params_seen)
2240 *params_seen = 0;
2242 status = tre_stack_push_voidptr(stack, node);
2244 /* Walk through the tree recursively. */
2245 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2247 node = tre_stack_pop_voidptr(stack);
2249 switch (node->type)
2251 case LITERAL:
2252 lit = (tre_literal_t *)node->obj;
2253 switch (lit->code_min)
2255 case TAG:
2256 if (lit->code_max >= 0)
2258 if (tags != NULL)
2260 /* Add the tag to `tags'. */
2261 for (i = 0; tags[i] >= 0; i++)
2262 if (tags[i] == lit->code_max)
2263 break;
2264 if (tags[i] < 0)
2266 tags[i] = lit->code_max;
2267 tags[i + 1] = -1;
2270 if (num_tags_seen)
2271 (*num_tags_seen)++;
2273 break;
2274 case ASSERTION:
2275 assert(lit->code_max >= 1
2276 || lit->code_max <= ASSERT_LAST);
2277 if (assertions != NULL)
2278 *assertions |= lit->code_max;
2279 break;
2280 case PARAMETER:
2281 if (params != NULL)
2282 for (i = 0; i < TRE_PARAM_LAST; i++)
2283 params[i] = lit->u.params[i];
2284 if (params_seen != NULL)
2285 *params_seen = 1;
2286 break;
2287 case EMPTY:
2288 break;
2289 default:
2290 assert(0);
2291 break;
2293 break;
2295 case UNION:
2296 /* Subexpressions starting earlier take priority over ones
2297 starting later, so we prefer the left subexpression over the
2298 right subexpression. */
2299 uni = (tre_union_t *)node->obj;
2300 if (uni->left->nullable)
2301 STACK_PUSHX(stack, voidptr, uni->left)
2302 else if (uni->right->nullable)
2303 STACK_PUSHX(stack, voidptr, uni->right)
2304 else
2305 assert(0);
2306 break;
2308 case CATENATION:
2309 /* The path must go through both children. */
2310 cat = (tre_catenation_t *)node->obj;
2311 assert(cat->left->nullable);
2312 assert(cat->right->nullable);
2313 STACK_PUSHX(stack, voidptr, cat->left);
2314 STACK_PUSHX(stack, voidptr, cat->right);
2315 break;
2317 case ITERATION:
2318 /* A match with an empty string is preferred over no match at
2319 all, so we go through the argument if possible. */
2320 iter = (tre_iteration_t *)node->obj;
2321 if (iter->arg->nullable)
2322 STACK_PUSHX(stack, voidptr, iter->arg);
2323 break;
2325 default:
2326 assert(0);
2327 break;
2331 return status;
2335 typedef enum {
2336 NFL_RECURSE,
2337 NFL_POST_UNION,
2338 NFL_POST_CATENATION,
2339 NFL_POST_ITERATION
2340 } tre_nfl_stack_symbol_t;
2343 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2344 the nodes of the AST `tree'. */
2345 static reg_errcode_t
2346 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2348 int bottom = tre_stack_num_objects(stack);
2350 STACK_PUSHR(stack, voidptr, tree);
2351 STACK_PUSHR(stack, int, NFL_RECURSE);
2353 while (tre_stack_num_objects(stack) > bottom)
2355 tre_nfl_stack_symbol_t symbol;
2356 tre_ast_node_t *node;
2358 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2359 node = tre_stack_pop_voidptr(stack);
2360 switch (symbol)
2362 case NFL_RECURSE:
2363 switch (node->type)
2365 case LITERAL:
2367 tre_literal_t *lit = (tre_literal_t *)node->obj;
2368 if (IS_BACKREF(lit))
2370 /* Back references: nullable = false, firstpos = {i},
2371 lastpos = {i}. */
2372 node->nullable = 0;
2373 node->firstpos = tre_set_one(mem, lit->position, 0,
2374 TRE_CHAR_MAX, NULL, -1);
2375 if (!node->firstpos)
2376 return REG_ESPACE;
2377 node->lastpos = tre_set_one(mem, lit->position, 0,
2378 TRE_CHAR_MAX, NULL,
2379 (int)lit->code_max);
2380 if (!node->lastpos)
2381 return REG_ESPACE;
2383 else if (lit->code_min < 0)
2385 /* Tags, empty strings, params, and zero width assertions:
2386 nullable = true, firstpos = {}, and lastpos = {}. */
2387 node->nullable = 1;
2388 node->firstpos = tre_set_empty(mem);
2389 if (!node->firstpos)
2390 return REG_ESPACE;
2391 node->lastpos = tre_set_empty(mem);
2392 if (!node->lastpos)
2393 return REG_ESPACE;
2395 else
2397 /* Literal at position i: nullable = false, firstpos = {i},
2398 lastpos = {i}. */
2399 node->nullable = 0;
2400 node->firstpos =
2401 tre_set_one(mem, lit->position, (int)lit->code_min,
2402 (int)lit->code_max, NULL, -1);
2403 if (!node->firstpos)
2404 return REG_ESPACE;
2405 node->lastpos = tre_set_one(mem, lit->position,
2406 (int)lit->code_min,
2407 (int)lit->code_max,
2408 lit->u.bracket_match_list,
2409 -1);
2410 if (!node->lastpos)
2411 return REG_ESPACE;
2413 break;
2416 case UNION:
2417 /* Compute the attributes for the two subtrees, and after that
2418 for this node. */
2419 STACK_PUSHR(stack, voidptr, node);
2420 STACK_PUSHR(stack, int, NFL_POST_UNION);
2421 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2422 STACK_PUSHR(stack, int, NFL_RECURSE);
2423 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2424 STACK_PUSHR(stack, int, NFL_RECURSE);
2425 break;
2427 case CATENATION:
2428 /* Compute the attributes for the two subtrees, and after that
2429 for this node. */
2430 STACK_PUSHR(stack, voidptr, node);
2431 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2432 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2433 STACK_PUSHR(stack, int, NFL_RECURSE);
2434 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2435 STACK_PUSHR(stack, int, NFL_RECURSE);
2436 break;
2438 case ITERATION:
2439 /* Compute the attributes for the subtree, and after that for
2440 this node. */
2441 STACK_PUSHR(stack, voidptr, node);
2442 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2443 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2444 STACK_PUSHR(stack, int, NFL_RECURSE);
2445 break;
2447 break; /* end case: NFL_RECURSE */
2449 case NFL_POST_UNION:
2451 tre_union_t *uni = (tre_union_t *)node->obj;
2452 node->nullable = uni->left->nullable || uni->right->nullable;
2453 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2454 uni->right->firstpos, NULL, 0, NULL);
2455 if (!node->firstpos)
2456 return REG_ESPACE;
2457 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2458 uni->right->lastpos, NULL, 0, NULL);
2459 if (!node->lastpos)
2460 return REG_ESPACE;
2461 break;
2464 case NFL_POST_ITERATION:
2466 int num_tags, *tags, assertions, params_seen;
2467 int *params;
2468 reg_errcode_t status;
2469 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2471 /* From Ville Laurikari's original 2001 Master's thesis, the
2472 firstpos(n) and lastpos(n) of an iteration is just the
2473 corresponding values of the iteration's argument. Unfortunately,
2474 this isn't sufficient for the following BRE:
2476 \(a*\)*b\(\1\) matched against ab
2478 The backreference wants to force the first subexpression to
2479 be the empty string, but there is no transition for this. So
2480 we need to modify the lastpos(n) of an iteration to be the
2481 equivalent of that of catentation. Using the same notation as
2482 in the thesis, lastpos(n) is redefined as:
2484 if nullable(c1) then
2485 lastpos(c1) U
2486 addtags(lastpos(c1),
2487 emptymatch(c1))
2488 else
2489 lastpos(c1)
2491 where c1 is the argument node. firstpos(n) remains the same. */
2493 /* Compute lastpos. */
2494 if (iter->min == 0 || iter->arg->nullable)
2496 node->nullable = 1;
2497 if (iter->arg->nullable)
2499 /* The arg matches the empty string. Make a first pass
2500 with tre_match_empty() to get the number of tags and
2501 parameters. */
2502 status = tre_match_empty(stack, iter->arg,
2503 NULL, NULL, NULL, &num_tags,
2504 &params_seen);
2505 if (status != REG_OK)
2506 return status;
2507 /* Allocate arrays for the tags and parameters. */
2508 tags = xmalloc(sizeof(int) * (num_tags + 1));
2509 if (!tags)
2510 return REG_ESPACE;
2511 tags[0] = -1;
2512 assertions = 0;
2513 params = NULL;
2514 if (params_seen)
2516 params = tre_mem_alloc(mem, sizeof(*params)
2517 * TRE_PARAM_LAST);
2518 if (!params)
2520 xfree(tags);
2521 return REG_ESPACE;
2524 /* Second pass with tre_mach_empty() to get the list of
2525 tags and parameters. */
2526 status = tre_match_empty(stack, iter->arg, tags,
2527 &assertions, params, NULL, NULL);
2528 if (status != REG_OK)
2530 xfree(tags);
2531 return status;
2533 node->lastpos =
2534 tre_set_union(mem, iter->arg->lastpos, iter->arg->lastpos,
2535 tags, assertions, params);
2536 xfree(tags);
2537 if (!node->lastpos)
2538 return REG_ESPACE;
2540 else
2541 node->lastpos = iter->arg->lastpos;
2543 else
2545 node->nullable = 0;
2546 node->lastpos = iter->arg->lastpos;
2548 node->firstpos = iter->arg->firstpos;
2549 break;
2552 case NFL_POST_CATENATION:
2554 int num_tags, *tags, assertions, params_seen;
2555 int *params;
2556 reg_errcode_t status;
2557 tre_catenation_t *cat = node->obj;
2558 node->nullable = cat->left->nullable && cat->right->nullable;
2560 /* Compute firstpos. */
2561 if (cat->left->nullable)
2563 /* The left side matches the empty string. Make a first pass
2564 with tre_match_empty() to get the number of tags and
2565 parameters. */
2566 status = tre_match_empty(stack, cat->left,
2567 NULL, NULL, NULL, &num_tags,
2568 &params_seen);
2569 if (status != REG_OK)
2570 return status;
2571 /* Allocate arrays for the tags and parameters. */
2572 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2573 if (!tags)
2574 return REG_ESPACE;
2575 tags[0] = -1;
2576 assertions = 0;
2577 params = NULL;
2578 if (params_seen)
2580 params = tre_mem_alloc(mem, sizeof(*params)
2581 * TRE_PARAM_LAST);
2582 if (!params)
2584 xfree(tags);
2585 return REG_ESPACE;
2588 /* Second pass with tre_mach_empty() to get the list of
2589 tags and parameters. */
2590 status = tre_match_empty(stack, cat->left, tags,
2591 &assertions, params, NULL, NULL);
2592 if (status != REG_OK)
2594 xfree(tags);
2595 return status;
2597 node->firstpos =
2598 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2599 tags, assertions, params);
2600 xfree(tags);
2601 if (!node->firstpos)
2602 return REG_ESPACE;
2604 else
2606 node->firstpos = cat->left->firstpos;
2609 /* Compute lastpos. */
2610 if (cat->right->nullable)
2612 /* The right side matches the empty string. Make a first pass
2613 with tre_match_empty() to get the number of tags and
2614 parameters. */
2615 status = tre_match_empty(stack, cat->right,
2616 NULL, NULL, NULL, &num_tags,
2617 &params_seen);
2618 if (status != REG_OK)
2619 return status;
2620 /* Allocate arrays for the tags and parameters. */
2621 tags = xmalloc(sizeof(int) * (num_tags + 1));
2622 if (!tags)
2623 return REG_ESPACE;
2624 tags[0] = -1;
2625 assertions = 0;
2626 params = NULL;
2627 if (params_seen)
2629 params = tre_mem_alloc(mem, sizeof(*params)
2630 * TRE_PARAM_LAST);
2631 if (!params)
2633 xfree(tags);
2634 return REG_ESPACE;
2637 /* Second pass with tre_mach_empty() to get the list of
2638 tags and parameters. */
2639 status = tre_match_empty(stack, cat->right, tags,
2640 &assertions, params, NULL, NULL);
2641 if (status != REG_OK)
2643 xfree(tags);
2644 return status;
2646 node->lastpos =
2647 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2648 tags, assertions, params);
2649 xfree(tags);
2650 if (!node->lastpos)
2651 return REG_ESPACE;
2653 else
2655 node->lastpos = cat->right->lastpos;
2657 break;
2660 default:
2661 assert(0);
2662 break;
2666 return REG_OK;
2670 /* Adds a transition from each position in `p1' to each position in `p2'. */
2671 static reg_errcode_t
2672 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2673 tre_tnfa_transition_t *transitions,
2674 int *counts, int *offs)
2676 tre_pos_and_tags_t *orig_p2 = p2;
2677 tre_tnfa_transition_t *trans;
2678 int i, j, k, l, dup, prev_p2_pos;
2680 if (transitions != NULL)
2681 while (p1->position >= 0)
2683 p2 = orig_p2;
2684 prev_p2_pos = -1;
2685 while (p2->position >= 0)
2687 /* Optimization: if this position was already handled, skip it. */
2688 if (p2->position == prev_p2_pos)
2690 p2++;
2691 continue;
2693 prev_p2_pos = p2->position;
2694 /* Set `trans' to point to the next unused transition from
2695 position `p1->position'. */
2696 trans = transitions + offs[p1->position];
2697 while (trans->state != NULL)
2699 #if 0
2700 /* If we find a previous transition from `p1->position' to
2701 `p2->position', it is overwritten. This can happen only
2702 if there are nested loops in the regexp, like in "((a)*)*".
2703 In POSIX.2 repetition using the outer loop is always
2704 preferred over using the inner loop. Therefore the
2705 transition for the inner loop is useless and can be thrown
2706 away. */
2707 /* XXX - The same position is used for all nodes in a bracket
2708 expression, so this optimization cannot be used (it will
2709 break bracket expressions) unless I figure out a way to
2710 detect it here. */
2711 if (trans->state_id == p2->position)
2713 DPRINT(("*"));
2714 break;
2716 #endif
2717 trans++;
2720 if (trans->state == NULL)
2721 (trans + 1)->state = NULL;
2722 /* Use the character ranges, assertions, etc. from `p1' for
2723 the transition from `p1' to `p2'. */
2724 trans->code_min = p1->code_min;
2725 trans->code_max = p1->code_max;
2726 trans->state = transitions + offs[p2->position];
2727 trans->state_id = p2->position;
2728 trans->assertions = p1->assertions | p2->assertions
2729 | (p1->bracket_match_list != NULL ? ASSERT_BRACKET_MATCH : 0);
2730 if (p1->backref >= 0)
2732 assert((trans->assertions & ASSERT_BRACKET_MATCH) == 0);
2733 assert(p2->backref < 0);
2734 trans->u.backref = p1->backref;
2735 trans->assertions |= ASSERT_BACKREF;
2737 if (p1->bracket_match_list != NULL)
2739 trans->u.bracket_match_list =
2740 xmalloc(SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list));
2741 if (trans->u.bracket_match_list == NULL)
2742 return REG_ESPACE;
2743 memcpy(trans->u.bracket_match_list, p1->bracket_match_list,
2744 SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list));
2747 /* Find out how many tags this transition has. */
2748 i = 0;
2749 if (p1->tags != NULL)
2750 while(p1->tags[i] >= 0)
2751 i++;
2752 j = 0;
2753 if (p2->tags != NULL)
2754 while(p2->tags[j] >= 0)
2755 j++;
2757 /* If we are overwriting a transition, free the old tag array. */
2758 if (trans->tags != NULL)
2759 xfree(trans->tags);
2760 trans->tags = NULL;
2762 /* If there were any tags, allocate an array and fill it. */
2763 if (i + j > 0)
2765 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2766 if (!trans->tags)
2767 return REG_ESPACE;
2768 i = 0;
2769 if (p1->tags != NULL)
2770 while(p1->tags[i] >= 0)
2772 trans->tags[i] = p1->tags[i];
2773 i++;
2775 l = i;
2776 j = 0;
2777 if (p2->tags != NULL)
2778 while (p2->tags[j] >= 0)
2780 /* Don't add duplicates. */
2781 dup = 0;
2782 for (k = 0; k < i; k++)
2783 if (trans->tags[k] == p2->tags[j])
2785 dup = 1;
2786 break;
2788 if (!dup)
2789 trans->tags[l++] = p2->tags[j];
2790 j++;
2792 trans->tags[l] = -1;
2795 /* Set the parameter array. If both `p2' and `p1' have same
2796 parameters, the values in `p2' override those in `p1'. */
2797 if (p1->params || p2->params)
2799 if (!trans->params)
2800 trans->params = xmalloc(sizeof(*trans->params)
2801 * TRE_PARAM_LAST);
2802 if (!trans->params)
2803 return REG_ESPACE;
2804 for (i = 0; i < TRE_PARAM_LAST; i++)
2806 trans->params[i] = TRE_PARAM_UNSET;
2807 if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
2808 trans->params[i] = p1->params[i];
2809 if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
2810 trans->params[i] = p2->params[i];
2813 else
2815 if (trans->params)
2816 xfree(trans->params);
2817 trans->params = NULL;
2821 #ifdef TRE_DEBUG
2823 int *tags;
2825 DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
2826 p1->code_min));
2827 if (p1->code_max != p1->code_min)
2828 DPRINT(("-%3d", p1->code_max));
2829 tags = trans->tags;
2830 if (tags)
2832 DPRINT((", tags ["));
2833 while (*tags >= 0)
2835 DPRINT(("%d", *tags));
2836 tags++;
2837 if (*tags >= 0)
2838 DPRINT((","));
2840 DPRINT(("]"));
2842 if (trans->assertions)
2843 DPRINT((", assert %d", trans->assertions));
2844 if (trans->assertions & ASSERT_BACKREF)
2845 DPRINT((", backref %d", trans->u.backref));
2846 else if (trans->assertions & ASSERT_BRACKET_MATCH)
2847 DPRINT((", bracket_match_list %p",
2848 trans->u.bracket_match_list));
2849 if (trans->params)
2851 DPRINT((", "));
2852 tre_print_params(trans->params);
2854 DPRINT(("\n"));
2856 #endif /* TRE_DEBUG */
2857 p2++;
2859 p1++;
2861 else
2862 /* Compute a maximum limit for the number of transitions leaving
2863 from each state. */
2864 while (p1->position >= 0)
2866 p2 = orig_p2;
2867 while (p2->position >= 0)
2869 counts[p1->position]++;
2870 p2++;
2872 p1++;
2874 return REG_OK;
2877 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
2878 labelled with one character range (there are no transitions on empty
2879 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
2880 the regexp. */
2881 static reg_errcode_t
2882 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2883 int *counts, int *offs)
2885 tre_union_t *uni;
2886 tre_catenation_t *cat;
2887 tre_iteration_t *iter;
2888 reg_errcode_t errcode = REG_OK;
2890 /* XXX - recurse using a stack!. */
2891 switch (node->type)
2893 case LITERAL:
2894 break;
2895 case UNION:
2896 uni = (tre_union_t *)node->obj;
2897 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2898 if (errcode != REG_OK)
2899 return errcode;
2900 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2901 break;
2903 case CATENATION:
2904 cat = (tre_catenation_t *)node->obj;
2905 /* Add a transition from each position in cat->left->lastpos
2906 to each position in cat->right->firstpos. */
2907 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2908 transitions, counts, offs);
2909 if (errcode != REG_OK)
2910 return errcode;
2911 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2912 if (errcode != REG_OK)
2913 return errcode;
2914 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2915 break;
2917 case ITERATION:
2918 iter = (tre_iteration_t *)node->obj;
2919 assert(iter->max == -1 || iter->max == 1);
2921 if (iter->max == -1)
2923 assert(iter->min == 0 || iter->min == 1);
2924 /* Add a transition from each last position in the iterated
2925 expression to each first position. */
2926 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2927 transitions, counts, offs);
2928 if (errcode != REG_OK)
2929 return errcode;
2931 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2932 break;
2934 return errcode;
2938 #define ERROR_EXIT(err) \
2939 do \
2941 errcode = err; \
2942 if (/*CONSTCOND*/1) \
2943 goto error_exit; \
2945 while (/*CONSTCOND*/0)
2949 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags,
2950 locale_t loc)
2952 tre_stack_t *stack;
2953 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2954 tre_pos_and_tags_t *p;
2955 int *counts = NULL, *offs = NULL;
2956 int i, add = 0;
2957 tre_tnfa_transition_t *transitions, *initial;
2958 tre_tnfa_t *tnfa = NULL;
2959 tre_submatch_data_t *submatch_data = NULL;
2960 tre_tag_direction_t *tag_directions = NULL;
2961 reg_errcode_t errcode;
2962 tre_mem_t mem;
2964 /* Parse context. */
2965 tre_parse_ctx_t parse_ctx;
2967 /* Allocate a stack used throughout the compilation process for various
2968 purposes. */
2969 stack = tre_stack_new(512, 10240, 128);
2970 if (!stack)
2971 return REG_ESPACE;
2972 /* Allocate a fast memory allocator. */
2973 mem = tre_mem_new();
2974 if (!mem)
2976 tre_stack_destroy(stack);
2977 return REG_ESPACE;
2980 /* Parse the regexp. */
2981 memset(&parse_ctx, 0, sizeof(parse_ctx));
2982 parse_ctx.mem = mem;
2983 parse_ctx.stack = stack;
2984 parse_ctx.re = regex;
2985 parse_ctx.len = n;
2986 /* Only allow REG_UNGREEDY to be set if both REG_ENHANCED and REG_EXTENDED
2987 are also set */
2988 if ((cflags & (REG_ENHANCED | REG_EXTENDED)) != (REG_ENHANCED | REG_EXTENDED))
2989 cflags &= ~REG_UNGREEDY;
2990 parse_ctx.cflags = cflags;
2991 parse_ctx.max_backref = -1;
2992 parse_ctx.loc = loc;
2993 parse_ctx.submatch_id_invisible = SUBMATCH_ID_INVISIBLE_START;
2995 DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
2996 errcode = tre_parse(&parse_ctx);
2997 if (errcode != REG_OK)
2998 ERROR_EXIT(errcode);
2999 preg->re_nsub = parse_ctx.submatch_id - 1;
3000 tree = parse_ctx.result;
3002 /* Back references and approximate matching cannot currently be used
3003 in the same regexp. */
3004 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
3005 ERROR_EXIT(REG_BADPAT);
3007 #ifdef TRE_DEBUG
3008 tre_ast_print(tree);
3009 #endif /* TRE_DEBUG */
3011 /* Referring to nonexistent subexpressions is illegal. */
3012 if (parse_ctx.max_backref > (int)preg->re_nsub)
3013 ERROR_EXIT(REG_ESUBREG);
3015 /* Allocate the TNFA struct. */
3016 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
3017 if (tnfa == NULL)
3018 ERROR_EXIT(REG_ESPACE);
3019 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
3020 tnfa->have_approx = parse_ctx.have_approx;
3021 tnfa->num_submatches = parse_ctx.submatch_id;
3022 tnfa->num_submatches_invisible = parse_ctx.submatch_id_invisible
3023 - SUBMATCH_ID_INVISIBLE_START;
3024 tnfa->num_reorder_tags = parse_ctx.num_reorder_tags;
3025 tnfa->loc = parse_ctx.loc;
3027 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
3028 regexp does not have back references, this can be skipped. */
3029 if (tnfa->num_reorder_tags > 0 || !(cflags & REG_NOSUB))
3031 DPRINT(("tre_compile: setting up tags\n"));
3033 /* Figure out how many tags we will need. */
3034 errcode = tre_add_tags(NULL, stack, tree, tnfa);
3035 if (errcode != REG_OK)
3036 ERROR_EXIT(errcode);
3037 #ifdef TRE_DEBUG
3038 tre_ast_print(tree);
3039 #endif /* TRE_DEBUG */
3041 if (tnfa->num_tags > 0)
3043 tag_directions = xmalloc(sizeof(*tag_directions)
3044 * (tnfa->num_tags + 1));
3045 if (tag_directions == NULL)
3046 ERROR_EXIT(REG_ESPACE);
3047 tnfa->tag_directions = tag_directions;
3048 memset(tag_directions, -1,
3049 sizeof(*tag_directions) * (tnfa->num_tags + 1));
3051 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 3,
3052 sizeof(tnfa->minimal_tags));
3053 if (tnfa->minimal_tags == NULL)
3054 ERROR_EXIT(REG_ESPACE);
3056 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
3057 sizeof(*submatch_data));
3058 if (submatch_data == NULL)
3059 ERROR_EXIT(REG_ESPACE);
3060 /* Set the eo_tag value to -1 to indicate that that corresponding
3061 * submatch has not be completed yet */
3062 for (i = 0; i < parse_ctx.submatch_id; i++)
3064 submatch_data[i].eo_tag = -1;
3066 tnfa->submatch_data = submatch_data;
3068 errcode = tre_add_tags(mem, stack, tree, tnfa);
3069 if (errcode != REG_OK)
3070 ERROR_EXIT(errcode);
3072 #ifdef TRE_DEBUG
3073 for (i = 0; i < parse_ctx.submatch_id; i++)
3074 DPRINT(("pmatch[%d] = {t%d, t%d}\n",
3075 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
3076 for (i = 0; i <= tnfa->num_tags; i++)
3077 DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]]));
3078 #endif /* TRE_DEBUG */
3081 /* Expand iteration nodes. */
3082 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
3083 tag_directions, &tnfa->params_depth);
3084 if (errcode != REG_OK)
3085 ERROR_EXIT(errcode);
3087 /* Add a dummy node for the final state.
3088 XXX - For certain patterns this dummy node can be optimized away,
3089 for example "a*" or "ab*". Figure out a simple way to detect
3090 this possibility. */
3091 tmp_ast_l = tree;
3092 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
3093 if (tmp_ast_r == NULL)
3094 ERROR_EXIT(REG_ESPACE);
3096 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
3097 if (tree == NULL)
3098 ERROR_EXIT(REG_ESPACE);
3100 #ifdef TRE_DEBUG
3101 tre_ast_print(tree);
3102 DPRINT(("Number of states: %d\n", parse_ctx.position));
3103 if (submatch_data)
3104 for (i = 0; i < parse_ctx.submatch_id; i++)
3105 DPRINT(("pmatch[%d] = {t%d, t%d}\n",
3106 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
3107 if (tag_directions)
3108 for (i = 0; i <= tnfa->num_tags; i++)
3109 DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]]));
3110 #endif /* TRE_DEBUG */
3112 errcode = tre_compute_nfl(mem, stack, tree);
3113 if (errcode != REG_OK)
3114 ERROR_EXIT(errcode);
3116 counts = xmalloc(sizeof(int) * parse_ctx.position);
3117 if (counts == NULL)
3118 ERROR_EXIT(REG_ESPACE);
3120 offs = xmalloc(sizeof(int) * parse_ctx.position);
3121 if (offs == NULL)
3122 ERROR_EXIT(REG_ESPACE);
3124 for (i = 0; i < parse_ctx.position; i++)
3125 counts[i] = 0;
3126 tre_ast_to_tnfa(tree, NULL, counts, NULL);
3128 add = 0;
3129 for (i = 0; i < parse_ctx.position; i++)
3131 offs[i] = add;
3132 add += counts[i] + 1;
3133 counts[i] = 0;
3135 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
3136 if (transitions == NULL)
3137 ERROR_EXIT(REG_ESPACE);
3138 tnfa->transitions = transitions;
3139 tnfa->num_transitions = add;
3141 DPRINT(("Converting to TNFA:\n"));
3142 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
3143 if (errcode != REG_OK)
3144 ERROR_EXIT(errcode);
3147 /* Set first_char only if there is only one character that can be the
3148 first character of a match */
3149 tnfa->first_char = -1;
3150 if (!tmp_ast_l->nullable)
3152 int scanning = 1;
3153 for (p = tree->firstpos; scanning && p->position >= 0; p++)
3155 tre_tnfa_transition_t *j = transitions + offs[p->position];
3156 while (j->state != NULL)
3158 if (j->code_min <= j->code_max)
3160 if (j->code_max != j->code_min || j->code_min == -1 || tnfa->first_char != -1)
3162 tnfa->first_char = -1;
3163 scanning = 0;
3164 break;
3166 tnfa->first_char = j->code_min;
3168 j++;
3171 #ifdef TRE_DEBUG
3172 if (tnfa->first_char >= 0)
3173 DPRINT(("first char must be %d\n", tnfa->first_char));
3174 #endif /* TRE_DEBUG */
3177 p = tree->firstpos;
3178 i = 0;
3179 while (p->position >= 0)
3181 i++;
3183 #ifdef TRE_DEBUG
3185 int *tags;
3186 DPRINT(("initial: %d", p->position));
3187 tags = p->tags;
3188 if (tags != NULL)
3190 if (*tags >= 0)
3191 DPRINT(("/"));
3192 while (*tags >= 0)
3194 DPRINT(("%d", *tags));
3195 tags++;
3196 if (*tags >= 0)
3197 DPRINT((","));
3200 DPRINT((", assert %d", p->assertions));
3201 if (p->params)
3203 DPRINT((", "));
3204 tre_print_params(p->params);
3206 DPRINT(("\n"));
3208 #endif /* TRE_DEBUG */
3210 p++;
3213 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
3214 if (initial == NULL)
3215 ERROR_EXIT(REG_ESPACE);
3216 tnfa->initial = initial;
3218 i = 0;
3219 for (p = tree->firstpos; p->position >= 0; p++)
3221 initial[i].state = transitions + offs[p->position];
3222 initial[i].state_id = p->position;
3223 initial[i].tags = NULL;
3224 /* Copy the arrays p->tags, and p->params, they are allocated
3225 from a tre_mem object. */
3226 if (p->tags)
3228 int j;
3229 for (j = 0; p->tags[j] >= 0; j++);
3230 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
3231 if (!initial[i].tags)
3232 ERROR_EXIT(REG_ESPACE);
3233 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
3235 initial[i].params = NULL;
3236 if (p->params)
3238 initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
3239 if (!initial[i].params)
3240 ERROR_EXIT(REG_ESPACE);
3241 memcpy(initial[i].params, p->params,
3242 sizeof(*p->params) * TRE_PARAM_LAST);
3244 initial[i].assertions = p->assertions;
3245 i++;
3247 initial[i].state = NULL;
3249 tnfa->num_transitions = add;
3250 tnfa->final = transitions + offs[tree->lastpos[0].position];
3251 tnfa->num_states = parse_ctx.position;
3252 tnfa->cflags = cflags;
3254 DPRINT(("final state %d (%p)\n", tree->lastpos[0].position,
3255 (void *)tnfa->final));
3257 tre_mem_destroy(mem);
3258 tre_stack_destroy(stack);
3259 xfree(counts);
3260 xfree(offs);
3262 #ifdef TRE_USE_SYSTEM_REGEX_H
3263 preg->re_magic = RE_MAGIC;
3264 #endif /* TRE_USE_SYSTEM_REGEX_H */
3265 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3266 xlocale_retain(tnfa->loc);
3267 return REG_OK;
3269 error_exit:
3270 /* Free everything that was allocated and return the error code. */
3271 tre_mem_destroy(mem);
3272 if (stack != NULL)
3273 tre_stack_destroy(stack);
3274 if (counts != NULL)
3275 xfree(counts);
3276 if (offs != NULL)
3277 xfree(offs);
3279 /* Set tnfa into preg, so that calling tre_free() will free the contents
3280 of tnfa. NULL out the loc field since we never retained the locale. */
3281 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
3282 if(tnfa) tnfa->loc = NULL;
3284 tre_free(preg);
3285 return errcode;
3291 void
3292 tre_free(regex_t *preg)
3294 tre_tnfa_t *tnfa;
3295 unsigned int i;
3296 tre_tnfa_transition_t *trans;
3298 #ifdef TRE_USE_SYSTEM_REGEX_H
3299 preg->re_magic = 0;
3300 #endif /* TRE_USE_SYSTEM_REGEX_H */
3301 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
3302 if (!tnfa)
3303 return;
3304 preg->TRE_REGEX_T_FIELD = NULL;
3306 for (i = 0; i < tnfa->num_transitions; i++)
3307 if (tnfa->transitions[i].state)
3309 if (tnfa->transitions[i].tags)
3310 xfree(tnfa->transitions[i].tags);
3311 if (tnfa->transitions[i].assertions & ASSERT_BRACKET_MATCH)
3312 xfree(tnfa->transitions[i].u.bracket_match_list);
3313 if (tnfa->transitions[i].params)
3314 xfree(tnfa->transitions[i].params);
3316 if (tnfa->transitions)
3317 xfree(tnfa->transitions);
3319 if (tnfa->initial)
3321 for (trans = tnfa->initial; trans->state; trans++)
3323 if (trans->tags)
3324 xfree(trans->tags);
3325 if (trans->params)
3326 xfree(trans->params);
3328 xfree(tnfa->initial);
3331 if (tnfa->submatch_data)
3333 xfree(tnfa->submatch_data);
3336 if (tnfa->tag_directions)
3337 xfree(tnfa->tag_directions);
3338 if (tnfa->minimal_tags)
3339 xfree(tnfa->minimal_tags);
3341 if (tnfa->loc)
3342 xlocale_release(tnfa->loc);
3344 if (tnfa->last_matched_branch)
3345 xfree(tnfa->last_matched_branch);
3347 xfree(tnfa);
3350 char *
3351 tre_version(void)
3353 static char str[256];
3354 char *version;
3356 if (str[0] == 0)
3358 (void) tre_config(TRE_CONFIG_VERSION, &version);
3359 (void) snprintf(str, sizeof(str), "TRE %s (BSD)", version);
3361 return str;
3365 tre_config(int query, void *result)
3367 int *int_result = result;
3368 const char **string_result = result;
3370 switch (query)
3372 case TRE_CONFIG_APPROX:
3373 #ifdef TRE_APPROX
3374 *int_result = 1;
3375 #else /* !TRE_APPROX */
3376 *int_result = 0;
3377 #endif /* !TRE_APPROX */
3378 return REG_OK;
3380 case TRE_CONFIG_WCHAR:
3381 #ifdef TRE_WCHAR
3382 *int_result = 1;
3383 #else /* !TRE_WCHAR */
3384 *int_result = 0;
3385 #endif /* !TRE_WCHAR */
3386 return REG_OK;
3388 case TRE_CONFIG_MULTIBYTE:
3389 #ifdef TRE_MULTIBYTE
3390 *int_result = 1;
3391 #else /* !TRE_MULTIBYTE */
3392 *int_result = 0;
3393 #endif /* !TRE_MULTIBYTE */
3394 return REG_OK;
3396 case TRE_CONFIG_SYSTEM_ABI:
3397 #ifdef TRE_CONFIG_SYSTEM_ABI
3398 *int_result = 1;
3399 #else /* !TRE_CONFIG_SYSTEM_ABI */
3400 *int_result = 0;
3401 #endif /* !TRE_CONFIG_SYSTEM_ABI */
3402 return REG_OK;
3404 case TRE_CONFIG_VERSION:
3405 *string_result = TRE_VERSION;
3406 return REG_OK;
3409 return REG_NOMATCH;
3413 /* EOF */