2 tre-parse.c - Regexp parser
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
18 #endif /* HAVE_CONFIG_H */
27 #include "tre-stack.h"
28 #include "tre-parse.h"
30 #include "xlocale_private.h"
34 Before looking up a collating symbol, check if the name matches in
35 the character names (cnames) array; if so, use the corresponding
38 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
39 the implementation choice that for ERE, a non-numeric character following
40 a left brace that would normally be a bound, causes the left brace to be
42 #define BSD_COMPATIBILITY
43 #ifdef BSD_COMPATIBILITY
45 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
46 #endif /* BSD_COMPATIBILITY */
48 /* Characters with special meanings in regexp syntax. */
49 #define CHAR_PIPE L'|'
50 #define CHAR_LPAREN L'('
51 #define CHAR_RPAREN L')'
52 #define CHAR_LBRACE L'{'
53 #define CHAR_RBRACE L'}'
54 #define CHAR_LBRACKET L'['
55 #define CHAR_RBRACKET L']'
56 #define CHAR_MINUS L'-'
57 #define CHAR_STAR L'*'
58 #define CHAR_QUESTIONMARK L'?'
59 #define CHAR_PLUS L'+'
60 #define CHAR_PERIOD L'.'
61 #define CHAR_COLON L':'
62 #define CHAR_EQUAL L'='
63 #define CHAR_COMMA L','
64 #define CHAR_CARET L'^'
65 #define CHAR_DOLLAR L'$'
66 #define CHAR_BACKSLASH L'\\'
67 #define CHAR_HASH L'#'
68 #define CHAR_TILDE L'~'
71 /* Some macros for expanding \w, \s, etc. */
72 static const struct tre_macro_struct
{
74 const char *expansion
;
76 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
77 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
78 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
79 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
84 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
85 must have at least `len' items. Sets buf[0] to zero if the there
86 is no match in `tre_macros'. */
88 tre_expand_macro(const tre_char_t
*regex
, const tre_char_t
*regex_end
,
89 tre_char_t
*buf
, size_t buf_len
)
94 if (regex
>= regex_end
)
97 for (i
= 0; tre_macros
[i
].expansion
; i
++)
99 if (tre_macros
[i
].c
== *regex
)
102 DPRINT(("Expanding macro '%c' => '%s'\n",
103 tre_macros
[i
].c
, tre_macros
[i
].expansion
));
104 for (j
= 0; tre_macros
[i
].expansion
[j
] && j
< buf_len
; j
++)
105 buf
[j
] = tre_macros
[i
].expansion
[j
];
113 tre_new_item(tre_mem_t mem
, int type
, int val
, int *max_i
,
114 tre_bracket_match_list_t
**items
)
116 reg_errcode_t status
= REG_OK
;
117 tre_bracket_match_list_t
*array
= *items
;
118 int i
= array
->num_bracket_matches
;
119 /* Allocate more space if necessary. */
122 tre_bracket_match_list_t
*new_items
;
123 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i
));
124 /* If the array is already 1024 items large, give up -- there's
125 probably an error in the regexp (e.g. not a '\0' terminated
126 string and missing ']') */
130 new_items
= xrealloc(array
, SIZEOF_BRACKET_MATCH_LIST_N(*max_i
));
131 if (new_items
== NULL
)
133 *items
= array
= new_items
;
135 array
->bracket_matches
[i
].type
= type
;
136 array
->bracket_matches
[i
].value
= val
;
137 array
->num_bracket_matches
++;
141 #ifndef TRE_USE_SYSTEM_WCTYPE
143 /* isalnum() and the rest may be macros, so wrap them to functions. */
144 int tre_isalnum_func(tre_cint_t c
) { return tre_isalnum(c
); }
145 int tre_isalpha_func(tre_cint_t c
) { return tre_isalpha(c
); }
148 int tre_isascii_func(tre_cint_t c
) { return tre_isascii(c
); }
149 #else /* !tre_isascii */
150 int tre_isascii_func(tre_cint_t c
) { return !(c
>> 7); }
151 #endif /* !tre_isascii */
154 int tre_isblank_func(tre_cint_t c
) { return tre_isblank(c
); }
155 #else /* !tre_isblank */
156 int tre_isblank_func(tre_cint_t c
) { return ((c
== ' ') || (c
== '\t')); }
157 #endif /* !tre_isblank */
159 int tre_iscntrl_func(tre_cint_t c
) { return tre_iscntrl(c
); }
160 int tre_isdigit_func(tre_cint_t c
) { return tre_isdigit(c
); }
161 int tre_isgraph_func(tre_cint_t c
) { return tre_isgraph(c
); }
162 int tre_islower_func(tre_cint_t c
) { return tre_islower(c
); }
163 int tre_isprint_func(tre_cint_t c
) { return tre_isprint(c
); }
164 int tre_ispunct_func(tre_cint_t c
) { return tre_ispunct(c
); }
165 int tre_isspace_func(tre_cint_t c
) { return tre_isspace(c
); }
166 int tre_isupper_func(tre_cint_t c
) { return tre_isupper(c
); }
167 int tre_isxdigit_func(tre_cint_t c
) { return tre_isxdigit(c
); }
171 int (*func
)(tre_cint_t
);
172 } tre_ctype_map
[] = {
173 { "alnum", &tre_isalnum_func
},
174 { "alpha", &tre_isalpha_func
},
176 { "ascii", &tre_isascii_func
},
177 #endif /* tre_isascii */
179 { "blank", &tre_isblank_func
},
180 #endif /* tre_isblank */
181 { "cntrl", &tre_iscntrl_func
},
182 { "digit", &tre_isdigit_func
},
183 { "graph", &tre_isgraph_func
},
184 { "lower", &tre_islower_func
},
185 { "print", &tre_isprint_func
},
186 { "punct", &tre_ispunct_func
},
187 { "space", &tre_isspace_func
},
188 { "upper", &tre_isupper_func
},
189 { "xdigit", &tre_isxdigit_func
},
193 tre_ctype_t
tre_ctype(const char *name
)
196 for (i
= 0; tre_ctype_map
[i
].name
!= NULL
; i
++)
198 if (strcmp(name
, tre_ctype_map
[i
].name
) == 0)
199 return tre_ctype_map
[i
].func
;
201 return (tre_ctype_t
)0;
203 #endif /* !TRE_USE_SYSTEM_WCTYPE */
205 #define REST(re) (int)(ctx->re_end - (re)), (re)
207 #define START_COLLATING_SYMBOLS 16
208 #define MAX_COLLATING_SYMBOL_LEN 4
211 const tre_char_t
*start
;
213 } tre_collating_symbol
;
215 #ifdef BSD_COMPATIBILITY
217 tre_search_cnames(const wchar_t *name
, size_t len
)
220 size_t high
= NCNAMES
- 1;
226 cur
= (low
+ high
) / 2;
227 cmp
= wcsncmp(name
, cnames
[cur
].name
, len
);
228 if (cmp
== 0 && cnames
[cur
].name
[len
] == 0) return cnames
[cur
].code
;
229 if (cmp
> 0) low
= cur
+ 1;
234 #endif /* BSD_COMPATIBILITY */
236 /* Scan the contents of a bracket expression, and create a
237 * tre_bracket_match_list_t encoding the bracket expression. If during
238 * the scan, multi-character collating symbols are detected, switch
239 * into a mode to collect those MCCSs into a tre_collating_symbol
240 * list and pass them back. tre_parse_bracket will use that to
241 * create a new string composed of a union of the bracket expression
242 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
243 * call tre_parse (recursive) to parse that new string (which will
244 * call tre_parse_bracket and tre_parse_bracket_items again. */
246 tre_parse_bracket_items(tre_parse_ctx_t
*ctx
, tre_bracket_match_list_t
**items
,
247 int *items_size
, tre_collating_symbol
**result
)
249 const tre_char_t
*re
= ctx
->re
;
250 const tre_char_t
*re_end
= ctx
->re_end
;
251 tre_collating_symbol
*col_syms
= NULL
;
252 tre_collating_symbol
*cp
= NULL
;
254 reg_errcode_t status
;
255 int max_i
= *items_size
;
256 int other
= 0; /* contains content other than multi-character collating
258 int range
= -1; /* -1 unset, 0 begin range set, +1 end range expected */
260 int invert
= ((*items
)->flags
& TRE_BRACKET_MATCH_FLAG_NEGATE
);
261 int collect_MCCS
= 0;
262 const tre_char_t
*start
;
264 for ( ;re
< re_end
; re
++)
272 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
278 /* The hyphen is the end range */
281 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
283 goto process_end_range
;
285 if (re
+ 1 >= re_end
)
290 /* The hyphen is at the end */
291 if (re
[1] == CHAR_RBRACKET
)
293 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
295 goto process_begin_range
;
297 /* Two ranges are not allowed to share an endpoint, or begin
298 * range is illegal. */
304 range
= 1; /* Expect end range */
305 DPRINT(("tre_parse_bracket: range: '%.*" STRF
"'\n", REST(re
)));
309 if (re
+ 1 >= re_end
)
324 status
= REG_ECOLLATE
;
327 if (*re
== CHAR_PERIOD
)
329 if (re
+ 1 >= re_end
)
331 status
= REG_ECOLLATE
;
335 if (re
[1] == CHAR_RBRACKET
)
337 DPRINT(("tre_parse_bracket: collating "
338 "symbol: '%.*" STRF
"'\n",
343 status
= REG_ECOLLATE
;
346 #ifdef BSD_COMPATIBILITY
347 /* Check if the name is in cnames; if so, use
348 the corresponding code */
349 c
= tre_search_cnames(start
, re
- start
);
350 if (c
!= (wchar_t)-1)
353 goto process_single_character
;
355 #endif /* BSD_COMPATIBILITY */
356 /* Verify this is a known sequence */
357 if (__collate_equiv_value(ctx
->loc
, start
,
360 status
= REG_ECOLLATE
;
363 /* Process single character collating symbols */
368 goto process_single_character
;
370 /* Inverted MCCSs are undefined */
373 status
= REG_ECOLLATE
;
376 /* Can't have MCCSs as an endpoint to a range */
383 /* Switch into MCCS collection mode (if not
389 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
391 #else /* !TRE_DEBUG */
393 #endif /* !TRE_DEBUG */
394 /* Allocate a memory block the first time */
397 if ((col_syms
= xmalloc(sizeof(*col_syms
) *
398 (START_COLLATING_SYMBOLS
+ 2)))
402 n_col_syms
= START_COLLATING_SYMBOLS
;
404 /* Enlarge the memory block is more is needed */
405 if ((cp
- col_syms
) - 1 >= n_col_syms
)
408 tre_collating_symbol
*tmp
=
409 xrealloc(col_syms
, sizeof(*col_syms
) *
410 ((n_col_syms
*= 2) + 2));
416 DPRINT(("tre_list_collating_symbols: "
417 "Enlarging col_syms to %d\n",
420 cp
= col_syms
+ i
+ 1;
423 cp
->len
= re
- start
;
436 /* Process equivalence and character classes */
437 tre_char_t kind
= re
[1];
439 /* Can't have a class as an endpoint to a range */
445 if (!collect_MCCS
&& range
== 0)
447 status
= tre_new_item(ctx
->mem
, TRE_BRACKET_MATCH_TYPE_CHAR
,
449 if (status
!= REG_OK
)
459 status
= kind
== CHAR_EQUAL
? REG_ECOLLATE
: REG_ECTYPE
;
464 if (re
+ 1 >= re_end
)
466 status
= kind
== CHAR_EQUAL
? REG_ECOLLATE
:
471 if (re
[1] == CHAR_RBRACKET
)
475 /* Empty class name */
476 status
= kind
== CHAR_EQUAL
? REG_ECOLLATE
:
480 /* Process equivalence class */
481 if (kind
== CHAR_EQUAL
)
485 DPRINT(("tre_parse_bracket: equivalence: '%.*"
486 STRF
"'\n", REST(start
- 2)));
488 /* While we find the collation value even for
489 multi-character collating elements , we
490 don't (yet) match any collation values
491 against multi-character sequences. We'd have
492 to enumerate those multi-character sequences
493 and like multi-character collating symbols,
494 create a union of those sequences with the
495 rest of the bracket expression. While
496 doable, a bracket expression matching
497 multiple characters, that doesn't explicitly
498 contain multi-character sequences, might
499 be unexpected, so we punt for now. */
500 if ((equiv
= __collate_equiv_value(ctx
->loc
,
501 start
, re
- start
)) <= 0)
503 /* The standard says that if no collating
504 element if found, we use the collating
505 symbol itself. But __collate_equiv_value
506 doesn't make a distinction between
507 an element that is in a equvalence
508 class with others, or is the only member,
509 so we already know there is no collating
510 symbol. (Note that in the case of a
511 collating element whose collation value
512 is unique, matching against the
513 collating element itself, or against
514 its collation value, is equivalent.) */
515 #ifdef BSD_COMPATIBILITY
516 /* Check if the name is in cnames; if so,
517 use the corresponding code */
518 c
= tre_search_cnames(start
, re
- start
);
519 if (c
!= (wchar_t)-1)
522 goto process_single_character
;
524 #endif /* BSD_COMPATIBILITY */
525 status
= REG_ECOLLATE
;
530 status
= tre_new_item(ctx
->mem
,
531 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE
,
532 equiv
, &max_i
, items
);
533 if (status
!= REG_OK
)
539 /* Process character class */
540 DPRINT(("tre_parse_bracket: class: '%.*" STRF
541 "'\n", REST(start
- 2)));
546 int len
= MIN(re
- start
, 63);
549 tre_char_t tmp_wcs
[64];
550 wcsncpy(tmp_wcs
, start
, (size_t)len
);
551 tmp_wcs
[len
] = L
'\0';
552 #if defined HAVE_WCSRTOMBS
555 const tre_char_t
*src
= tmp_wcs
;
556 memset(&state
, '\0', sizeof(state
));
557 len
= wcsrtombs_l(tmp_str
, &src
,
558 sizeof(tmp_str
), &state
,
561 #elif defined HAVE_WCSTOMBS
562 len
= wcstombs(tmp_str
, tmp_wcs
, 63);
563 #endif /* defined HAVE_WCSTOMBS */
565 #else /* !TRE_WCHAR */
566 strncpy(tmp_str
, (const char*)start
, len
);
567 #endif /* !TRE_WCHAR */
569 DPRINT((" class name: %s\n", tmp_str
));
570 class = tre_ctype_l(tmp_str
, ctx
->loc
);
576 status
= tre_new_item(ctx
->mem
,
577 TRE_BRACKET_MATCH_TYPE_CLASS
,
578 class, &max_i
, items
);
579 if (status
!= REG_OK
)
593 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
595 goto process_single_character
;
601 /* A first right bracket */
604 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
613 DPRINT(("tre_parse_bracket: done: '%.*" STRF
"'\n",
617 /* Mark the character following the right bracket. Set len
618 * to whether there are other things besides the
619 * multi-character collating symbols */
620 col_syms
->start
= re
+ 1;
621 col_syms
->len
= other
;
622 /* Mark the end of the list */
628 /* range > 0 is not possible, since we did a lookahead after the
632 status
= tre_new_item(ctx
->mem
, TRE_BRACKET_MATCH_TYPE_CHAR
,
634 if (status
!= REG_OK
)
637 DPRINT(("tre_parse_bracket: done: '%.*" STRF
"'\n", REST(re
)));
643 DPRINT(("tre_parse_bracket: char: '%.*" STRF
"'\n", REST(re
)));
645 process_single_character
:
646 /* Process single character */
652 /* Get collation equivalence values */
653 mine
= __collate_equiv_value(ctx
->loc
, &min
, 1);
654 maxe
= __collate_equiv_value(ctx
->loc
, &c
, 1);
662 status
= tre_new_item(ctx
->mem
,
663 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN
,
664 mine
, &max_i
, items
);
665 if (status
!= REG_OK
)
667 status
= tre_new_item(ctx
->mem
,
668 TRE_BRACKET_MATCH_TYPE_RANGE_END
,
669 maxe
, &max_i
, items
);
670 if (status
!= REG_OK
)
682 status
= tre_new_item(ctx
->mem
,
683 TRE_BRACKET_MATCH_TYPE_CHAR
,
685 if (status
!= REG_OK
)
698 DPRINT(("tre_parse_bracket: error: '%.*" STRF
"', status=%d\n",
706 static const char *bracket_match_type_str
[] = {
714 #endif /* TRE_DEBUG */
717 tre_parse_bracket(tre_parse_ctx_t
*ctx
, tre_ast_node_t
**result
)
719 tre_ast_node_t
*node
;
720 reg_errcode_t status
= REG_OK
;
721 tre_bracket_match_list_t
*items
;
723 tre_collating_symbol
*col_syms
= NULL
;
725 /* Handle special cases [[:<:]] and [[:>:]] */
726 if (ctx
->re_end
- ctx
->re
>= 6 && ctx
->re
[0] == CHAR_LBRACKET
727 && ctx
->re
[1] == CHAR_COLON
&& (ctx
->re
[2] == L
'<' || ctx
->re
[2] == L
'>')
728 && ctx
->re
[3] == CHAR_COLON
&& ctx
->re
[4] == CHAR_RBRACKET
729 && ctx
->re
[5] == CHAR_RBRACKET
)
731 *result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
732 (ctx
->re
[2] == L
'<') ? ASSERT_AT_BOW
: ASSERT_AT_EOW
,
734 DPRINT(("tre_parse_bracket: special case %s\n", (ctx
->re
[2] == L
'<') ?
735 "[[:<:]]" : "[[:>:]]"));
737 return *result
? REG_OK
: REG_ESPACE
;
740 /* Start off with an array of `max_i' elements. */
741 items
= xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i
));
745 if (*ctx
->re
== CHAR_CARET
)
747 DPRINT(("tre_parse_bracket: negate: '%.*" STRF
"'\n", REST(ctx
->re
)));
748 items
->flags
|= TRE_BRACKET_MATCH_FLAG_NEGATE
;
752 status
= tre_parse_bracket_items(ctx
, &items
, &max_i
, &col_syms
);
754 if (status
!= REG_OK
)
755 goto parse_bracket_done
;
757 /* If there are collating symbols, split off the multi-character ones
758 * into a union of the bracket expression (without the collating symbols)
759 * and the multiple-character sequences. We create an equivalent input
760 * string and run tre_parse() recursively */
763 tre_char_t
*str
, *sp
;
764 tre_collating_symbol
*cp
;
765 tre_parse_ctx_t subctx
;
767 /* Allocate a new string. We start with the size of the original
768 * bracket expression (minus 1) and add 2 (for a leading "[" and
769 * a trailing nil; don't need a "^", since it is illegal to have
770 * inverted MCCSs). Since a multi-character collating symbols
771 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
772 * need to worry about the new string getting too long. */
774 str
= xmalloc(sizeof(*str
) * ((col_syms
->start
- ctx
->re
) + 2));
781 if (col_syms
->len
> 0)
783 /* There are other items in the bracket expression besides the
784 * multi-character collating symbols, so create a new bracket
785 * expression with only those other itmes. */
786 const tre_char_t
*re
;
791 for (cp
= col_syms
+ 1; cp
->start
; cp
++)
793 /* The "- 2" is to account for the "[." */
794 if ((i
= ((cp
->start
- re
) - 2)) > 0)
796 memcpy(sp
, re
, sizeof(*sp
) * i
);
799 /* The "+ 2" is to account for the ".]" */
800 re
= cp
->start
+ cp
->len
+ 2;
802 i
= col_syms
->start
- re
; /* Includes the trailing right bracket */
803 memcpy(sp
, re
, sizeof(*sp
) * i
);
807 for (cp
= col_syms
+ 1; cp
->start
; cp
++)
809 memcpy(sp
, cp
->start
, sizeof(*sp
) * cp
->len
);
815 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
818 memcpy(&subctx
, ctx
, sizeof(subctx
));
820 subctx
.len
= sp
- str
;
821 subctx
.nofirstsub
= 1;
822 subctx
.cflags
|= REG_EXTENDED
; /* Force extended mode for parsing */
823 status
= tre_parse(&subctx
);
825 if (status
!= REG_OK
)
830 ctx
->re
= col_syms
->start
;
831 ctx
->position
= subctx
.position
;
833 *result
= subctx
.result
;
834 DPRINT(("tre_parse_bracket: Returning to original string\n"));
838 DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
839 node
= tre_ast_new_literal(ctx
->mem
, 0, TRE_CHAR_MAX
, ctx
->position
);
843 goto parse_bracket_done
;
847 tre_literal_t
*l
= node
->obj
;
848 l
->u
.bracket_match_list
= tre_mem_alloc(ctx
->mem
,
849 SIZEOF_BRACKET_MATCH_LIST(items
));
850 if (l
->u
.bracket_match_list
== NULL
)
853 goto parse_bracket_done
;
855 memcpy(l
->u
.bracket_match_list
, items
, SIZEOF_BRACKET_MATCH_LIST(items
));
861 tre_bracket_match_t
*b
;
862 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
863 items
->num_bracket_matches
, items
->flags
));
864 for (i
= 0, b
= items
->bracket_matches
;
865 i
< items
->num_bracket_matches
; i
++, b
++)
867 DPRINT((" %d: %s %d\n", i
, bracket_match_type_str
[b
->type
],
871 #endif /* TRE_DEBUG */
881 /* Parses a positive decimal integer. Returns -1 if the string does not
882 contain a valid number. */
884 tre_parse_int(const tre_char_t
**regex
, const tre_char_t
*regex_end
)
887 const tre_char_t
*r
= *regex
;
888 while (r
< regex_end
&& *r
>= L
'0' && *r
<= L
'9')
892 num
= num
* 10 + *r
- L
'0';
901 tre_parse_bound(tre_parse_ctx_t
*ctx
, tre_ast_node_t
**result
)
906 int cost_ins
, cost_del
, cost_subst
, cost_max
;
907 int limit_ins
, limit_del
, limit_subst
, limit_err
;
908 const tre_char_t
*start
;
909 #endif /* TRE_APPROX */
910 const tre_char_t
*r
= ctx
->re
;
911 int minimal
= (ctx
->cflags
& REG_UNGREEDY
) ? 1 : 0;
917 cost_ins
= cost_del
= cost_subst
= cost_max
= TRE_PARAM_UNSET
;
918 limit_ins
= limit_del
= limit_subst
= limit_err
= TRE_PARAM_UNSET
;
919 #endif /* TRE_APPROX */
921 /* Parse number (minimum repetition count). */
923 if (r
>= ctx
->re_end
)
924 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
925 return (ctx
->cflags
& REG_EXTENDED
) ? REG_NOMATCH
: REG_EBRACE
;
926 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
928 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929 if (*r
>= L
'0' && *r
<= L
'9') {
930 DPRINT(("tre_parse: min count: '%.*" STRF
"'\n", REST(r
)));
931 min
= tre_parse_int(&r
, ctx
->re_end
);
935 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
936 /* For ERE, return REG_NOMATCH to signal that the lbrace should
937 be treated as a literal */
938 return (ctx
->cflags
& REG_EXTENDED
) ? REG_NOMATCH
: REG_BADBR
;
939 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
941 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942 #endif /* !TRE_APPROX */
944 /* Parse comma and second number (maximum repetition count). */
946 if (r
< ctx
->re_end
&& *r
== CHAR_COMMA
)
949 DPRINT(("tre_parse: max count: '%.*" STRF
"'\n", REST(r
)));
950 max
= tre_parse_int(&r
, ctx
->re_end
);
953 /* Check that the repeat counts are sane. */
954 if ((max
>= 0 && min
> max
) || min
> RE_DUP_MAX
|| max
> RE_DUP_MAX
)
961 optionally followed immediately by a number == minimum repcount
962 optionally followed by , then a number == maximum repcount
963 + then a number == maximum insertion count
964 - then a number == maximum deletion count
965 # then a number == maximum substitution count
966 ~ then a number == maximum number of errors
967 Any of +, -, # or ~ without followed by a number means that
968 the maximum count/number of errors is infinite.
970 An equation of the form
972 can be specified to set costs and the cost limit to a value
973 different from the default value:
974 - X is the cost of an insertion
975 - Y is the cost of a deletion
976 - Z is the cost of a substitution
977 - C is the maximum cost
979 If no count limit or cost is set for an operation, the operation
980 is not allowed at all.
988 /* Parse count limit settings */
991 while (r
+ 1 < ctx
->re_end
&& !done
)
995 case CHAR_PLUS
: /* Insert limit */
996 DPRINT(("tre_parse: ins limit: '%.*" STRF
"'\n", REST(r
)));
998 limit_ins
= tre_parse_int(&r
, ctx
->re_end
);
1000 limit_ins
= INT_MAX
;
1003 case CHAR_MINUS
: /* Delete limit */
1004 DPRINT(("tre_parse: del limit: '%.*" STRF
"'\n", REST(r
)));
1006 limit_del
= tre_parse_int(&r
, ctx
->re_end
);
1008 limit_del
= INT_MAX
;
1011 case CHAR_HASH
: /* Substitute limit */
1012 DPRINT(("tre_parse: subst limit: '%.*" STRF
"'\n", REST(r
)));
1014 limit_subst
= tre_parse_int(&r
, ctx
->re_end
);
1015 if (limit_subst
< 0)
1016 limit_subst
= INT_MAX
;
1019 case CHAR_TILDE
: /* Maximum number of changes */
1020 DPRINT(("tre_parse: count limit: '%.*" STRF
"'\n", REST(r
)));
1022 limit_err
= tre_parse_int(&r
, ctx
->re_end
);
1024 limit_err
= INT_MAX
;
1042 /* Parse cost restriction equation. */
1045 while (r
+ 1 < ctx
->re_end
&& !done
)
1054 DPRINT(("tre_parse: max cost: '%.*" STRF
"'\n", REST(r
)));
1058 cost_max
= tre_parse_int(&r
, ctx
->re_end
);
1070 if (*r
>= L
'0' && *r
<= L
'9')
1073 const tre_char_t
*sr
= r
;
1074 #endif /* TRE_DEBUG */
1075 int cost
= tre_parse_int(&r
, ctx
->re_end
);
1076 /* XXX - make sure r is not past end. */
1079 case L
'i': /* Insert cost */
1080 DPRINT(("tre_parse: ins cost: '%.*" STRF
"'\n",
1086 case L
'd': /* Delete cost */
1087 DPRINT(("tre_parse: del cost: '%.*" STRF
"'\n",
1093 case L
's': /* Substitute cost */
1094 DPRINT(("tre_parse: subst cost: '%.*" STRF
"'\n",
1111 } while (start
!= r
);
1112 #endif /* TRE_APPROX */
1114 /*{*//* Missing }. */
1115 if (r
>= ctx
->re_end
)
1118 /* Empty contents of {}. */
1122 /* Parse the ending '}' or '\}'.*/
1123 if (ctx
->cflags
& REG_EXTENDED
)
1125 if (r
>= ctx
->re_end
|| *r
!= CHAR_RBRACE
)
1128 /* Parse trailing '?' marking minimal repetition. */
1129 if (r
< ctx
->re_end
)
1131 if (*r
== CHAR_QUESTIONMARK
)
1133 /* Process the question mark only in enhanced mode.
1134 Otherwise, the question mark is an error in ERE
1135 or a literal in BRE */
1136 if (ctx
->cflags
& REG_ENHANCED
)
1138 minimal
= !(ctx
->cflags
& REG_UNGREEDY
);
1141 else return REG_BADRPT
;
1143 else if (*r
== CHAR_STAR
|| *r
== CHAR_PLUS
)
1145 /* These are reserved for future extensions. */
1152 if (r
+ 1 >= ctx
->re_end
1153 || *r
!= CHAR_BACKSLASH
1154 || *(r
+ 1) != CHAR_RBRACE
)
1157 if (r
< ctx
->re_end
&& *r
== CHAR_STAR
)
1159 /* This is reserved for future extensions. */
1165 ctx
->num_reorder_tags
++;
1167 if (!result
) goto parse_bound_exit
;
1168 /* Create the AST node(s). */
1169 /* Originally, if min == 0 && max == 0, we immediately replace the whole
1170 iteration with EMPTY. This unfortunately drops any submatches, and
1171 messes up setting the pmatch values (we can get tags of -1, and
1172 tag values in the billions). So we leave it and process this case as
1173 usual, and wait until tre_expand_ast() to replace with EMPTY */
1175 if (min
< 0 && max
< 0)
1176 /* Only approximate parameters set, no repetitions. */
1178 #endif /* TRE_APPROX */
1180 *result
= tre_ast_new_iter(ctx
->mem
, *result
, min
, max
, minimal
);
1185 /* If approximate matching parameters are set, add them to the
1187 if (approx
|| costs_set
|| counts_set
)
1190 tre_iteration_t
*iter
= (*result
)->obj
;
1192 if (costs_set
|| counts_set
)
1194 if (limit_ins
== TRE_PARAM_UNSET
)
1196 if (cost_ins
== TRE_PARAM_UNSET
)
1199 limit_ins
= INT_MAX
;
1202 if (limit_del
== TRE_PARAM_UNSET
)
1204 if (cost_del
== TRE_PARAM_UNSET
)
1207 limit_del
= INT_MAX
;
1210 if (limit_subst
== TRE_PARAM_UNSET
)
1212 if (cost_subst
== TRE_PARAM_UNSET
)
1215 limit_subst
= INT_MAX
;
1219 if (cost_max
== TRE_PARAM_UNSET
)
1221 if (limit_err
== TRE_PARAM_UNSET
)
1222 limit_err
= INT_MAX
;
1224 ctx
->have_approx
= 1;
1225 params
= tre_mem_alloc(ctx
->mem
, sizeof(*params
) * TRE_PARAM_LAST
);
1228 for (i
= 0; i
< TRE_PARAM_LAST
; i
++)
1229 params
[i
] = TRE_PARAM_UNSET
;
1230 params
[TRE_PARAM_COST_INS
] = cost_ins
;
1231 params
[TRE_PARAM_COST_DEL
] = cost_del
;
1232 params
[TRE_PARAM_COST_SUBST
] = cost_subst
;
1233 params
[TRE_PARAM_COST_MAX
] = cost_max
;
1234 params
[TRE_PARAM_MAX_INS
] = limit_ins
;
1235 params
[TRE_PARAM_MAX_DEL
] = limit_del
;
1236 params
[TRE_PARAM_MAX_SUBST
] = limit_subst
;
1237 params
[TRE_PARAM_MAX_ERR
] = limit_err
;
1238 iter
->params
= params
;
1240 #endif /* TRE_APPROX */
1244 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1245 "limits [%d,%d,%d, total %d]\n",
1246 min
, max
, cost_ins
, cost_del
, cost_subst
, cost_max
,
1247 limit_ins
, limit_del
, limit_subst
, limit_err
));
1248 #else /* !TRE_APPROX */
1249 DPRINT(("tre_parse_bound: min %d, max %d\n", min
, max
));
1250 #endif /* !TRE_APPROX */
1257 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1258 non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1259 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1260 Because we now set up tags for even non-capturing parenthesized
1261 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1262 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1263 have it restore cflags after the subexpression, we don't need to have
1264 a separate PARSE_RESTORE_CFLAGS, and then after processing the
1265 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1266 This has the side-benefit of now matching the perl behavior: the RE
1267 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1268 of foo AND (?i) (bar OR zap). */
1272 PARSE_MARK_FOR_SUBMATCH
,
1276 PARSE_POST_CATENATION
,
1280 } tre_parse_re_stack_symbol_t
;
1284 tre_parse(tre_parse_ctx_t
*ctx
)
1286 tre_ast_node_t
*result
= NULL
;
1287 tre_parse_re_stack_symbol_t symbol
;
1288 reg_errcode_t status
= REG_OK
;
1289 tre_stack_t
*stack
= ctx
->stack
;
1290 int bottom
= tre_stack_num_objects(stack
);
1292 int temporary_cflags
= 0;
1293 int bre_branch_begin
;
1295 const tre_char_t
*tmp_re
;
1298 DPRINT(("tre_parse: parsing '%.*" STRF
"', len = %d cflags = 0%o\n",
1299 ctx
->len
, ctx
->re
, ctx
->len
, ctx
->cflags
));
1301 if (ctx
->len
<= 0) return REG_EMPTY
;
1302 if (!ctx
->nofirstsub
)
1304 STACK_PUSH(stack
, int, ctx
->cflags
);
1305 STACK_PUSH(stack
, int, ctx
->submatch_id
);
1306 STACK_PUSH(stack
, int, PARSE_MARK_FOR_SUBMATCH
);
1309 STACK_PUSH(stack
, int, 0); // bre_branch_begin
1310 STACK_PUSH(stack
, int, PARSE_RE
);
1311 ctx
->re_start
= ctx
->re
;
1312 ctx
->re_end
= ctx
->re
+ ctx
->len
;
1315 /* The following is basically just a recursive descent parser. I use
1316 an explicit stack instead of recursive functions mostly because of
1317 two reasons: compatibility with systems which have an overflowable
1318 call stack, and efficiency (both in lines of code and speed). */
1319 while (tre_stack_num_objects(stack
) > bottom
)
1321 symbol
= tre_stack_pop_int(stack
);
1325 /* Parse a full regexp. A regexp is one or more branches,
1326 separated by the union operator `|'. */
1327 bre_branch_begin
= tre_stack_pop_int(stack
);
1330 !(ctx
->cflags
& REG_LITERAL
) &&
1331 #endif /* REG_LITERAL */
1332 ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
))
1333 STACK_PUSHX(stack
, int, PARSE_UNION
);
1334 STACK_PUSHX(stack
, int, bre_branch_begin
);
1335 STACK_PUSHX(stack
, int, PARSE_BRANCH
);
1339 /* Parse a branch. A branch is one or more pieces, concatenated.
1340 A piece is an atom possibly followed by a postfix operator. */
1341 bre_branch_begin
= tre_stack_pop_int(stack
);
1342 STACK_PUSHX(stack
, int, PARSE_CATENATION
);
1343 STACK_PUSHX(stack
, int, bre_branch_begin
);
1344 STACK_PUSHX(stack
, int, PARSE_PIECE
);
1348 /* Parse a piece. A piece is an atom possibly followed by one
1349 or more postfix operators. */
1350 bre_branch_begin
= tre_stack_pop_int(stack
);
1351 STACK_PUSHX(stack
, int, PARSE_POSTFIX
);
1352 STACK_PUSHX(stack
, int, bre_branch_begin
);
1353 STACK_PUSHX(stack
, int, PARSE_ATOM
);
1356 case PARSE_CATENATION
:
1357 /* If the expression has not ended, parse another piece. */
1360 if (ctx
->re
>= ctx
->re_end
)
1364 if (!(ctx
->cflags
& REG_LITERAL
))
1366 #endif /* REG_LITERAL */
1367 if ((ctx
->cflags
& REG_EXTENDED
&& c
== CHAR_PIPE
) ||
1368 ((ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)) == REG_ENHANCED
1369 && ctx
->re
+ 1 < ctx
->re_end
&& c
== CHAR_BACKSLASH
&&
1370 *(ctx
->re
+ 1) == CHAR_PIPE
))
1372 if ((ctx
->cflags
& REG_EXTENDED
1373 && c
== CHAR_RPAREN
&& depth
> 0)
1374 || (!(ctx
->cflags
& REG_EXTENDED
)
1375 && ctx
->re
+ 1 < ctx
->re_end
&& c
== CHAR_BACKSLASH
1376 && *(ctx
->re
+ 1) == CHAR_RPAREN
))
1378 if (!(ctx
->cflags
& REG_EXTENDED
) && depth
== 0)
1380 DPRINT(("tre_parse: group end: '%.*" STRF
"'\n",
1383 if (!(ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)))
1389 #endif /* REG_LITERAL */
1391 #ifdef REG_LEFT_ASSOC
1392 if (ctx
->cflags
& REG_LEFT_ASSOC
)
1394 /* Left associative concatenation. */
1395 STACK_PUSHX(stack
, int, PARSE_CATENATION
);
1396 STACK_PUSHX(stack
, voidptr
, result
);
1397 STACK_PUSHX(stack
, int, PARSE_POST_CATENATION
);
1398 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1399 STACK_PUSHX(stack
, int, PARSE_PIECE
);
1402 #endif /* REG_LEFT_ASSOC */
1404 /* Default case, right associative concatenation. */
1405 STACK_PUSHX(stack
, voidptr
, result
);
1406 STACK_PUSHX(stack
, int, PARSE_POST_CATENATION
);
1407 STACK_PUSHX(stack
, int, PARSE_CATENATION
);
1408 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1409 STACK_PUSHX(stack
, int, PARSE_PIECE
);
1414 case PARSE_POST_CATENATION
:
1416 tre_ast_node_t
*tree
= tre_stack_pop_voidptr(stack
);
1417 tre_ast_node_t
*tmp_node
;
1418 tmp_node
= tre_ast_new_catenation(ctx
->mem
, tree
, result
);
1426 if (ctx
->re
>= ctx
->re_end
)
1429 if (ctx
->cflags
& REG_LITERAL
)
1431 #endif /* REG_LITERAL */
1432 if (!(ctx
->cflags
& REG_EXTENDED
))
1434 if (*ctx
->re
!= CHAR_BACKSLASH
|| ctx
->re
+ 1 >= ctx
->re_end
)
1441 DPRINT(("tre_parse: union: '%.*" STRF
"'\n",
1443 STACK_PUSHX(stack
, int, PARSE_UNION
);
1444 STACK_PUSHX(stack
, voidptr
, (void *)ctx
->re
);
1445 STACK_PUSHX(stack
, voidptr
, result
);
1446 STACK_PUSHX(stack
, int, PARSE_POST_UNION
);
1447 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1448 indicate if this is the beginning of a BRE extended branch. */
1449 STACK_PUSHX(stack
, int, (ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)) == REG_ENHANCED
); // bre_branch_begin
1450 STACK_PUSHX(stack
, int, PARSE_BRANCH
);
1459 if (!(ctx
->cflags
& REG_EXTENDED
))
1465 case PARSE_POST_UNION
:
1467 tre_ast_node_t
*tmp_node
;
1468 tre_ast_node_t
*tree
= tre_stack_pop_voidptr(stack
);
1469 const tre_char_t
*pipechar
= tre_stack_pop_voidptr(stack
);
1470 /* error on empty expression at end of union */
1471 if (pipechar
== ctx
->re
- 1)
1475 tmp_node
= tre_ast_new_union(ctx
->mem
, tree
, result
);
1483 /* Parse postfix operators. */
1484 if (ctx
->re
>= ctx
->re_end
)
1487 if (ctx
->cflags
& REG_LITERAL
)
1489 #endif /* REG_LITERAL */
1490 int minimal
= (ctx
->cflags
& REG_UNGREEDY
) ? 1 : 0;
1499 case CHAR_QUESTIONMARK
:
1500 if (!(ctx
->cflags
& REG_EXTENDED
))
1505 tre_ast_node_t
*tmp_node
;
1507 const char *tstr
= "star";
1511 handle_plus_or_question
:
1512 /* error on iteration of raw assertion (not in subexpression) */
1513 if (result
->type
== LITERAL
&& result
->submatch_id
< 0 &&
1514 IS_ASSERTION((tre_literal_t
*)result
->obj
))
1516 if (!(ctx
->cflags
& REG_EXTENDED
)) break;
1519 if (*ctx
->re
== CHAR_PLUS
)
1526 if (*ctx
->re
== CHAR_QUESTIONMARK
)
1530 tstr
= "questionmark";
1534 if (ctx
->cflags
& REG_EXTENDED
)
1536 if (ctx
->re
+ 1 < ctx
->re_end
)
1538 if (*(ctx
->re
+ 1) == CHAR_QUESTIONMARK
)
1540 /* Process the question mark only in enhanced mode.
1541 Otherwise, the question mark is an error in ERE */
1542 if (ctx
->cflags
& REG_ENHANCED
)
1544 minimal
= !(ctx
->cflags
& REG_UNGREEDY
);
1547 else return REG_BADRPT
;
1549 else if (*(ctx
->re
+ 1) == CHAR_STAR
1550 || *(ctx
->re
+ 1) == CHAR_PLUS
)
1552 /* These are reserved for future extensions. */
1559 if (ctx
->re
+ 1 < ctx
->re_end
&& *(ctx
->re
+ 1) == CHAR_STAR
)
1561 /* This is reserved for future extensions. */
1564 if (ctx
->re
+ 2 < ctx
->re_end
)
1566 if (*(ctx
->re
+ 1) == CHAR_BACKSLASH
&& *(ctx
->re
+ 2) == CHAR_QUESTIONMARK
)
1568 /* Process the question mark only in enhanced mode.
1569 Otherwise, the question mark is a literal in BRE */
1570 if (ctx
->cflags
& REG_ENHANCED
)
1572 minimal
= !(ctx
->cflags
& REG_UNGREEDY
);
1576 else if (*(ctx
->re
+ 1) == CHAR_BACKSLASH
&& *(ctx
->re
+ 2) == CHAR_PLUS
)
1578 /* This is reserved for future extensions. */
1585 ctx
->num_reorder_tags
++;
1587 DPRINT(("tre_parse: %s %s: '%.*" STRF
"'\n",
1588 minimal
? " minimal" : "greedy", tstr
, REST(tmp_re
)));
1591 if (ctx
->cflags
& REG_EXTENDED
) return REG_BADRPT
;
1592 else goto parse_literal
;
1595 tmp_node
= tre_ast_new_iter(ctx
->mem
, result
, rep_min
, rep_max
,
1597 if (tmp_node
== NULL
)
1601 /* Set the iterator with a submatch id in the invisible range
1602 * (which will be overridden if a real submatch is needed) */
1603 result
->submatch_id
= ctx
->submatch_id_invisible
++;
1606 /* We don't allow multiple postfixes, but this might be needed
1607 to support approximate matching */
1608 STACK_PUSHX(stack
, int, PARSE_POSTFIX
);
1613 case CHAR_BACKSLASH
:
1614 /* "\{" is special without REG_EXTENDED */
1615 /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1616 if (!(ctx
->cflags
& REG_EXTENDED
)
1617 && ctx
->re
+ 1 < ctx
->re_end
)
1619 switch (*(ctx
->re
+ 1))
1628 case CHAR_QUESTIONMARK
:
1629 if (ctx
->cflags
& REG_ENHANCED
)
1635 goto handle_plus_or_question
;
1648 /* "{" is literal without REG_EXTENDED */
1649 if (!(ctx
->cflags
& REG_EXTENDED
))
1656 /* error on iteration of raw assertion (not in subexpression),
1657 but wait until after parsing bounds */
1658 raw_assertion
= (result
->type
== LITERAL
1659 && result
->submatch_id
< 0
1660 && IS_ASSERTION((tre_literal_t
*)result
->obj
));
1663 status
= tre_parse_bound(ctx
, &result
);
1664 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1665 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1666 is to be treated as a literal. */
1667 if (status
== REG_NOMATCH
)
1672 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1673 DPRINT(("tre_parse: bound: '%.*" STRF
"'\n",
1674 REST(ctx
->re
- lbrace_off
)));
1675 if (status
!= REG_OK
)
1677 if (raw_assertion
) return REG_BADRPT
;
1679 /* Set the iterator with a submatch id in the invisible range
1680 * (which will be overridden if a real submatch is needed) */
1681 if (result
->type
== ITERATION
)
1682 result
->submatch_id
= ctx
->submatch_id_invisible
++;
1685 /* We don't allow multiple postfixes, but this might be needed
1686 to support approximate matching */
1687 STACK_PUSHX(stack
, int, PARSE_POSTFIX
);
1696 /* Parse an atom. An atom is a regular expression enclosed in `()',
1697 an empty set of `()', a bracket expression, `.', `^', `$',
1698 a `\' followed by a character, or a single character. */
1700 /* The stack contains a boolean value, whether PARSE_ATOM is
1701 being called just after the start of a group (left paren)
1703 bre_branch_begin
= tre_stack_pop_int(stack
);
1705 /* End of regexp? (empty string). */
1706 if (ctx
->re
>= ctx
->re_end
)
1710 if (ctx
->cflags
& REG_LITERAL
)
1712 #endif /* REG_LITERAL */
1716 case CHAR_LPAREN
: /* parenthesized subexpression */
1718 /* Handle "(?...)" extensions. They work in a way similar
1719 to Perls corresponding extensions. */
1720 if ((ctx
->cflags
& (REG_EXTENDED
|REG_ENHANCED
)) ==
1721 (REG_EXTENDED
|REG_ENHANCED
)
1722 && *(ctx
->re
+ 1) == CHAR_QUESTIONMARK
)
1724 int new_cflags
= ctx
->cflags
;
1726 int invisible_submatch
= 0;
1727 DPRINT(("tre_parse: extension: '%.*" STRF
"'\n",
1730 while (/*CONSTCOND*/1)
1732 if (*ctx
->re
== L
'i')
1734 DPRINT(("tre_parse: icase: '%.*" STRF
"'\n",
1737 new_cflags
|= REG_ICASE
;
1739 new_cflags
&= ~REG_ICASE
;
1742 else if (*ctx
->re
== L
'n')
1744 DPRINT(("tre_parse: newline: '%.*" STRF
"'\n",
1747 new_cflags
|= REG_NEWLINE
;
1749 new_cflags
&= ~REG_NEWLINE
;
1752 #ifdef REG_LEFT_ASSOC
1753 else if (*ctx
->re
== L
'l')
1755 DPRINT(("tre_parse: left assoc: '%.*" STRF
"'\n",
1758 new_cflags
|= REG_LEFT_ASSOC
;
1760 new_cflags
&= ~REG_LEFT_ASSOC
;
1763 #endif /* REG_LEFT_ASSOC */
1765 else if (*ctx
->re
== L
'U')
1767 DPRINT(("tre_parse: ungreedy: '%.*" STRF
"'\n",
1770 new_cflags
|= REG_UNGREEDY
;
1772 new_cflags
&= ~REG_UNGREEDY
;
1775 #endif /* REG_UNGREEDY */
1776 else if (*ctx
->re
== CHAR_MINUS
)
1778 DPRINT(("tre_parse: turn off: '%.*" STRF
"'\n",
1783 else if (*ctx
->re
== CHAR_COLON
)
1785 DPRINT(("tre_parse: no group: '%.*" STRF
1786 "', (invisible submatch %d)\n",
1787 REST(ctx
->re
), ctx
->submatch_id_invisible
));
1790 invisible_submatch
= 1;
1793 else if (*ctx
->re
== CHAR_HASH
)
1795 DPRINT(("tre_parse: comment: '%.*" STRF
"'\n",
1797 /* A comment can contain any character except a
1798 right parenthesis */
1799 while (*ctx
->re
!= CHAR_RPAREN
1800 && ctx
->re
< ctx
->re_end
)
1802 if (*ctx
->re
== CHAR_RPAREN
&& ctx
->re
< ctx
->re_end
)
1810 else if (*ctx
->re
== CHAR_RPAREN
)
1819 /* Turn on the cflags changes for the rest of the
1821 if (invisible_submatch
)
1823 STACK_PUSHX(stack
, int, ctx
->cflags
);
1824 STACK_PUSHX(stack
, int, ctx
->submatch_id_invisible
);
1825 STACK_PUSHX(stack
, int, PARSE_MARK_FOR_SUBMATCH
);
1826 ctx
->submatch_id_invisible
++;
1827 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1828 STACK_PUSHX(stack
, int, PARSE_RE
);
1831 STACK_PUSHX(stack
, int, 0); // bre_branch_begin
1832 STACK_PUSHX(stack
, int, PARSE_ATOM
);
1834 ctx
->cflags
= new_cflags
;
1838 if (ctx
->cflags
& REG_EXTENDED
)
1841 DPRINT(("tre_parse: group begin: '%.*" STRF
1842 "', submatch %d\n", REST(ctx
->re
),
1845 /* First parse a whole RE, then mark the resulting tree
1847 STACK_PUSHX(stack
, int, ctx
->cflags
);
1848 STACK_PUSHX(stack
, int, ctx
->submatch_id
);
1849 STACK_PUSHX(stack
, int, PARSE_MARK_FOR_SUBMATCH
);
1850 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1851 indicate if this is the beginning of a BRE group. */
1852 STACK_PUSHX(stack
, int, !(ctx
->cflags
& REG_EXTENDED
));
1853 STACK_PUSHX(stack
, int, PARSE_RE
);
1861 case CHAR_RPAREN
: /* end of current subexpression */
1862 if (ctx
->cflags
& REG_EXTENDED
&& depth
> 0)
1864 parse_bre_rparen_empty
:
1865 if (!(ctx
->cflags
& REG_EXTENDED
) && depth
== 0)
1867 DPRINT(("tre_parse: empty: '%.*" STRF
"'\n",
1869 /* We were expecting an atom, but instead the current
1870 subexpression was closed. POSIX leaves the meaning of
1871 this to be implementation-defined. We interpret this as
1872 an empty expression (which matches an empty string). */
1873 result
= tre_ast_new_literal(ctx
->mem
, EMPTY
, -1, -1);
1876 if (!(ctx
->cflags
& REG_EXTENDED
))
1883 case CHAR_LBRACKET
: /* bracket expression */
1884 DPRINT(("tre_parse: bracket: '%.*" STRF
"'\n",
1887 status
= tre_parse_bracket(ctx
, &result
);
1888 if (status
!= REG_OK
)
1892 case CHAR_BACKSLASH
:
1893 /* Deal with "\(", "\)" or "\{" for BREs */
1894 if (!(ctx
->cflags
& REG_EXTENDED
)
1895 && ctx
->re
+ 1 < ctx
->re_end
)
1897 if (*(ctx
->re
+ 1) == CHAR_LPAREN
)
1900 goto parse_bre_lparen
;
1902 else if (*(ctx
->re
+ 1) == CHAR_RPAREN
)
1905 goto parse_bre_rparen_empty
;
1907 if (*(ctx
->re
+ 1) == CHAR_LBRACE
) goto parse_literal
;
1910 if (ctx
->re
+ 1 >= ctx
->re_end
)
1911 /* Trailing backslash. */
1914 if (!(ctx
->cflags
& REG_ENHANCED
))
1916 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF
"'\n", REST(ctx
->re
)));
1918 goto unenhanced_backslash
;
1921 /* If a macro is used, parse the expanded macro recursively. */
1924 tre_expand_macro(ctx
->re
+ 1, ctx
->re_end
,
1925 buf
, elementsof(buf
));
1928 tre_parse_ctx_t subctx
;
1929 memcpy(&subctx
, ctx
, sizeof(subctx
));
1931 subctx
.len
= tre_strlen(buf
);
1932 subctx
.nofirstsub
= 1;
1933 status
= tre_parse(&subctx
);
1934 if (status
!= REG_OK
)
1937 ctx
->position
= subctx
.position
;
1938 result
= subctx
.result
;
1944 if (*(ctx
->re
+ 1) == L
'Q')
1946 DPRINT(("tre_parse: tmp literal: '%.*" STRF
"'\n",
1948 ctx
->cflags
|= REG_LITERAL
;
1949 temporary_cflags
|= REG_LITERAL
;
1951 STACK_PUSHX(stack
, int, 0);
1952 STACK_PUSHX(stack
, int, PARSE_ATOM
);
1955 #endif /* REG_LITERAL */
1957 DPRINT(("tre_parse: bleep: '%.*" STRF
"'\n", REST(ctx
->re
)));
1962 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1967 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1968 ASSERT_AT_WB_NEG
, -1);
1972 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1977 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
1983 if (ctx
->re
[0] != CHAR_LBRACE
&& ctx
->re
< ctx
->re_end
)
1985 /* 8 bit hex char. */
1986 char tmp
[3] = {0, 0, 0};
1988 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF
"'\n",
1989 REST(ctx
->re
- 2)));
1991 if (tre_isxdigit_l(ctx
->re
[0], ctx
->loc
) &&
1992 ctx
->re
< ctx
->re_end
)
1994 tmp
[0] = (char)ctx
->re
[0];
1997 if (tre_isxdigit_l(ctx
->re
[0], ctx
->loc
) &&
1998 ctx
->re
< ctx
->re_end
)
2000 tmp
[1] = (char)ctx
->re
[0];
2003 val
= strtol(tmp
, NULL
, 16);
2004 result
= tre_ast_new_literal(ctx
->mem
, (int)val
,
2005 (int)val
, ctx
->position
);
2009 else if (ctx
->re
< ctx
->re_end
)
2016 while (ctx
->re_end
- ctx
->re
>= 0)
2018 if (ctx
->re
[0] == CHAR_RBRACE
)
2020 if (tre_isxdigit_l(ctx
->re
[0], ctx
->loc
))
2022 tmp
[i
] = (char)ctx
->re
[0];
2031 val
= strtol(tmp
, NULL
, 16);
2032 result
= tre_ast_new_literal(ctx
->mem
, (int)val
, (int)val
,
2040 unenhanced_backslash
:
2041 if ((ctx
->cflags
& (REG_EXTENDED
| REG_ENHANCED
)) !=
2043 tre_isdigit_l(*ctx
->re
, ctx
->loc
) && *ctx
->re
!= L
'0')
2045 /* Back reference (only in BRE or enhanced). */
2046 int val
= *ctx
->re
- L
'0';
2047 DPRINT(("tre_parse: backref: '%.*" STRF
"'\n",
2048 REST(ctx
->re
- 1)));
2049 result
= tre_ast_new_literal(ctx
->mem
, BACKREF
, val
,
2054 /* Set the backref with a submatch id in the invisible
2055 * range (which will be overridden if a real submatch
2057 result
->submatch_id
= ctx
->submatch_id_invisible
++;
2060 ctx
->num_reorder_tags
++;
2061 ctx
->max_backref
= MAX(val
, ctx
->max_backref
);
2066 /* Escaped character. */
2067 DPRINT(("tre_parse: escaped: '%.*" STRF
"'\n",
2068 REST(ctx
->re
- 1)));
2069 result
= tre_ast_new_literal(ctx
->mem
, *ctx
->re
, *ctx
->re
,
2080 case CHAR_PERIOD
: /* the any-symbol */
2081 DPRINT(("tre_parse: any: '%.*" STRF
"'\n",
2083 if (ctx
->cflags
& REG_NEWLINE
)
2085 tre_ast_node_t
*tmp1
;
2086 tre_ast_node_t
*tmp2
;
2087 tmp1
= tre_ast_new_literal(ctx
->mem
, 0, L
'\n' - 1,
2091 tmp2
= tre_ast_new_literal(ctx
->mem
, L
'\n' + 1, TRE_CHAR_MAX
,
2095 result
= tre_ast_new_union(ctx
->mem
, tmp1
, tmp2
);
2102 result
= tre_ast_new_literal(ctx
->mem
, 0, TRE_CHAR_MAX
,
2111 case CHAR_CARET
: /* beginning of line assertion */
2112 /* '^' has a special meaning everywhere in EREs, at the
2113 beginning of the RE and after \( is BREs. It is also
2114 special in enhanced BREs at the beginning of each branches
2116 if (ctx
->cflags
& REG_EXTENDED
2118 || ctx
->re
== ctx
->re_start
)
2120 DPRINT(("tre_parse: BOL: '%.*" STRF
"'\n",
2122 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
2132 case CHAR_DOLLAR
: /* end of line assertion. */
2133 /* '$' is special everywhere in EREs, and in the end of the
2134 string and before \) is BREs. */
2135 if (ctx
->cflags
& REG_EXTENDED
2136 || (ctx
->re
+ 2 < ctx
->re_end
2137 && *(ctx
->re
+ 1) == CHAR_BACKSLASH
2138 && *(ctx
->re
+ 2) == CHAR_RPAREN
)
2139 || ctx
->re
+ 1 == ctx
->re_end
)
2141 DPRINT(("tre_parse: EOL: '%.*" STRF
"'\n",
2143 result
= tre_ast_new_literal(ctx
->mem
, ASSERTION
,
2156 if (temporary_cflags
&& ctx
->re
+ 1 < ctx
->re_end
2157 && *ctx
->re
== CHAR_BACKSLASH
&& *(ctx
->re
+ 1) == L
'E')
2159 DPRINT(("tre_parse: end tmps: '%.*" STRF
"'\n",
2161 ctx
->cflags
&= ~temporary_cflags
;
2162 temporary_cflags
= 0;
2164 if (ctx
->re
< ctx
->re_end
)
2166 STACK_PUSHX(stack
, int, 0);
2167 STACK_PUSHX(stack
, int, PARSE_ATOM
);
2171 result
= tre_ast_new_literal(ctx
->mem
, EMPTY
, -1, -1);
2172 if (!result
) return REG_ESPACE
;
2178 /* We are expecting an atom. If the subexpression (or the whole
2179 regexp ends here, we interpret it as an empty expression
2180 (which matches an empty string), which is an error.
2181 Iterations of an empty expression is also an error. */
2183 if (!(ctx
->cflags
& REG_LITERAL
))
2185 #endif /* REG_LITERAL */
2186 /* error on end of string */
2187 if (ctx
->re
>= ctx
->re_end
) return depth
> 0 ? REG_EPAREN
2189 /* error on unions and iterations of empty expressions */
2190 if (ctx
->cflags
& REG_EXTENDED
)
2192 if (ctx
->re
< ctx
->re_end
)
2194 if (*ctx
->re
== CHAR_PIPE
) return REG_EMPTY
;
2195 if (*ctx
->re
== CHAR_LBRACE
)
2199 /* We need to parse the bound first and return
2200 any error, before returning REG_BADRPT */
2201 status
= tre_parse_bound(ctx
, NULL
);
2202 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2203 /* For ERE, if REG_NOMATCH is returned, we
2204 treat the lbrace as a literal. */
2205 if (status
== REG_NOMATCH
)
2208 /* Drop down to literal-handling code */
2212 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2213 if (status
!= REG_OK
)
2216 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2218 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2220 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2222 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2223 if (*ctx
->re
== CHAR_STAR
2224 || *ctx
->re
== CHAR_PLUS
2225 || *ctx
->re
== CHAR_QUESTIONMARK
)
2231 else if (ctx
->re
+ 1 < ctx
->re_end
2232 && *ctx
->re
== CHAR_BACKSLASH
2233 && *(ctx
->re
+ 1) == CHAR_LBRACE
)
2236 goto empty_parse_bound
;
2240 #endif /* REG_LITERAL */
2242 DPRINT(("tre_parse: literal: '%.*" STRF
"'\n",
2244 /* Note that we can't use an tre_isalpha() test here, since there
2245 may be characters which are alphabetic but neither upper or
2247 if (ctx
->cflags
& REG_ICASE
2248 && (tre_isupper_l(*ctx
->re
, ctx
->loc
) ||
2249 tre_islower_l(*ctx
->re
, ctx
->loc
)))
2251 tre_ast_node_t
*tmp1
;
2252 tre_ast_node_t
*tmp2
;
2254 /* XXX - Can there be more than one opposite-case
2255 counterpoints for some character in some locale? Or
2256 more than two characters which all should be regarded
2257 the same character if case is ignored? If yes, there
2258 does not seem to be a portable way to detect it. I guess
2259 that at least for multi-character collating elements there
2260 could be several opposite-case counterpoints, but they
2261 cannot be supported portably anyway. */
2262 tmp1
= tre_ast_new_literal(ctx
->mem
,
2263 tre_toupper_l(*ctx
->re
, ctx
->loc
),
2264 tre_toupper_l(*ctx
->re
, ctx
->loc
),
2268 tmp2
= tre_ast_new_literal(ctx
->mem
,
2269 tre_tolower_l(*ctx
->re
, ctx
->loc
),
2270 tre_tolower_l(*ctx
->re
, ctx
->loc
),
2274 result
= tre_ast_new_union(ctx
->mem
, tmp1
, tmp2
);
2280 result
= tre_ast_new_literal(ctx
->mem
, *ctx
->re
, *ctx
->re
,
2292 case PARSE_MARK_FOR_SUBMATCH
:
2294 int submatch_id
= tre_stack_pop_int(stack
);
2296 ctx
->cflags
= tre_stack_pop_int(stack
); /* restore cflags */
2297 if (result
->submatch_id
>= 0 &&
2298 result
->submatch_id
< SUBMATCH_ID_INVISIBLE_START
)
2300 tre_ast_node_t
*n
, *tmp_node
;
2301 if (submatch_id
>= SUBMATCH_ID_INVISIBLE_START
)
2303 n
= tre_ast_new_literal(ctx
->mem
, EMPTY
, -1, -1);
2306 tmp_node
= tre_ast_new_catenation(ctx
->mem
, n
, result
);
2307 if (tmp_node
== NULL
)
2309 tmp_node
->num_submatches
= result
->num_submatches
;
2312 result
->submatch_id
= submatch_id
;
2313 if (submatch_id
< SUBMATCH_ID_INVISIBLE_START
)
2314 result
->num_submatches
++;
2324 /* Check for missing closing parentheses. */
2328 ctx
->result
= result
;