sys/vfs/hammer2: Fix multiple "radii" -> "radix"
[dragonfly.git] / contrib / tre / lib / tre-parse.c
blob232bec1e32d14e2875120b117df844c80244b30d
1 /*
2 tre-parse.c - Regexp parser
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
7 */
9 /*
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22 #include <stddef.h>
24 #include "xmalloc.h"
25 #include "tre-mem.h"
26 #include "tre-ast.h"
27 #include "tre-stack.h"
28 #include "tre-parse.h"
30 #include "xlocale_private.h"
31 #include "collate.h"
33 /* BSD compatibility:
34 Before looking up a collating symbol, check if the name matches in
35 the character names (cnames) array; if so, use the corresponding
36 character.
38 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
39 the implementation choice that for ERE, a non-numeric character following
40 a left brace that would normally be a bound, causes the left brace to be
41 literal. */
42 #define BSD_COMPATIBILITY
43 #ifdef BSD_COMPATIBILITY
44 #include "cname.h"
45 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
46 #endif /* BSD_COMPATIBILITY */
48 /* Characters with special meanings in regexp syntax. */
49 #define CHAR_PIPE L'|'
50 #define CHAR_LPAREN L'('
51 #define CHAR_RPAREN L')'
52 #define CHAR_LBRACE L'{'
53 #define CHAR_RBRACE L'}'
54 #define CHAR_LBRACKET L'['
55 #define CHAR_RBRACKET L']'
56 #define CHAR_MINUS L'-'
57 #define CHAR_STAR L'*'
58 #define CHAR_QUESTIONMARK L'?'
59 #define CHAR_PLUS L'+'
60 #define CHAR_PERIOD L'.'
61 #define CHAR_COLON L':'
62 #define CHAR_EQUAL L'='
63 #define CHAR_COMMA L','
64 #define CHAR_CARET L'^'
65 #define CHAR_DOLLAR L'$'
66 #define CHAR_BACKSLASH L'\\'
67 #define CHAR_HASH L'#'
68 #define CHAR_TILDE L'~'
71 /* Some macros for expanding \w, \s, etc. */
72 static const struct tre_macro_struct {
73 const char c;
74 const char *expansion;
75 } tre_macros[] =
76 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
77 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
78 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
79 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
80 { 0, NULL }
84 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
85 must have at least `len' items. Sets buf[0] to zero if the there
86 is no match in `tre_macros'. */
87 static void
88 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
89 tre_char_t *buf, size_t buf_len)
91 int i;
93 buf[0] = 0;
94 if (regex >= regex_end)
95 return;
97 for (i = 0; tre_macros[i].expansion; i++)
99 if (tre_macros[i].c == *regex)
101 unsigned int j;
102 DPRINT(("Expanding macro '%c' => '%s'\n",
103 tre_macros[i].c, tre_macros[i].expansion));
104 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
105 buf[j] = tre_macros[i].expansion[j];
106 buf[j] = 0;
107 break;
112 static reg_errcode_t
113 tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
114 tre_bracket_match_list_t **items)
116 reg_errcode_t status = REG_OK;
117 tre_bracket_match_list_t *array = *items;
118 int i = array->num_bracket_matches;
119 /* Allocate more space if necessary. */
120 if (i >= *max_i)
122 tre_bracket_match_list_t *new_items;
123 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
124 /* If the array is already 1024 items large, give up -- there's
125 probably an error in the regexp (e.g. not a '\0' terminated
126 string and missing ']') */
127 if (*max_i >= 1024)
128 return REG_ESPACE;
129 *max_i *= 2;
130 new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
131 if (new_items == NULL)
132 return REG_ESPACE;
133 *items = array = new_items;
135 array->bracket_matches[i].type = type;
136 array->bracket_matches[i].value = val;
137 array->num_bracket_matches++;
138 return status;
141 #ifndef TRE_USE_SYSTEM_WCTYPE
143 /* isalnum() and the rest may be macros, so wrap them to functions. */
144 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
145 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
147 #ifdef tre_isascii
148 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
149 #else /* !tre_isascii */
150 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
151 #endif /* !tre_isascii */
153 #ifdef tre_isblank
154 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
155 #else /* !tre_isblank */
156 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
157 #endif /* !tre_isblank */
159 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
160 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
161 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
162 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
163 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
164 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
165 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
166 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
167 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
169 struct {
170 char *name;
171 int (*func)(tre_cint_t);
172 } tre_ctype_map[] = {
173 { "alnum", &tre_isalnum_func },
174 { "alpha", &tre_isalpha_func },
175 #ifdef tre_isascii
176 { "ascii", &tre_isascii_func },
177 #endif /* tre_isascii */
178 #ifdef tre_isblank
179 { "blank", &tre_isblank_func },
180 #endif /* tre_isblank */
181 { "cntrl", &tre_iscntrl_func },
182 { "digit", &tre_isdigit_func },
183 { "graph", &tre_isgraph_func },
184 { "lower", &tre_islower_func },
185 { "print", &tre_isprint_func },
186 { "punct", &tre_ispunct_func },
187 { "space", &tre_isspace_func },
188 { "upper", &tre_isupper_func },
189 { "xdigit", &tre_isxdigit_func },
190 { NULL, NULL}
193 tre_ctype_t tre_ctype(const char *name)
195 int i;
196 for (i = 0; tre_ctype_map[i].name != NULL; i++)
198 if (strcmp(name, tre_ctype_map[i].name) == 0)
199 return tre_ctype_map[i].func;
201 return (tre_ctype_t)0;
203 #endif /* !TRE_USE_SYSTEM_WCTYPE */
205 #define REST(re) (int)(ctx->re_end - (re)), (re)
207 #define START_COLLATING_SYMBOLS 16
208 #define MAX_COLLATING_SYMBOL_LEN 4
210 typedef struct {
211 const tre_char_t *start;
212 int len;
213 } tre_collating_symbol;
215 #ifdef BSD_COMPATIBILITY
216 static wchar_t
217 tre_search_cnames(const wchar_t *name, size_t len)
219 size_t low = 0;
220 size_t high = NCNAMES - 1;
221 size_t cur;
222 int cmp;
224 while(low <= high)
226 cur = (low + high) / 2;
227 cmp = wcsncmp(name, cnames[cur].name, len);
228 if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
229 if (cmp > 0) low = cur + 1;
230 else high = cur - 1;
232 return (wchar_t)-1;
234 #endif /* BSD_COMPATIBILITY */
236 /* Scan the contents of a bracket expression, and create a
237 * tre_bracket_match_list_t encoding the bracket expression. If during
238 * the scan, multi-character collating symbols are detected, switch
239 * into a mode to collect those MCCSs into a tre_collating_symbol
240 * list and pass them back. tre_parse_bracket will use that to
241 * create a new string composed of a union of the bracket expression
242 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
243 * call tre_parse (recursive) to parse that new string (which will
244 * call tre_parse_bracket and tre_parse_bracket_items again. */
245 static reg_errcode_t
246 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
247 int *items_size, tre_collating_symbol **result)
249 const tre_char_t *re = ctx->re;
250 const tre_char_t *re_end = ctx->re_end;
251 tre_collating_symbol *col_syms = NULL;
252 tre_collating_symbol *cp = NULL;
253 int n_col_syms = 0;
254 reg_errcode_t status;
255 int max_i = *items_size;
256 int other = 0; /* contains content other than multi-character collating
257 * symbols */
258 int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
259 tre_cint_t min, c;
260 int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
261 int collect_MCCS = 0;
262 const tre_char_t *start;
264 for ( ;re < re_end; re++)
266 switch (*re)
268 case CHAR_MINUS:
269 /* A first hyphen */
270 if (re == ctx->re)
272 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
273 min = CHAR_MINUS;
274 other++;
275 range = 0;
276 break;
278 /* The hyphen is the end range */
279 if (range > 0)
281 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
282 c = CHAR_MINUS;
283 goto process_end_range;
285 if (re + 1 >= re_end)
287 status = REG_EBRACK;
288 goto error;
290 /* The hyphen is at the end */
291 if (re[1] == CHAR_RBRACKET)
293 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
294 c = CHAR_MINUS;
295 goto process_begin_range;
297 /* Two ranges are not allowed to share an endpoint, or begin
298 * range is illegal. */
299 if (range < 0)
301 status = REG_ERANGE;
302 goto error;
304 range = 1; /* Expect end range */
305 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
306 break;
308 case CHAR_LBRACKET:
309 if (re + 1 >= re_end)
311 status = REG_EBRACK;
312 goto error;
314 switch (re[1])
316 case CHAR_PERIOD:
318 re += 2;
319 start = re;
320 for (;; re++)
322 if (re >= re_end)
324 status = REG_ECOLLATE;
325 goto error;
327 if (*re == CHAR_PERIOD)
329 if (re + 1 >= re_end)
331 status = REG_ECOLLATE;
332 goto error;
334 /* Found end */
335 if (re[1] == CHAR_RBRACKET)
337 DPRINT(("tre_parse_bracket: collating "
338 "symbol: '%.*" STRF "'\n",
339 REST(start - 2)));
340 /* Empty name */
341 if (re == start)
343 status = REG_ECOLLATE;
344 goto error;
346 #ifdef BSD_COMPATIBILITY
347 /* Check if the name is in cnames; if so, use
348 the corresponding code */
349 c = tre_search_cnames(start, re - start);
350 if (c != (wchar_t)-1)
352 re++;
353 goto process_single_character;
355 #endif /* BSD_COMPATIBILITY */
356 /* Verify this is a known sequence */
357 if (__collate_equiv_value(ctx->loc, start,
358 re - start) <= 0)
360 status = REG_ECOLLATE;
361 goto error;
363 /* Process single character collating symbols */
364 if (re - start == 1)
366 c = *start;
367 re++;
368 goto process_single_character;
370 /* Inverted MCCSs are undefined */
371 if (invert)
373 status = REG_ECOLLATE;
374 goto error;
376 /* Can't have MCCSs as an endpoint to a range */
377 if (range > 0)
379 status = REG_ERANGE;
380 goto error;
382 range = -1;
383 /* Switch into MCCS collection mode (if not
384 * already there */
385 #if TRE_DEBUG
386 if (!collect_MCCS)
388 collect_MCCS = 1;
389 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
391 #else /* !TRE_DEBUG */
392 collect_MCCS = 1;
393 #endif /* !TRE_DEBUG */
394 /* Allocate a memory block the first time */
395 if (!cp)
397 if ((col_syms = xmalloc(sizeof(*col_syms) *
398 (START_COLLATING_SYMBOLS + 2)))
399 == NULL)
400 return REG_ESPACE;
401 cp = col_syms + 1;
402 n_col_syms = START_COLLATING_SYMBOLS;
404 /* Enlarge the memory block is more is needed */
405 if ((cp - col_syms) - 1 >= n_col_syms)
407 int i = n_col_syms;
408 tre_collating_symbol *tmp =
409 xrealloc(col_syms, sizeof(*col_syms) *
410 ((n_col_syms *= 2) + 2));
411 if (tmp == NULL)
413 xfree(col_syms);
414 return REG_ESPACE;
416 DPRINT(("tre_list_collating_symbols: "
417 "Enlarging col_syms to %d\n",
418 n_col_syms));
419 col_syms = tmp;
420 cp = col_syms + i + 1;
422 cp->start = start;
423 cp->len = re - start;
424 cp++;
425 re++;
426 break;
430 break;
433 case CHAR_EQUAL:
434 case CHAR_COLON:
436 /* Process equivalence and character classes */
437 tre_char_t kind = re[1];
439 /* Can't have a class as an endpoint to a range */
440 if (range > 0)
442 status = REG_ERANGE;
443 goto error;
445 if (!collect_MCCS && range == 0)
447 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
448 min, &max_i, items);
449 if (status != REG_OK)
450 goto error;
452 range = -1;
453 re += 2;
454 start = re;
455 for (;; re++)
457 if (re >= re_end)
459 status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
460 goto error;
462 if (*re == kind)
464 if (re + 1 >= re_end)
466 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
467 REG_ECTYPE;
468 goto error;
470 /* Found end */
471 if (re[1] == CHAR_RBRACKET)
473 if (re == start)
475 /* Empty class name */
476 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
477 REG_ECTYPE;
478 goto error;
480 /* Process equivalence class */
481 if (kind == CHAR_EQUAL)
483 int equiv;
485 DPRINT(("tre_parse_bracket: equivalence: '%.*"
486 STRF "'\n", REST(start - 2)));
488 /* While we find the collation value even for
489 multi-character collating elements , we
490 don't (yet) match any collation values
491 against multi-character sequences. We'd have
492 to enumerate those multi-character sequences
493 and like multi-character collating symbols,
494 create a union of those sequences with the
495 rest of the bracket expression. While
496 doable, a bracket expression matching
497 multiple characters, that doesn't explicitly
498 contain multi-character sequences, might
499 be unexpected, so we punt for now. */
500 if ((equiv = __collate_equiv_value(ctx->loc,
501 start, re - start)) <= 0)
503 /* The standard says that if no collating
504 element if found, we use the collating
505 symbol itself. But __collate_equiv_value
506 doesn't make a distinction between
507 an element that is in a equvalence
508 class with others, or is the only member,
509 so we already know there is no collating
510 symbol. (Note that in the case of a
511 collating element whose collation value
512 is unique, matching against the
513 collating element itself, or against
514 its collation value, is equivalent.) */
515 #ifdef BSD_COMPATIBILITY
516 /* Check if the name is in cnames; if so,
517 use the corresponding code */
518 c = tre_search_cnames(start, re - start);
519 if (c != (wchar_t)-1)
521 re++;
522 goto process_single_character;
524 #endif /* BSD_COMPATIBILITY */
525 status = REG_ECOLLATE;
526 goto error;
528 if (!collect_MCCS)
530 status = tre_new_item(ctx->mem,
531 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
532 equiv, &max_i, items);
533 if (status != REG_OK)
534 goto error;
537 else
539 /* Process character class */
540 DPRINT(("tre_parse_bracket: class: '%.*" STRF
541 "'\n", REST(start - 2)));
542 if (!collect_MCCS)
544 char tmp_str[64];
545 tre_ctype_t class;
546 int len = MIN(re - start, 63);
547 #ifdef TRE_WCHAR
549 tre_char_t tmp_wcs[64];
550 wcsncpy(tmp_wcs, start, (size_t)len);
551 tmp_wcs[len] = L'\0';
552 #if defined HAVE_WCSRTOMBS
554 mbstate_t state;
555 const tre_char_t *src = tmp_wcs;
556 memset(&state, '\0', sizeof(state));
557 len = wcsrtombs_l(tmp_str, &src,
558 sizeof(tmp_str), &state,
559 ctx->loc);
561 #elif defined HAVE_WCSTOMBS
562 len = wcstombs(tmp_str, tmp_wcs, 63);
563 #endif /* defined HAVE_WCSTOMBS */
565 #else /* !TRE_WCHAR */
566 strncpy(tmp_str, (const char*)start, len);
567 #endif /* !TRE_WCHAR */
568 tmp_str[len] = '\0';
569 DPRINT((" class name: %s\n", tmp_str));
570 class = tre_ctype_l(tmp_str, ctx->loc);
571 if (!class)
573 status = REG_ECTYPE;
574 goto error;
576 status = tre_new_item(ctx->mem,
577 TRE_BRACKET_MATCH_TYPE_CLASS,
578 class, &max_i, items);
579 if (status != REG_OK)
580 goto error;
583 re++;
584 break;
588 other++;
589 break;
592 default:
593 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
594 c = CHAR_LBRACKET;
595 goto process_single_character;
596 break;
598 break;
600 case CHAR_RBRACKET:
601 /* A first right bracket */
602 if (re == ctx->re)
604 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
605 min = CHAR_RBRACKET;
606 range = 0;
607 other++;
608 break;
610 /* Done */
611 if (collect_MCCS)
613 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
614 REST(re)));
615 if (col_syms)
617 /* Mark the character following the right bracket. Set len
618 * to whether there are other things besides the
619 * multi-character collating symbols */
620 col_syms->start = re + 1;
621 col_syms->len = other;
622 /* Mark the end of the list */
623 cp->start = NULL;
625 *result = col_syms;
626 return REG_OK;
628 /* range > 0 is not possible, since we did a lookahead after the
629 * hyphen */
630 if (range == 0)
632 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
633 min, &max_i, items);
634 if (status != REG_OK)
635 goto error;
637 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
638 *items_size = max_i;
639 ctx->re = re + 1;
640 return REG_OK;
642 default:
643 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
644 c = *re;
645 process_single_character:
646 /* Process single character */
647 if (range > 0)
649 int mine, maxe;
651 process_end_range:
652 /* Get collation equivalence values */
653 mine = __collate_equiv_value(ctx->loc, &min, 1);
654 maxe = __collate_equiv_value(ctx->loc, &c, 1);
655 if (maxe < mine)
657 status = REG_ERANGE;
658 goto error;
660 if (!collect_MCCS)
662 status = tre_new_item(ctx->mem,
663 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
664 mine, &max_i, items);
665 if (status != REG_OK)
666 goto error;
667 status = tre_new_item(ctx->mem,
668 TRE_BRACKET_MATCH_TYPE_RANGE_END,
669 maxe, &max_i, items);
670 if (status != REG_OK)
671 goto error;
673 range = -1;
675 else
677 process_begin_range:
678 if (!collect_MCCS)
680 if (range == 0)
682 status = tre_new_item(ctx->mem,
683 TRE_BRACKET_MATCH_TYPE_CHAR,
684 min, &max_i, items);
685 if (status != REG_OK)
686 goto error;
688 min = c;
690 range = 0;
692 other++;
693 break;
696 status = REG_EBRACK;
697 error:
698 DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
699 REST(re), status));
700 if (col_syms)
701 xfree(col_syms);
702 return status;
705 #ifdef TRE_DEBUG
706 static const char *bracket_match_type_str[] = {
707 "unused",
708 "char",
709 "range begin",
710 "range end",
711 "class",
712 "equivalence value",
714 #endif /* TRE_DEBUG */
716 static reg_errcode_t
717 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
719 tre_ast_node_t *node;
720 reg_errcode_t status = REG_OK;
721 tre_bracket_match_list_t *items;
722 int max_i = 32;
723 tre_collating_symbol *col_syms = NULL;
725 /* Handle special cases [[:<:]] and [[:>:]] */
726 if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
727 && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
728 && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
729 && ctx->re[5] == CHAR_RBRACKET)
731 *result = tre_ast_new_literal(ctx->mem, ASSERTION,
732 (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
733 -1);
734 DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
735 "[[:<:]]" : "[[:>:]]"));
736 ctx->re += 6;
737 return *result ? REG_OK : REG_ESPACE;
740 /* Start off with an array of `max_i' elements. */
741 items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
742 if (items == NULL)
743 return REG_ESPACE;
745 if (*ctx->re == CHAR_CARET)
747 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
748 items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
749 ctx->re++;
752 status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
754 if (status != REG_OK)
755 goto parse_bracket_done;
757 /* If there are collating symbols, split off the multi-character ones
758 * into a union of the bracket expression (without the collating symbols)
759 * and the multiple-character sequences. We create an equivalent input
760 * string and run tre_parse() recursively */
761 if (col_syms)
763 tre_char_t *str, *sp;
764 tre_collating_symbol *cp;
765 tre_parse_ctx_t subctx;
767 /* Allocate a new string. We start with the size of the original
768 * bracket expression (minus 1) and add 2 (for a leading "[" and
769 * a trailing nil; don't need a "^", since it is illegal to have
770 * inverted MCCSs). Since a multi-character collating symbols
771 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
772 * need to worry about the new string getting too long. */
773 xfree(items);
774 str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
775 if (str == NULL)
777 xfree(col_syms);
778 return REG_ESPACE;
780 sp = str;
781 if (col_syms->len > 0)
783 /* There are other items in the bracket expression besides the
784 * multi-character collating symbols, so create a new bracket
785 * expression with only those other itmes. */
786 const tre_char_t *re;
787 ptrdiff_t i;
789 *sp++ = '[';
790 re = ctx->re;
791 for (cp = col_syms + 1; cp->start; cp++)
793 /* The "- 2" is to account for the "[." */
794 if ((i = ((cp->start - re) - 2)) > 0)
796 memcpy(sp, re, sizeof(*sp) * i);
797 sp += i;
799 /* The "+ 2" is to account for the ".]" */
800 re = cp->start + cp->len + 2;
802 i = col_syms->start - re; /* Includes the trailing right bracket */
803 memcpy(sp, re, sizeof(*sp) * i);
804 sp += i;
805 *sp++ = '|';
807 for (cp = col_syms + 1; cp->start; cp++)
809 memcpy(sp, cp->start, sizeof(*sp) * cp->len);
810 sp += cp->len;
811 if (cp[1].start)
812 *sp++ = '|';
814 *sp = 0;
815 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
816 str));
818 memcpy(&subctx, ctx, sizeof(subctx));
819 subctx.re = str;
820 subctx.len = sp - str;
821 subctx.nofirstsub = 1;
822 subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
823 status = tre_parse(&subctx);
824 xfree(str);
825 if (status != REG_OK)
827 xfree(col_syms);
828 return status;
830 ctx->re = col_syms->start;
831 ctx->position = subctx.position;
832 xfree(col_syms);
833 *result = subctx.result;
834 DPRINT(("tre_parse_bracket: Returning to original string\n"));
835 return REG_OK;
838 DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
839 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
840 if (node == NULL)
842 status = REG_ESPACE;
843 goto parse_bracket_done;
845 else
847 tre_literal_t *l = node->obj;
848 l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
849 SIZEOF_BRACKET_MATCH_LIST(items));
850 if (l->u.bracket_match_list == NULL)
852 status = REG_ESPACE;
853 goto parse_bracket_done;
855 memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
858 #ifdef TRE_DEBUG
860 int i;
861 tre_bracket_match_t *b;
862 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
863 items->num_bracket_matches, items->flags));
864 for (i = 0, b = items->bracket_matches;
865 i < items->num_bracket_matches; i++, b++)
867 DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
868 b->value));
871 #endif /* TRE_DEBUG */
873 parse_bracket_done:
874 xfree(items);
875 ctx->position++;
876 *result = node;
877 return status;
881 /* Parses a positive decimal integer. Returns -1 if the string does not
882 contain a valid number. */
883 static int
884 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
886 int num = -1;
887 const tre_char_t *r = *regex;
888 while (r < regex_end && *r >= L'0' && *r <= L'9')
890 if (num < 0)
891 num = 0;
892 num = num * 10 + *r - L'0';
893 r++;
895 *regex = r;
896 return num;
900 static reg_errcode_t
901 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
903 int min, max;
904 #ifdef TRE_APPROX
905 int i;
906 int cost_ins, cost_del, cost_subst, cost_max;
907 int limit_ins, limit_del, limit_subst, limit_err;
908 const tre_char_t *start;
909 #endif /* TRE_APPROX */
910 const tre_char_t *r = ctx->re;
911 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
912 #ifdef TRE_APPROX
913 int approx = 0;
914 int costs_set = 0;
915 int counts_set = 0;
917 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
918 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
919 #endif /* TRE_APPROX */
921 /* Parse number (minimum repetition count). */
922 min = -1;
923 if (r >= ctx->re_end)
924 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
925 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
926 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
927 return REG_EBRACE;
928 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929 if (*r >= L'0' && *r <= L'9') {
930 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
931 min = tre_parse_int(&r, ctx->re_end);
933 #ifndef TRE_APPROX
934 else
935 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
936 /* For ERE, return REG_NOMATCH to signal that the lbrace should
937 be treated as a literal */
938 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
939 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
940 return REG_BADBR;
941 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942 #endif /* !TRE_APPROX */
944 /* Parse comma and second number (maximum repetition count). */
945 max = min;
946 if (r < ctx->re_end && *r == CHAR_COMMA)
948 r++;
949 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
950 max = tre_parse_int(&r, ctx->re_end);
953 /* Check that the repeat counts are sane. */
954 if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
955 return REG_BADBR;
958 #ifdef TRE_APPROX
961 optionally followed immediately by a number == minimum repcount
962 optionally followed by , then a number == maximum repcount
963 + then a number == maximum insertion count
964 - then a number == maximum deletion count
965 # then a number == maximum substitution count
966 ~ then a number == maximum number of errors
967 Any of +, -, # or ~ without followed by a number means that
968 the maximum count/number of errors is infinite.
970 An equation of the form
971 Xi + Yd + Zs < C
972 can be specified to set costs and the cost limit to a value
973 different from the default value:
974 - X is the cost of an insertion
975 - Y is the cost of a deletion
976 - Z is the cost of a substitution
977 - C is the maximum cost
979 If no count limit or cost is set for an operation, the operation
980 is not allowed at all.
984 do {
985 int done;
986 start = r;
988 /* Parse count limit settings */
989 done = 0;
990 if (!counts_set)
991 while (r + 1 < ctx->re_end && !done)
993 switch (*r)
995 case CHAR_PLUS: /* Insert limit */
996 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
997 r++;
998 limit_ins = tre_parse_int(&r, ctx->re_end);
999 if (limit_ins < 0)
1000 limit_ins = INT_MAX;
1001 counts_set = 1;
1002 break;
1003 case CHAR_MINUS: /* Delete limit */
1004 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
1005 r++;
1006 limit_del = tre_parse_int(&r, ctx->re_end);
1007 if (limit_del < 0)
1008 limit_del = INT_MAX;
1009 counts_set = 1;
1010 break;
1011 case CHAR_HASH: /* Substitute limit */
1012 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1013 r++;
1014 limit_subst = tre_parse_int(&r, ctx->re_end);
1015 if (limit_subst < 0)
1016 limit_subst = INT_MAX;
1017 counts_set = 1;
1018 break;
1019 case CHAR_TILDE: /* Maximum number of changes */
1020 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1021 r++;
1022 limit_err = tre_parse_int(&r, ctx->re_end);
1023 if (limit_err < 0)
1024 limit_err = INT_MAX;
1025 approx = 1;
1026 break;
1027 case CHAR_COMMA:
1028 r++;
1029 break;
1030 case L' ':
1031 r++;
1032 break;
1033 case L'}':
1034 done = 1;
1035 break;
1036 default:
1037 done = 1;
1038 break;
1042 /* Parse cost restriction equation. */
1043 done = 0;
1044 if (!costs_set)
1045 while (r + 1 < ctx->re_end && !done)
1047 switch (*r)
1049 case CHAR_PLUS:
1050 case L' ':
1051 r++;
1052 break;
1053 case L'<':
1054 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
1055 r++;
1056 while (*r == L' ')
1057 r++;
1058 cost_max = tre_parse_int(&r, ctx->re_end);
1059 if (cost_max < 0)
1060 cost_max = INT_MAX;
1061 else
1062 cost_max--;
1063 approx = 1;
1064 break;
1065 case CHAR_COMMA:
1066 r++;
1067 done = 1;
1068 break;
1069 default:
1070 if (*r >= L'0' && *r <= L'9')
1072 #ifdef TRE_DEBUG
1073 const tre_char_t *sr = r;
1074 #endif /* TRE_DEBUG */
1075 int cost = tre_parse_int(&r, ctx->re_end);
1076 /* XXX - make sure r is not past end. */
1077 switch (*r)
1079 case L'i': /* Insert cost */
1080 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
1081 REST(sr)));
1082 r++;
1083 cost_ins = cost;
1084 costs_set = 1;
1085 break;
1086 case L'd': /* Delete cost */
1087 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
1088 REST(sr)));
1089 r++;
1090 cost_del = cost;
1091 costs_set = 1;
1092 break;
1093 case L's': /* Substitute cost */
1094 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
1095 REST(sr)));
1096 r++;
1097 cost_subst = cost;
1098 costs_set = 1;
1099 break;
1100 default:
1101 return REG_BADBR;
1104 else
1106 done = 1;
1107 break;
1111 } while (start != r);
1112 #endif /* TRE_APPROX */
1114 /*{*//* Missing }. */
1115 if (r >= ctx->re_end)
1116 return REG_EBRACE;
1118 /* Empty contents of {}. */
1119 if (r == ctx->re)
1120 return REG_BADBR;
1122 /* Parse the ending '}' or '\}'.*/
1123 if (ctx->cflags & REG_EXTENDED)
1125 if (r >= ctx->re_end || *r != CHAR_RBRACE)
1126 return REG_BADBR;
1127 r++;
1128 /* Parse trailing '?' marking minimal repetition. */
1129 if (r < ctx->re_end)
1131 if (*r == CHAR_QUESTIONMARK)
1133 /* Process the question mark only in enhanced mode.
1134 Otherwise, the question mark is an error in ERE
1135 or a literal in BRE */
1136 if (ctx->cflags & REG_ENHANCED)
1138 minimal = !(ctx->cflags & REG_UNGREEDY);
1139 r++;
1141 else return REG_BADRPT;
1143 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1145 /* These are reserved for future extensions. */
1146 return REG_BADRPT;
1150 else
1152 if (r + 1 >= ctx->re_end
1153 || *r != CHAR_BACKSLASH
1154 || *(r + 1) != CHAR_RBRACE)
1155 return REG_BADBR;
1156 r += 2;
1157 if (r < ctx->re_end && *r == CHAR_STAR)
1159 /* This is reserved for future extensions. */
1160 return REG_BADRPT;
1164 if (minimal)
1165 ctx->num_reorder_tags++;
1167 if (!result) goto parse_bound_exit;
1168 /* Create the AST node(s). */
1169 /* Originally, if min == 0 && max == 0, we immediately replace the whole
1170 iteration with EMPTY. This unfortunately drops any submatches, and
1171 messes up setting the pmatch values (we can get tags of -1, and
1172 tag values in the billions). So we leave it and process this case as
1173 usual, and wait until tre_expand_ast() to replace with EMPTY */
1174 #ifdef TRE_APPROX
1175 if (min < 0 && max < 0)
1176 /* Only approximate parameters set, no repetitions. */
1177 min = max = 1;
1178 #endif /* TRE_APPROX */
1180 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1181 if (!*result)
1182 return REG_ESPACE;
1184 #ifdef TRE_APPROX
1185 /* If approximate matching parameters are set, add them to the
1186 iteration node. */
1187 if (approx || costs_set || counts_set)
1189 int *params;
1190 tre_iteration_t *iter = (*result)->obj;
1192 if (costs_set || counts_set)
1194 if (limit_ins == TRE_PARAM_UNSET)
1196 if (cost_ins == TRE_PARAM_UNSET)
1197 limit_ins = 0;
1198 else
1199 limit_ins = INT_MAX;
1202 if (limit_del == TRE_PARAM_UNSET)
1204 if (cost_del == TRE_PARAM_UNSET)
1205 limit_del = 0;
1206 else
1207 limit_del = INT_MAX;
1210 if (limit_subst == TRE_PARAM_UNSET)
1212 if (cost_subst == TRE_PARAM_UNSET)
1213 limit_subst = 0;
1214 else
1215 limit_subst = INT_MAX;
1219 if (cost_max == TRE_PARAM_UNSET)
1220 cost_max = INT_MAX;
1221 if (limit_err == TRE_PARAM_UNSET)
1222 limit_err = INT_MAX;
1224 ctx->have_approx = 1;
1225 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1226 if (!params)
1227 return REG_ESPACE;
1228 for (i = 0; i < TRE_PARAM_LAST; i++)
1229 params[i] = TRE_PARAM_UNSET;
1230 params[TRE_PARAM_COST_INS] = cost_ins;
1231 params[TRE_PARAM_COST_DEL] = cost_del;
1232 params[TRE_PARAM_COST_SUBST] = cost_subst;
1233 params[TRE_PARAM_COST_MAX] = cost_max;
1234 params[TRE_PARAM_MAX_INS] = limit_ins;
1235 params[TRE_PARAM_MAX_DEL] = limit_del;
1236 params[TRE_PARAM_MAX_SUBST] = limit_subst;
1237 params[TRE_PARAM_MAX_ERR] = limit_err;
1238 iter->params = params;
1240 #endif /* TRE_APPROX */
1242 parse_bound_exit:
1243 #ifdef TRE_APPROX
1244 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1245 "limits [%d,%d,%d, total %d]\n",
1246 min, max, cost_ins, cost_del, cost_subst, cost_max,
1247 limit_ins, limit_del, limit_subst, limit_err));
1248 #else /* !TRE_APPROX */
1249 DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1250 #endif /* !TRE_APPROX */
1253 ctx->re = r;
1254 return REG_OK;
1257 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1258 non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1259 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1260 Because we now set up tags for even non-capturing parenthesized
1261 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
1262 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1263 have it restore cflags after the subexpression, we don't need to have
1264 a separate PARSE_RESTORE_CFLAGS, and then after processing the
1265 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1266 This has the side-benefit of now matching the perl behavior: the RE
1267 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1268 of foo AND (?i) (bar OR zap). */
1269 typedef enum {
1270 PARSE_RE = 0,
1271 PARSE_ATOM,
1272 PARSE_MARK_FOR_SUBMATCH,
1273 PARSE_BRANCH,
1274 PARSE_PIECE,
1275 PARSE_CATENATION,
1276 PARSE_POST_CATENATION,
1277 PARSE_UNION,
1278 PARSE_POST_UNION,
1279 PARSE_POSTFIX,
1280 } tre_parse_re_stack_symbol_t;
1283 reg_errcode_t
1284 tre_parse(tre_parse_ctx_t *ctx)
1286 tre_ast_node_t *result = NULL;
1287 tre_parse_re_stack_symbol_t symbol;
1288 reg_errcode_t status = REG_OK;
1289 tre_stack_t *stack = ctx->stack;
1290 int bottom = tre_stack_num_objects(stack);
1291 int depth = 0;
1292 int temporary_cflags = 0;
1293 int bre_branch_begin;
1294 #ifdef TRE_DEBUG
1295 const tre_char_t *tmp_re;
1296 #endif
1298 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1299 ctx->len, ctx->re, ctx->len, ctx->cflags));
1301 if (ctx->len <= 0) return REG_EMPTY;
1302 if (!ctx->nofirstsub)
1304 STACK_PUSH(stack, int, ctx->cflags);
1305 STACK_PUSH(stack, int, ctx->submatch_id);
1306 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1307 ctx->submatch_id++;
1309 STACK_PUSH(stack, int, 0); // bre_branch_begin
1310 STACK_PUSH(stack, int, PARSE_RE);
1311 ctx->re_start = ctx->re;
1312 ctx->re_end = ctx->re + ctx->len;
1315 /* The following is basically just a recursive descent parser. I use
1316 an explicit stack instead of recursive functions mostly because of
1317 two reasons: compatibility with systems which have an overflowable
1318 call stack, and efficiency (both in lines of code and speed). */
1319 while (tre_stack_num_objects(stack) > bottom)
1321 symbol = tre_stack_pop_int(stack);
1322 switch (symbol)
1324 case PARSE_RE:
1325 /* Parse a full regexp. A regexp is one or more branches,
1326 separated by the union operator `|'. */
1327 bre_branch_begin = tre_stack_pop_int(stack);
1328 if (
1329 #ifdef REG_LITERAL
1330 !(ctx->cflags & REG_LITERAL) &&
1331 #endif /* REG_LITERAL */
1332 ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1333 STACK_PUSHX(stack, int, PARSE_UNION);
1334 STACK_PUSHX(stack, int, bre_branch_begin);
1335 STACK_PUSHX(stack, int, PARSE_BRANCH);
1336 break;
1338 case PARSE_BRANCH:
1339 /* Parse a branch. A branch is one or more pieces, concatenated.
1340 A piece is an atom possibly followed by a postfix operator. */
1341 bre_branch_begin = tre_stack_pop_int(stack);
1342 STACK_PUSHX(stack, int, PARSE_CATENATION);
1343 STACK_PUSHX(stack, int, bre_branch_begin);
1344 STACK_PUSHX(stack, int, PARSE_PIECE);
1345 break;
1347 case PARSE_PIECE:
1348 /* Parse a piece. A piece is an atom possibly followed by one
1349 or more postfix operators. */
1350 bre_branch_begin = tre_stack_pop_int(stack);
1351 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1352 STACK_PUSHX(stack, int, bre_branch_begin);
1353 STACK_PUSHX(stack, int, PARSE_ATOM);
1354 break;
1356 case PARSE_CATENATION:
1357 /* If the expression has not ended, parse another piece. */
1359 tre_char_t c;
1360 if (ctx->re >= ctx->re_end)
1361 break;
1362 c = *ctx->re;
1363 #ifdef REG_LITERAL
1364 if (!(ctx->cflags & REG_LITERAL))
1366 #endif /* REG_LITERAL */
1367 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1368 ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1369 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1370 *(ctx->re + 1) == CHAR_PIPE))
1371 break;
1372 if ((ctx->cflags & REG_EXTENDED
1373 && c == CHAR_RPAREN && depth > 0)
1374 || (!(ctx->cflags & REG_EXTENDED)
1375 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1376 && *(ctx->re + 1) == CHAR_RPAREN))
1378 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1379 return REG_EPAREN;
1380 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1381 REST(ctx->re)));
1382 depth--;
1383 if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1384 ctx->re += 2;
1385 break;
1387 #ifdef REG_LITERAL
1389 #endif /* REG_LITERAL */
1391 #ifdef REG_LEFT_ASSOC
1392 if (ctx->cflags & REG_LEFT_ASSOC)
1394 /* Left associative concatenation. */
1395 STACK_PUSHX(stack, int, PARSE_CATENATION);
1396 STACK_PUSHX(stack, voidptr, result);
1397 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1398 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1399 STACK_PUSHX(stack, int, PARSE_PIECE);
1401 else
1402 #endif /* REG_LEFT_ASSOC */
1404 /* Default case, right associative concatenation. */
1405 STACK_PUSHX(stack, voidptr, result);
1406 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1407 STACK_PUSHX(stack, int, PARSE_CATENATION);
1408 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1409 STACK_PUSHX(stack, int, PARSE_PIECE);
1411 break;
1414 case PARSE_POST_CATENATION:
1416 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1417 tre_ast_node_t *tmp_node;
1418 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1419 if (!tmp_node)
1420 return REG_ESPACE;
1421 result = tmp_node;
1422 break;
1425 case PARSE_UNION:
1426 if (ctx->re >= ctx->re_end)
1427 break;
1428 #ifdef REG_LITERAL
1429 if (ctx->cflags & REG_LITERAL)
1430 break;
1431 #endif /* REG_LITERAL */
1432 if (!(ctx->cflags & REG_EXTENDED))
1434 if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1435 break;
1436 ctx->re++;
1438 switch (*ctx->re)
1440 case CHAR_PIPE:
1441 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1442 REST(ctx->re)));
1443 STACK_PUSHX(stack, int, PARSE_UNION);
1444 STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1445 STACK_PUSHX(stack, voidptr, result);
1446 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1447 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1448 indicate if this is the beginning of a BRE extended branch. */
1449 STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1450 STACK_PUSHX(stack, int, PARSE_BRANCH);
1451 ctx->re++;
1452 break;
1454 case CHAR_RPAREN:
1455 ctx->re++;
1456 break;
1458 default:
1459 if (!(ctx->cflags & REG_EXTENDED))
1460 ctx->re--;
1461 break;
1463 break;
1465 case PARSE_POST_UNION:
1467 tre_ast_node_t *tmp_node;
1468 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1469 const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1470 /* error on empty expression at end of union */
1471 if (pipechar == ctx->re - 1)
1473 return REG_EMPTY;
1475 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1476 if (!tmp_node)
1477 return REG_ESPACE;
1478 result = tmp_node;
1479 break;
1482 case PARSE_POSTFIX:
1483 /* Parse postfix operators. */
1484 if (ctx->re >= ctx->re_end)
1485 break;
1486 #ifdef REG_LITERAL
1487 if (ctx->cflags & REG_LITERAL)
1488 break;
1489 #endif /* REG_LITERAL */
1490 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1491 int rep_min = 0;
1492 int rep_max = -1;
1493 #ifdef TRE_DEBUG
1494 int lbrace_off;
1495 #endif
1496 switch (*ctx->re)
1498 case CHAR_PLUS:
1499 case CHAR_QUESTIONMARK:
1500 if (!(ctx->cflags & REG_EXTENDED))
1501 break;
1502 /*FALLTHROUGH*/
1503 case CHAR_STAR:
1505 tre_ast_node_t *tmp_node;
1506 #ifdef TRE_DEBUG
1507 const char *tstr = "star";
1508 tmp_re = ctx->re;
1509 #endif
1511 handle_plus_or_question:
1512 /* error on iteration of raw assertion (not in subexpression) */
1513 if (result->type == LITERAL && result->submatch_id < 0 &&
1514 IS_ASSERTION((tre_literal_t *)result->obj))
1516 if (!(ctx->cflags & REG_EXTENDED)) break;
1517 return REG_BADRPT;
1519 if (*ctx->re == CHAR_PLUS)
1521 rep_min = 1;
1522 #ifdef TRE_DEBUG
1523 tstr = "plus";
1524 #endif
1526 if (*ctx->re == CHAR_QUESTIONMARK)
1528 rep_max = 1;
1529 #ifdef TRE_DEBUG
1530 tstr = "questionmark";
1531 #endif
1534 if (ctx->cflags & REG_EXTENDED)
1536 if (ctx->re + 1 < ctx->re_end)
1538 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1540 /* Process the question mark only in enhanced mode.
1541 Otherwise, the question mark is an error in ERE */
1542 if (ctx->cflags & REG_ENHANCED)
1544 minimal = !(ctx->cflags & REG_UNGREEDY);
1545 ctx->re++;
1547 else return REG_BADRPT;
1549 else if (*(ctx->re + 1) == CHAR_STAR
1550 || *(ctx->re + 1) == CHAR_PLUS)
1552 /* These are reserved for future extensions. */
1553 return REG_BADRPT;
1557 else
1559 if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1561 /* This is reserved for future extensions. */
1562 return REG_BADRPT;
1564 if (ctx->re + 2 < ctx->re_end)
1566 if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK)
1568 /* Process the question mark only in enhanced mode.
1569 Otherwise, the question mark is a literal in BRE */
1570 if (ctx->cflags & REG_ENHANCED)
1572 minimal = !(ctx->cflags & REG_UNGREEDY);
1573 ctx->re += 2;
1576 else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1578 /* This is reserved for future extensions. */
1579 return REG_BADRPT;
1584 if (minimal)
1585 ctx->num_reorder_tags++;
1587 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1588 minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
1589 if (result == NULL)
1591 if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1592 else goto parse_literal;
1594 ctx->re++;
1595 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1596 minimal);
1597 if (tmp_node == NULL)
1598 return REG_ESPACE;
1599 result = tmp_node;
1601 /* Set the iterator with a submatch id in the invisible range
1602 * (which will be overridden if a real submatch is needed) */
1603 result->submatch_id = ctx->submatch_id_invisible++;
1605 #if 0
1606 /* We don't allow multiple postfixes, but this might be needed
1607 to support approximate matching */
1608 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1609 #endif
1611 break;
1613 case CHAR_BACKSLASH:
1614 /* "\{" is special without REG_EXTENDED */
1615 /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1616 if (!(ctx->cflags & REG_EXTENDED)
1617 && ctx->re + 1 < ctx->re_end)
1619 switch (*(ctx->re + 1))
1621 case CHAR_LBRACE:
1622 ctx->re++;
1623 #ifdef TRE_DEBUG
1624 lbrace_off = 2;
1625 #endif
1626 goto parse_brace;
1627 case CHAR_PLUS:
1628 case CHAR_QUESTIONMARK:
1629 if (ctx->cflags & REG_ENHANCED)
1631 #ifdef TRE_DEBUG
1632 tmp_re = ctx->re;
1633 #endif
1634 ctx->re++;
1635 goto handle_plus_or_question;
1637 break;
1639 break;
1641 else
1642 break;
1644 case CHAR_LBRACE:
1646 int raw_assertion;
1648 /* "{" is literal without REG_EXTENDED */
1649 if (!(ctx->cflags & REG_EXTENDED))
1650 break;
1651 #ifdef TRE_DEBUG
1652 lbrace_off = 1;
1653 #endif
1655 parse_brace:
1656 /* error on iteration of raw assertion (not in subexpression),
1657 but wait until after parsing bounds */
1658 raw_assertion = (result->type == LITERAL
1659 && result->submatch_id < 0
1660 && IS_ASSERTION((tre_literal_t *)result->obj));
1661 ctx->re++;
1663 status = tre_parse_bound(ctx, &result);
1664 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1665 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1666 is to be treated as a literal. */
1667 if (status == REG_NOMATCH)
1669 ctx->re--;
1670 break;
1672 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1673 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1674 REST(ctx->re - lbrace_off)));
1675 if (status != REG_OK)
1676 return status;
1677 if (raw_assertion) return REG_BADRPT;
1679 /* Set the iterator with a submatch id in the invisible range
1680 * (which will be overridden if a real submatch is needed) */
1681 if (result->type == ITERATION)
1682 result->submatch_id = ctx->submatch_id_invisible++;
1684 #if 0
1685 /* We don't allow multiple postfixes, but this might be needed
1686 to support approximate matching */
1687 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1688 #endif
1689 break;
1692 break;
1694 case PARSE_ATOM:
1696 /* Parse an atom. An atom is a regular expression enclosed in `()',
1697 an empty set of `()', a bracket expression, `.', `^', `$',
1698 a `\' followed by a character, or a single character. */
1700 /* The stack contains a boolean value, whether PARSE_ATOM is
1701 being called just after the start of a group (left paren)
1702 in a BRE */
1703 bre_branch_begin = tre_stack_pop_int(stack);
1705 /* End of regexp? (empty string). */
1706 if (ctx->re >= ctx->re_end)
1707 goto parse_literal;
1709 #ifdef REG_LITERAL
1710 if (ctx->cflags & REG_LITERAL)
1711 goto parse_literal;
1712 #endif /* REG_LITERAL */
1714 switch (*ctx->re)
1716 case CHAR_LPAREN: /* parenthesized subexpression */
1718 /* Handle "(?...)" extensions. They work in a way similar
1719 to Perls corresponding extensions. */
1720 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1721 (REG_EXTENDED|REG_ENHANCED)
1722 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1724 int new_cflags = ctx->cflags;
1725 int bit = 1;
1726 int invisible_submatch = 0;
1727 DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1728 REST(ctx->re)));
1729 ctx->re += 2;
1730 while (/*CONSTCOND*/1)
1732 if (*ctx->re == L'i')
1734 DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
1735 REST(ctx->re)));
1736 if (bit)
1737 new_cflags |= REG_ICASE;
1738 else
1739 new_cflags &= ~REG_ICASE;
1740 ctx->re++;
1742 else if (*ctx->re == L'n')
1744 DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
1745 REST(ctx->re)));
1746 if (bit)
1747 new_cflags |= REG_NEWLINE;
1748 else
1749 new_cflags &= ~REG_NEWLINE;
1750 ctx->re++;
1752 #ifdef REG_LEFT_ASSOC
1753 else if (*ctx->re == L'l')
1755 DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1756 REST(ctx->re)));
1757 if (bit)
1758 new_cflags |= REG_LEFT_ASSOC;
1759 else
1760 new_cflags &= ~REG_LEFT_ASSOC;
1761 ctx->re++;
1763 #endif /* REG_LEFT_ASSOC */
1764 #ifdef REG_UNGREEDY
1765 else if (*ctx->re == L'U')
1767 DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
1768 REST(ctx->re)));
1769 if (bit)
1770 new_cflags |= REG_UNGREEDY;
1771 else
1772 new_cflags &= ~REG_UNGREEDY;
1773 ctx->re++;
1775 #endif /* REG_UNGREEDY */
1776 else if (*ctx->re == CHAR_MINUS)
1778 DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
1779 REST(ctx->re)));
1780 ctx->re++;
1781 bit = 0;
1783 else if (*ctx->re == CHAR_COLON)
1785 DPRINT(("tre_parse: no group: '%.*" STRF
1786 "', (invisible submatch %d)\n",
1787 REST(ctx->re), ctx->submatch_id_invisible));
1788 ctx->re++;
1789 depth++;
1790 invisible_submatch = 1;
1791 break;
1793 else if (*ctx->re == CHAR_HASH)
1795 DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
1796 REST(ctx->re)));
1797 /* A comment can contain any character except a
1798 right parenthesis */
1799 while (*ctx->re != CHAR_RPAREN
1800 && ctx->re < ctx->re_end)
1801 ctx->re++;
1802 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1804 ctx->re++;
1805 break;
1807 else
1808 return REG_BADPAT;
1810 else if (*ctx->re == CHAR_RPAREN)
1812 ctx->re++;
1813 break;
1815 else
1816 return REG_BADRPT;
1819 /* Turn on the cflags changes for the rest of the
1820 enclosing group. */
1821 if (invisible_submatch)
1823 STACK_PUSHX(stack, int, ctx->cflags);
1824 STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1825 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1826 ctx->submatch_id_invisible++;
1827 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1828 STACK_PUSHX(stack, int, PARSE_RE);
1830 else {
1831 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1832 STACK_PUSHX(stack, int, PARSE_ATOM);
1834 ctx->cflags = new_cflags;
1835 break;
1838 if (ctx->cflags & REG_EXTENDED)
1840 parse_bre_lparen:
1841 DPRINT(("tre_parse: group begin: '%.*" STRF
1842 "', submatch %d\n", REST(ctx->re),
1843 ctx->submatch_id));
1844 ctx->re++;
1845 /* First parse a whole RE, then mark the resulting tree
1846 for submatching. */
1847 STACK_PUSHX(stack, int, ctx->cflags);
1848 STACK_PUSHX(stack, int, ctx->submatch_id);
1849 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1850 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1851 indicate if this is the beginning of a BRE group. */
1852 STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1853 STACK_PUSHX(stack, int, PARSE_RE);
1854 ctx->submatch_id++;
1855 depth++;
1857 else
1858 goto parse_literal;
1859 break;
1861 case CHAR_RPAREN: /* end of current subexpression */
1862 if (ctx->cflags & REG_EXTENDED && depth > 0)
1864 parse_bre_rparen_empty:
1865 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1866 return REG_EPAREN;
1867 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1868 REST(ctx->re)));
1869 /* We were expecting an atom, but instead the current
1870 subexpression was closed. POSIX leaves the meaning of
1871 this to be implementation-defined. We interpret this as
1872 an empty expression (which matches an empty string). */
1873 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1874 if (result == NULL)
1875 return REG_ESPACE;
1876 if (!(ctx->cflags & REG_EXTENDED))
1877 ctx->re--;
1879 else
1880 goto parse_literal;
1881 break;
1883 case CHAR_LBRACKET: /* bracket expression */
1884 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1885 REST(ctx->re)));
1886 ctx->re++;
1887 status = tre_parse_bracket(ctx, &result);
1888 if (status != REG_OK)
1889 return status;
1890 break;
1892 case CHAR_BACKSLASH:
1893 /* Deal with "\(", "\)" or "\{" for BREs */
1894 if (!(ctx->cflags & REG_EXTENDED)
1895 && ctx->re + 1 < ctx->re_end)
1897 if (*(ctx->re + 1) == CHAR_LPAREN)
1899 ctx->re++;
1900 goto parse_bre_lparen;
1902 else if (*(ctx->re + 1) == CHAR_RPAREN)
1904 ctx->re++;
1905 goto parse_bre_rparen_empty;
1907 if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1910 if (ctx->re + 1 >= ctx->re_end)
1911 /* Trailing backslash. */
1912 return REG_EESCAPE;
1914 if (!(ctx->cflags & REG_ENHANCED))
1916 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1917 ctx->re++;
1918 goto unenhanced_backslash;
1921 /* If a macro is used, parse the expanded macro recursively. */
1923 tre_char_t buf[64];
1924 tre_expand_macro(ctx->re + 1, ctx->re_end,
1925 buf, elementsof(buf));
1926 if (buf[0] != 0)
1928 tre_parse_ctx_t subctx;
1929 memcpy(&subctx, ctx, sizeof(subctx));
1930 subctx.re = buf;
1931 subctx.len = tre_strlen(buf);
1932 subctx.nofirstsub = 1;
1933 status = tre_parse(&subctx);
1934 if (status != REG_OK)
1935 return status;
1936 ctx->re += 2;
1937 ctx->position = subctx.position;
1938 result = subctx.result;
1939 break;
1943 #ifdef REG_LITERAL
1944 if (*(ctx->re + 1) == L'Q')
1946 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1947 REST(ctx->re)));
1948 ctx->cflags |= REG_LITERAL;
1949 temporary_cflags |= REG_LITERAL;
1950 ctx->re += 2;
1951 STACK_PUSHX(stack, int, 0);
1952 STACK_PUSHX(stack, int, PARSE_ATOM);
1953 break;
1955 #endif /* REG_LITERAL */
1957 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1958 ctx->re++;
1959 switch (*ctx->re)
1961 case L'b':
1962 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1963 ASSERT_AT_WB, -1);
1964 ctx->re++;
1965 break;
1966 case L'B':
1967 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1968 ASSERT_AT_WB_NEG, -1);
1969 ctx->re++;
1970 break;
1971 case L'<':
1972 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1973 ASSERT_AT_BOW, -1);
1974 ctx->re++;
1975 break;
1976 case L'>':
1977 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1978 ASSERT_AT_EOW, -1);
1979 ctx->re++;
1980 break;
1981 case L'x':
1982 ctx->re++;
1983 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1985 /* 8 bit hex char. */
1986 char tmp[3] = {0, 0, 0};
1987 long val;
1988 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1989 REST(ctx->re - 2)));
1991 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1992 ctx->re < ctx->re_end)
1994 tmp[0] = (char)ctx->re[0];
1995 ctx->re++;
1997 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1998 ctx->re < ctx->re_end)
2000 tmp[1] = (char)ctx->re[0];
2001 ctx->re++;
2003 val = strtol(tmp, NULL, 16);
2004 result = tre_ast_new_literal(ctx->mem, (int)val,
2005 (int)val, ctx->position);
2006 ctx->position++;
2007 break;
2009 else if (ctx->re < ctx->re_end)
2011 /* Wide char. */
2012 char tmp[32];
2013 long val;
2014 int i = 0;
2015 ctx->re++;
2016 while (ctx->re_end - ctx->re >= 0)
2018 if (ctx->re[0] == CHAR_RBRACE)
2019 break;
2020 if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2022 tmp[i] = (char)ctx->re[0];
2023 i++;
2024 ctx->re++;
2025 continue;
2027 return REG_EBRACE;
2029 ctx->re++;
2030 tmp[i] = 0;
2031 val = strtol(tmp, NULL, 16);
2032 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2033 ctx->position);
2034 ctx->position++;
2035 break;
2037 /*FALLTHROUGH*/
2039 default:
2040 unenhanced_backslash:
2041 if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2042 REG_EXTENDED &&
2043 tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2045 /* Back reference (only in BRE or enhanced). */
2046 int val = *ctx->re - L'0';
2047 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
2048 REST(ctx->re - 1)));
2049 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2050 ctx->position);
2051 if (result == NULL)
2052 return REG_ESPACE;
2054 /* Set the backref with a submatch id in the invisible
2055 * range (which will be overridden if a real submatch
2056 * is needed) */
2057 result->submatch_id = ctx->submatch_id_invisible++;
2059 ctx->position++;
2060 ctx->num_reorder_tags++;
2061 ctx->max_backref = MAX(val, ctx->max_backref);
2062 ctx->re++;
2064 else
2066 /* Escaped character. */
2067 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
2068 REST(ctx->re - 1)));
2069 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2070 ctx->position);
2071 ctx->position++;
2072 ctx->re++;
2074 break;
2076 if (result == NULL)
2077 return REG_ESPACE;
2078 break;
2080 case CHAR_PERIOD: /* the any-symbol */
2081 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
2082 REST(ctx->re)));
2083 if (ctx->cflags & REG_NEWLINE)
2085 tre_ast_node_t *tmp1;
2086 tre_ast_node_t *tmp2;
2087 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2088 ctx->position);
2089 if (!tmp1)
2090 return REG_ESPACE;
2091 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2092 ctx->position + 1);
2093 if (!tmp2)
2094 return REG_ESPACE;
2095 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2096 if (!result)
2097 return REG_ESPACE;
2098 ctx->position += 2;
2100 else
2102 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2103 ctx->position);
2104 if (!result)
2105 return REG_ESPACE;
2106 ctx->position++;
2108 ctx->re++;
2109 break;
2111 case CHAR_CARET: /* beginning of line assertion */
2112 /* '^' has a special meaning everywhere in EREs, at the
2113 beginning of the RE and after \( is BREs. It is also
2114 special in enhanced BREs at the beginning of each branches
2115 of a union */
2116 if (ctx->cflags & REG_EXTENDED
2117 || bre_branch_begin
2118 || ctx->re == ctx->re_start)
2120 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
2121 REST(ctx->re)));
2122 result = tre_ast_new_literal(ctx->mem, ASSERTION,
2123 ASSERT_AT_BOL, -1);
2124 if (result == NULL)
2125 return REG_ESPACE;
2126 ctx->re++;
2128 else
2129 goto parse_literal;
2130 break;
2132 case CHAR_DOLLAR: /* end of line assertion. */
2133 /* '$' is special everywhere in EREs, and in the end of the
2134 string and before \) is BREs. */
2135 if (ctx->cflags & REG_EXTENDED
2136 || (ctx->re + 2 < ctx->re_end
2137 && *(ctx->re + 1) == CHAR_BACKSLASH
2138 && *(ctx->re + 2) == CHAR_RPAREN)
2139 || ctx->re + 1 == ctx->re_end)
2141 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
2142 REST(ctx->re)));
2143 result = tre_ast_new_literal(ctx->mem, ASSERTION,
2144 ASSERT_AT_EOL, -1);
2145 if (result == NULL)
2146 return REG_ESPACE;
2147 ctx->re++;
2149 else
2150 goto parse_literal;
2151 break;
2153 default:
2154 parse_literal:
2156 if (temporary_cflags && ctx->re + 1 < ctx->re_end
2157 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2159 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
2160 REST(ctx->re)));
2161 ctx->cflags &= ~temporary_cflags;
2162 temporary_cflags = 0;
2163 ctx->re += 2;
2164 if (ctx->re < ctx->re_end)
2166 STACK_PUSHX(stack, int, 0);
2167 STACK_PUSHX(stack, int, PARSE_ATOM);
2169 else
2171 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2172 if (!result) return REG_ESPACE;
2174 break;
2178 /* We are expecting an atom. If the subexpression (or the whole
2179 regexp ends here, we interpret it as an empty expression
2180 (which matches an empty string), which is an error.
2181 Iterations of an empty expression is also an error. */
2182 #ifdef REG_LITERAL
2183 if (!(ctx->cflags & REG_LITERAL))
2185 #endif /* REG_LITERAL */
2186 /* error on end of string */
2187 if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2188 : REG_EMPTY;
2189 /* error on unions and iterations of empty expressions */
2190 if (ctx->cflags & REG_EXTENDED)
2192 if (ctx->re < ctx->re_end)
2194 if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2195 if (*ctx->re == CHAR_LBRACE)
2197 ctx->re++;
2198 empty_parse_bound:
2199 /* We need to parse the bound first and return
2200 any error, before returning REG_BADRPT */
2201 status = tre_parse_bound(ctx, NULL);
2202 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2203 /* For ERE, if REG_NOMATCH is returned, we
2204 treat the lbrace as a literal. */
2205 if (status == REG_NOMATCH)
2207 ctx->re--;
2208 /* Drop down to literal-handling code */
2210 else
2212 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2213 if (status != REG_OK)
2214 return status;
2215 return REG_BADRPT;
2216 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2218 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2220 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2221 else
2222 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2223 if (*ctx->re == CHAR_STAR
2224 || *ctx->re == CHAR_PLUS
2225 || *ctx->re == CHAR_QUESTIONMARK)
2227 return REG_BADRPT;
2231 else if (ctx->re + 1 < ctx->re_end
2232 && *ctx->re == CHAR_BACKSLASH
2233 && *(ctx->re + 1) == CHAR_LBRACE)
2235 ctx->re += 2;
2236 goto empty_parse_bound;
2238 #ifdef REG_LITERAL
2240 #endif /* REG_LITERAL */
2242 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
2243 REST(ctx->re)));
2244 /* Note that we can't use an tre_isalpha() test here, since there
2245 may be characters which are alphabetic but neither upper or
2246 lower case. */
2247 if (ctx->cflags & REG_ICASE
2248 && (tre_isupper_l(*ctx->re, ctx->loc) ||
2249 tre_islower_l(*ctx->re, ctx->loc)))
2251 tre_ast_node_t *tmp1;
2252 tre_ast_node_t *tmp2;
2254 /* XXX - Can there be more than one opposite-case
2255 counterpoints for some character in some locale? Or
2256 more than two characters which all should be regarded
2257 the same character if case is ignored? If yes, there
2258 does not seem to be a portable way to detect it. I guess
2259 that at least for multi-character collating elements there
2260 could be several opposite-case counterpoints, but they
2261 cannot be supported portably anyway. */
2262 tmp1 = tre_ast_new_literal(ctx->mem,
2263 tre_toupper_l(*ctx->re, ctx->loc),
2264 tre_toupper_l(*ctx->re, ctx->loc),
2265 ctx->position);
2266 if (!tmp1)
2267 return REG_ESPACE;
2268 tmp2 = tre_ast_new_literal(ctx->mem,
2269 tre_tolower_l(*ctx->re, ctx->loc),
2270 tre_tolower_l(*ctx->re, ctx->loc),
2271 ctx->position);
2272 if (!tmp2)
2273 return REG_ESPACE;
2274 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2275 if (!result)
2276 return REG_ESPACE;
2278 else
2280 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2281 ctx->position);
2282 if (!result)
2283 return REG_ESPACE;
2285 ctx->position++;
2286 ctx->re++;
2287 break;
2289 break;
2292 case PARSE_MARK_FOR_SUBMATCH:
2294 int submatch_id = tre_stack_pop_int(stack);
2296 ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2297 if (result->submatch_id >= 0 &&
2298 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2300 tre_ast_node_t *n, *tmp_node;
2301 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2302 break;
2303 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2304 if (n == NULL)
2305 return REG_ESPACE;
2306 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2307 if (tmp_node == NULL)
2308 return REG_ESPACE;
2309 tmp_node->num_submatches = result->num_submatches;
2310 result = tmp_node;
2312 result->submatch_id = submatch_id;
2313 if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2314 result->num_submatches++;
2315 break;
2318 default:
2319 assert(0);
2320 break;
2324 /* Check for missing closing parentheses. */
2325 if (depth > 0)
2326 return REG_EPAREN;
2328 ctx->result = result;
2330 return REG_OK;
2333 /* EOF */