posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 {
 219   reg_errcode_t ret;
 220
 221   /* And GNU code determines whether or not to get register information
 222      by passing null for the REGS argument to re_match, etc., not by
 223      setting no_sub, unless RE_NO_SUB is set.  */
 224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 225
 226   /* Match anchors at newline.  */
 227   bufp->newline_anchor = 1;
 228
 229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 230
 231   if (!ret)
 232     return NULL;
 233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 234 }
 235 #ifdef _LIBC
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237 #endif
 238
 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 240    also be assigned to arbitrarily: each pattern buffer stores its own
 241    syntax, so it can be changed between regex compilations.  */
 242 /* This has no initializer because initialized variables in Emacs
 243    become read-only after dumping.  */
 244 reg_syntax_t re_syntax_options;
 245
 246
 247 /* Specify the precise syntax of regexps for compilation.  This provides
 248    for compatibility for various utilities which historically have
 249    different, incompatible syntaxes.
 250
 251    The argument SYNTAX is a bit mask comprised of the various bits
 252    defined in regex.h.  We return the old syntax.  */
 253
 254 reg_syntax_t
 255 re_set_syntax (syntax)
 256     reg_syntax_t syntax;
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (bufp)
 269     struct re_pattern_buffer *bufp;
 270 {
 271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 272   char *fastmap = bufp->fastmap;
 273
 274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 276   if (dfa->init_state != dfa->init_state_word)
 277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 278   if (dfa->init_state != dfa->init_state_nl)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 280   if (dfa->init_state != dfa->init_state_begbuf)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 282   bufp->fastmap_accurate = 1;
 283   return 0;
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 287 #endif
 288
 289 static inline void
 290 __attribute ((always_inline))
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 303                          char *fastmap)
 304 {
 305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 306   int node_cnt;
 307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 309     {
 310       int node = init_state->nodes.elems[node_cnt];
 311       re_token_type_t type = dfa->nodes[node].type;
 312
 313       if (type == CHARACTER)
 314         {
 315           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 316 #ifdef RE_ENABLE_I18N
 317           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 318             {
 319               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (mbrtowc (&wc, (const char *) buf, p - buf,
 331                            &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, 0, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           int i;
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 356               || cset->nranges || cset->nchar_classes)
 357             {
 358 # ifdef _LIBC
 359               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 360                 {
 361                   /* In this case we want to catch the bytes which are
 362                      the first byte of any collation elements.
 363                      e.g. In da_DK, we want to catch 'a' since "aa"
 364                           is a valid collation element, and don't catch
 365                           'b' since 'b' is the only collation element
 366                           which starts from 'b'.  */
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # else
 374               if (dfa->mb_cur_max > 1)
 375                 for (i = 0; i < SBC_MAX; ++i)
 376                   if (__btowc (i) == WEOF)
 377                     re_set_fastmap (fastmap, icase, i);
 378 # endif /* not _LIBC */
 379             }
 380           for (i = 0; i < cset->nmbchars; ++i)
 381             {
 382               char buf[256];
 383               mbstate_t state;
 384               memset (&state, '\0', sizeof (state));
 385               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 386                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 387               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 388                 {
 389                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 390                       != (size_t) -1)
 391                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 392                 }
 393             }
 394         }
 395 #endif /* RE_ENABLE_I18N */
 396       else if (type == OP_PERIOD
 397 #ifdef RE_ENABLE_I18N
 398                || type == OP_UTF8_PERIOD
 399 #endif /* RE_ENABLE_I18N */
 400                || type == END_OF_RE)
 401         {
 402           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 403           if (type == END_OF_RE)
 404             bufp->can_be_null = 1;
 405           return;
 406         }
 407     }
 408 }
 409 \f
 410 /* Entry point for POSIX code.  */
 411 /* regcomp takes a regular expression as a string and compiles it.
 412
 413    PREG is a regex_t *.  We do not expect any fields to be initialized,
 414    since POSIX says we shouldn't.  Thus, we set
 415
 416      `buffer' to the compiled pattern;
 417      `used' to the length of the compiled pattern;
 418      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 419        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 420        RE_SYNTAX_POSIX_BASIC;
 421      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 422      `fastmap' to an allocated space for the fastmap;
 423      `fastmap_accurate' to zero;
 424      `re_nsub' to the number of subexpressions in PATTERN.
 425
 426    PATTERN is the address of the pattern string.
 427
 428    CFLAGS is a series of bits which affect compilation.
 429
 430      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 431      use POSIX basic syntax.
 432
 433      If REG_NEWLINE is set, then . and [^...] don't match newline.
 434      Also, regexec will try a match beginning after every newline.
 435
 436      If REG_ICASE is set, then we considers upper- and lowercase
 437      versions of letters to be equivalent when matching.
 438
 439      If REG_NOSUB is set, then when PREG is passed to regexec, that
 440      routine will report only success or failure, and nothing about the
 441      registers.
 442
 443    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 444    the return codes and their meanings.)  */
 445
 446 int
 447 regcomp (preg, pattern, cflags)
 448     regex_t *__restrict preg;
 449     const char *__restrict pattern;
 450     int cflags;
 451 {
 452   reg_errcode_t ret;
 453   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 454                          : RE_SYNTAX_POSIX_BASIC);
 455
 456   preg->buffer = NULL;
 457   preg->allocated = 0;
 458   preg->used = 0;
 459
 460   /* Try to allocate space for the fastmap.  */
 461   preg->fastmap = re_malloc (char, SBC_MAX);
 462   if (BE (preg->fastmap == NULL, 0))
 463     return REG_ESPACE;
 464
 465   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 466
 467   /* If REG_NEWLINE is set, newlines are treated differently.  */
 468   if (cflags & REG_NEWLINE)
 469     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 470       syntax &= ~RE_DOT_NEWLINE;
 471       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 472       /* It also changes the matching behavior.  */
 473       preg->newline_anchor = 1;
 474     }
 475   else
 476     preg->newline_anchor = 0;
 477   preg->no_sub = !!(cflags & REG_NOSUB);
 478   preg->translate = NULL;
 479
 480   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 481
 482   /* POSIX doesn't distinguish between an unmatched open-group and an
 483      unmatched close-group: both are REG_EPAREN.  */
 484   if (ret == REG_ERPAREN)
 485     ret = REG_EPAREN;
 486
 487   /* We have already checked preg->fastmap != NULL.  */
 488   if (BE (ret == REG_NOERROR, 1))
 489     /* Compute the fastmap now, since regexec cannot modify the pattern
 490        buffer.  This function never fails in this implementation.  */
 491     (void) re_compile_fastmap (preg);
 492   else
 493     {
 494       /* Some error occurred while compiling the expression.  */
 495       re_free (preg->fastmap);
 496       preg->fastmap = NULL;
 497     }
 498
 499   return (int) ret;
 500 }
 501 #ifdef _LIBC
 502 weak_alias (__regcomp, regcomp)
 503 #endif
 504
 505 /* Returns a message corresponding to an error code, ERRCODE, returned
 506    from either regcomp or regexec.   We don't use PREG here.  */
 507
 508 size_t
 509 regerror (errcode, preg, errbuf, errbuf_size)
 510     int errcode;
 511     const regex_t *__restrict preg;
 512     char *__restrict errbuf;
 513     size_t errbuf_size;
 514 {
 515   const char *msg;
 516   size_t msg_size;
 517
 518   if (BE (errcode < 0
 519           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 520                                / sizeof (__re_error_msgid_idx[0])), 0))
 521     /* Only error codes returned by the rest of the code should be passed
 522        to this routine.  If we are given anything else, or if other regex
 523        code generates an invalid error code, then the program has a bug.
 524        Dump core so we can fix it.  */
 525     abort ();
 526
 527   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 528
 529   msg_size = strlen (msg) + 1; /* Includes the null.  */
 530
 531   if (BE (errbuf_size != 0, 1))
 532     {
 533       if (BE (msg_size > errbuf_size, 0))
 534         {
 535 #if defined HAVE_MEMPCPY || defined _LIBC
 536           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 537 #else
 538           memcpy (errbuf, msg, errbuf_size - 1);
 539           errbuf[errbuf_size - 1] = 0;
 540 #endif
 541         }
 542       else
 543         memcpy (errbuf, msg, msg_size);
 544     }
 545
 546   return msg_size;
 547 }
 548 #ifdef _LIBC
 549 weak_alias (__regerror, regerror)
 550 #endif
 551
 552
 553 #ifdef RE_ENABLE_I18N
 554 /* This static array is used for the map to single-byte characters when
 555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 556    it the same all the time.  UTF-8 is the preferred encoding so this is
 557    a worthwhile optimization.  */
 558 static const bitset_t utf8_sb_map =
 559 {
 560   /* Set the first 128 bits.  */
 561   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 562 };
 563 #endif
 564
 565
 566 static void
 567 free_dfa_content (re_dfa_t *dfa)
 568 {
 569   int i, j;
 570
 571   if (dfa->nodes)
 572     for (i = 0; i < dfa->nodes_len; ++i)
 573       free_token (dfa->nodes + i);
 574   re_free (dfa->nexts);
 575   for (i = 0; i < dfa->nodes_len; ++i)
 576     {
 577       if (dfa->eclosures != NULL)
 578         re_node_set_free (dfa->eclosures + i);
 579       if (dfa->inveclosures != NULL)
 580         re_node_set_free (dfa->inveclosures + i);
 581       if (dfa->edests != NULL)
 582         re_node_set_free (dfa->edests + i);
 583     }
 584   re_free (dfa->edests);
 585   re_free (dfa->eclosures);
 586   re_free (dfa->inveclosures);
 587   re_free (dfa->nodes);
 588
 589   if (dfa->state_table)
 590     for (i = 0; i <= dfa->state_hash_mask; ++i)
 591       {
 592         struct re_state_table_entry *entry = dfa->state_table + i;
 593         for (j = 0; j < entry->num; ++j)
 594           {
 595             re_dfastate_t *state = entry->array[j];
 596             free_state (state);
 597           }
 598         re_free (entry->array);
 599       }
 600   re_free (dfa->state_table);
 601 #ifdef RE_ENABLE_I18N
 602   if (dfa->sb_char != utf8_sb_map)
 603     re_free (dfa->sb_char);
 604 #endif
 605   re_free (dfa->subexp_map);
 606 #ifdef DEBUG
 607   re_free (dfa->re_str);
 608 #endif
 609
 610   re_free (dfa);
 611 }
 612
 613
 614 /* Free dynamically allocated space used by PREG.  */
 615
 616 void
 617 regfree (preg)
 618     regex_t *preg;
 619 {
 620   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 621   if (BE (dfa != NULL, 1))
 622     free_dfa_content (dfa);
 623   preg->buffer = NULL;
 624   preg->allocated = 0;
 625
 626   re_free (preg->fastmap);
 627   preg->fastmap = NULL;
 628
 629   re_free (preg->translate);
 630   preg->translate = NULL;
 631 }
 632 #ifdef _LIBC
 633 weak_alias (__regfree, regfree)
 634 #endif
 635 \f
 636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 637    them unless specifically requested.  */
 638
 639 #if defined _REGEX_RE_COMP || defined _LIBC
 640
 641 /* BSD has one and only one pattern buffer.  */
 642 static struct re_pattern_buffer re_comp_buf;
 643
 644 char *
 645 # ifdef _LIBC
 646 /* Make these definitions weak in libc, so POSIX programs can redefine
 647    these names if they don't use our functions, and still use
 648    regcomp/regexec above without link errors.  */
 649 weak_function
 650 # endif
 651 re_comp (s)
 652      const char *s;
 653 {
 654   reg_errcode_t ret;
 655   char *fastmap;
 656
 657   if (!s)
 658     {
 659       if (!re_comp_buf.buffer)
 660         return gettext ("No previous regular expression");
 661       return 0;
 662     }
 663
 664   if (re_comp_buf.buffer)
 665     {
 666       fastmap = re_comp_buf.fastmap;
 667       re_comp_buf.fastmap = NULL;
 668       __regfree (&re_comp_buf);
 669       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 670       re_comp_buf.fastmap = fastmap;
 671     }
 672
 673   if (re_comp_buf.fastmap == NULL)
 674     {
 675       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 676       if (re_comp_buf.fastmap == NULL)
 677         return (char *) gettext (__re_error_msgid
 678                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 679     }
 680
 681   /* Since `re_exec' always passes NULL for the `regs' argument, we
 682      don't need to initialize the pattern buffer fields which affect it.  */
 683
 684   /* Match anchors at newlines.  */
 685   re_comp_buf.newline_anchor = 1;
 686
 687   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 688
 689   if (!ret)
 690     return NULL;
 691
 692   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 693   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 694 }
 695
 696 #ifdef _LIBC
 697 libc_freeres_fn (free_mem)
 698 {
 699   __regfree (&re_comp_buf);
 700 }
 701 #endif
 702
 703 #endif /* _REGEX_RE_COMP */
 704 \f
 705 /* Internal entry point.
 706    Compile the regular expression PATTERN, whose length is LENGTH.
 707    SYNTAX indicate regular expression's syntax.  */
 708
 709 static reg_errcode_t
 710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 711                      reg_syntax_t syntax)
 712 {
 713   reg_errcode_t err = REG_NOERROR;
 714   re_dfa_t *dfa;
 715   re_string_t regexp;
 716
 717   /* Initialize the pattern buffer.  */
 718   preg->fastmap_accurate = 0;
 719   preg->syntax = syntax;
 720   preg->not_bol = preg->not_eol = 0;
 721   preg->used = 0;
 722   preg->re_nsub = 0;
 723   preg->can_be_null = 0;
 724   preg->regs_allocated = REGS_UNALLOCATED;
 725
 726   /* Initialize the dfa.  */
 727   dfa = (re_dfa_t *) preg->buffer;
 728   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 729     {
 730       /* If zero allocated, but buffer is non-null, try to realloc
 731          enough space.  This loses if buffer's address is bogus, but
 732          that is the user's responsibility.  If ->buffer is NULL this
 733          is a simple allocation.  */
 734       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 735       if (dfa == NULL)
 736         return REG_ESPACE;
 737       preg->allocated = sizeof (re_dfa_t);
 738       preg->buffer = (unsigned char *) dfa;
 739     }
 740   preg->used = sizeof (re_dfa_t);
 741
 742   err = init_dfa (dfa, length);
 743   if (BE (err != REG_NOERROR, 0))
 744     {
 745       free_dfa_content (dfa);
 746       preg->buffer = NULL;
 747       preg->allocated = 0;
 748       return err;
 749     }
 750 #ifdef DEBUG
 751   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 752   dfa->re_str = re_malloc (char, length + 1);
 753   strncpy (dfa->re_str, pattern, length + 1);
 754 #endif
 755
 756   __libc_lock_init (dfa->lock);
 757
 758   err = re_string_construct (&regexp, pattern, length, preg->translate,
 759                              syntax & RE_ICASE, dfa);
 760   if (BE (err != REG_NOERROR, 0))
 761     {
 762     re_compile_internal_free_return:
 763       free_workarea_compile (preg);
 764       re_string_destruct (&regexp);
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770
 771   /* Parse the regular expression, and build a structure tree.  */
 772   preg->re_nsub = 0;
 773   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 774   if (BE (dfa->str_tree == NULL, 0))
 775     goto re_compile_internal_free_return;
 776
 777   /* Analyze the tree and create the nfa.  */
 778   err = analyze (preg);
 779   if (BE (err != REG_NOERROR, 0))
 780     goto re_compile_internal_free_return;
 781
 782 #ifdef RE_ENABLE_I18N
 783   /* If possible, do searching in single byte encoding to speed things up.  */
 784   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 785     optimize_utf8 (dfa);
 786 #endif
 787
 788   /* Then create the initial state of the dfa.  */
 789   err = create_initial_state (dfa);
 790
 791   /* Release work areas.  */
 792   free_workarea_compile (preg);
 793   re_string_destruct (&regexp);
 794
 795   if (BE (err != REG_NOERROR, 0))
 796     {
 797       free_dfa_content (dfa);
 798       preg->buffer = NULL;
 799       preg->allocated = 0;
 800     }
 801
 802   return err;
 803 }
 804
 805 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 806    as the initial length of some arrays.  */
 807
 808 static reg_errcode_t
 809 init_dfa (re_dfa_t *dfa, size_t pat_len)
 810 {
 811   unsigned int table_size;
 812 #ifndef _LIBC
 813   char *codeset_name;
 814 #endif
 815
 816   memset (dfa, '\0', sizeof (re_dfa_t));
 817
 818   /* Force allocation of str_tree_storage the first time.  */
 819   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 820
 821   /* Avoid overflows.  */
 822   if (pat_len == SIZE_MAX)
 823     return REG_ESPACE;
 824
 825   dfa->nodes_alloc = pat_len + 1;
 826   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 827
 828   /*  table_size = 2 ^ ceil(log pat_len) */
 829   for (table_size = 1; ; table_size <<= 1)
 830     if (table_size > pat_len)
 831       break;
 832
 833   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 834   dfa->state_hash_mask = table_size - 1;
 835
 836   dfa->mb_cur_max = MB_CUR_MAX;
 837 #ifdef _LIBC
 838   if (dfa->mb_cur_max == 6
 839       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 840     dfa->is_utf8 = 1;
 841   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 842                        != 0);
 843 #else
 844 # ifdef HAVE_LANGINFO_CODESET
 845   codeset_name = nl_langinfo (CODESET);
 846 # else
 847   codeset_name = getenv ("LC_ALL");
 848   if (codeset_name == NULL || codeset_name[0] == '\0')
 849     codeset_name = getenv ("LC_CTYPE");
 850   if (codeset_name == NULL || codeset_name[0] == '\0')
 851     codeset_name = getenv ("LANG");
 852   if (codeset_name == NULL)
 853     codeset_name = "";
 854   else if (strchr (codeset_name, '.') !=  NULL)
 855     codeset_name = strchr (codeset_name, '.') + 1;
 856 # endif
 857
 858   if (strcasecmp (codeset_name, "UTF-8") == 0
 859       || strcasecmp (codeset_name, "UTF8") == 0)
 860     dfa->is_utf8 = 1;
 861
 862   /* We check exhaustively in the loop below if this charset is a
 863      superset of ASCII.  */
 864   dfa->map_notascii = 0;
 865 #endif
 866
 867 #ifdef RE_ENABLE_I18N
 868   if (dfa->mb_cur_max > 1)
 869     {
 870       if (dfa->is_utf8)
 871         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 872       else
 873         {
 874           int i, j, ch;
 875
 876           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 877           if (BE (dfa->sb_char == NULL, 0))
 878             return REG_ESPACE;
 879
 880           /* Set the bits corresponding to single byte chars.  */
 881           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 882             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 883               {
 884                 wint_t wch = __btowc (ch);
 885                 if (wch != WEOF)
 886                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 887 # ifndef _LIBC
 888                 if (isascii (ch) && wch != ch)
 889                   dfa->map_notascii = 1;
 890 # endif
 891               }
 892         }
 893     }
 894 #endif
 895
 896   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 897     return REG_ESPACE;
 898   return REG_NOERROR;
 899 }
 900
 901 /* Initialize WORD_CHAR table, which indicate which character is
 902    "word".  In this case "word" means that it is the word construction
 903    character used by some operators like "\<", "\>", etc.  */
 904
 905 static void
 906 internal_function
 907 init_word_char (re_dfa_t *dfa)
 908 {
 909   int i, j, ch;
 910   dfa->word_ops_used = 1;
 911   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 912     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 913       if (isalnum (ch) || ch == '_')
 914         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 915 }
 916
 917 /* Free the work area which are only used while compiling.  */
 918
 919 static void
 920 free_workarea_compile (regex_t *preg)
 921 {
 922   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 923   bin_tree_storage_t *storage, *next;
 924   for (storage = dfa->str_tree_storage; storage; storage = next)
 925     {
 926       next = storage->next;
 927       re_free (storage);
 928     }
 929   dfa->str_tree_storage = NULL;
 930   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 931   dfa->str_tree = NULL;
 932   re_free (dfa->org_indices);
 933   dfa->org_indices = NULL;
 934 }
 935
 936 /* Create initial states for all contexts.  */
 937
 938 static reg_errcode_t
 939 create_initial_state (re_dfa_t *dfa)
 940 {
 941   int first, i;
 942   reg_errcode_t err;
 943   re_node_set init_nodes;
 944
 945   /* Initial states have the epsilon closure of the node which is
 946      the first node of the regular expression.  */
 947   first = dfa->str_tree->first->node_idx;
 948   dfa->init_node = first;
 949   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 950   if (BE (err != REG_NOERROR, 0))
 951     return err;
 952
 953   /* The back-references which are in initial states can epsilon transit,
 954      since in this case all of the subexpressions can be null.
 955      Then we add epsilon closures of the nodes which are the next nodes of
 956      the back-references.  */
 957   if (dfa->nbackref > 0)
 958     for (i = 0; i < init_nodes.nelem; ++i)
 959       {
 960         int node_idx = init_nodes.elems[i];
 961         re_token_type_t type = dfa->nodes[node_idx].type;
 962
 963         int clexp_idx;
 964         if (type != OP_BACK_REF)
 965           continue;
 966         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 967           {
 968             re_token_t *clexp_node;
 969             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 970             if (clexp_node->type == OP_CLOSE_SUBEXP
 971                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 972               break;
 973           }
 974         if (clexp_idx == init_nodes.nelem)
 975           continue;
 976
 977         if (type == OP_BACK_REF)
 978           {
 979             int dest_idx = dfa->edests[node_idx].elems[0];
 980             if (!re_node_set_contains (&init_nodes, dest_idx))
 981               {
 982                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 983                 i = 0;
 984               }
 985           }
 986       }
 987
 988   /* It must be the first time to invoke acquire_state.  */
 989   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 990   /* We don't check ERR here, since the initial state must not be NULL.  */
 991   if (BE (dfa->init_state == NULL, 0))
 992     return err;
 993   if (dfa->init_state->has_constraint)
 994     {
 995       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 996                                                        CONTEXT_WORD);
 997       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 998                                                      CONTEXT_NEWLINE);
 999       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1000                                                          &init_nodes,
1001                                                          CONTEXT_NEWLINE
1002                                                          | CONTEXT_BEGBUF);
1003       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1004               || dfa->init_state_begbuf == NULL, 0))
1005         return err;
1006     }
1007   else
1008     dfa->init_state_word = dfa->init_state_nl
1009       = dfa->init_state_begbuf = dfa->init_state;
1010
1011   re_node_set_free (&init_nodes);
1012   return REG_NOERROR;
1013 }
1014 \f
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018    DFA nodes where needed.  */
1019
1020 static void
1021 optimize_utf8 (re_dfa_t *dfa)
1022 {
1023   int node, i, mb_chars = 0, has_period = 0;
1024
1025   for (node = 0; node < dfa->nodes_len; ++node)
1026     switch (dfa->nodes[node].type)
1027       {
1028       case CHARACTER:
1029         if (dfa->nodes[node].opr.c >= 0x80)
1030           mb_chars = 1;
1031         break;
1032       case ANCHOR:
1033         switch (dfa->nodes[node].opr.ctx_type)
1034           {
1035           case LINE_FIRST:
1036           case LINE_LAST:
1037           case BUF_FIRST:
1038           case BUF_LAST:
1039             break;
1040           default:
1041             /* Word anchors etc. cannot be handled.  It's okay to test
1042                opr.ctx_type since constraints (for all DFA nodes) are
1043                created by ORing one or more opr.ctx_type values.  */
1044             return;
1045           }
1046         break;
1047       case OP_PERIOD:
1048         has_period = 1;
1049         break;
1050       case OP_BACK_REF:
1051       case OP_ALT:
1052       case END_OF_RE:
1053       case OP_DUP_ASTERISK:
1054       case OP_OPEN_SUBEXP:
1055       case OP_CLOSE_SUBEXP:
1056         break;
1057       case COMPLEX_BRACKET:
1058         return;
1059       case SIMPLE_BRACKET:
1060         /* Just double check.  The non-ASCII range starts at 0x80.  */
1061         assert (0x80 % BITSET_WORD_BITS == 0);
1062         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1063           if (dfa->nodes[node].opr.sbcset[i])
1064             return;
1065         break;
1066       default:
1067         abort ();
1068       }
1069
1070   if (mb_chars || has_period)
1071     for (node = 0; node < dfa->nodes_len; ++node)
1072       {
1073         if (dfa->nodes[node].type == CHARACTER
1074             && dfa->nodes[node].opr.c >= 0x80)
1075           dfa->nodes[node].mb_partial = 0;
1076         else if (dfa->nodes[node].type == OP_PERIOD)
1077           dfa->nodes[node].type = OP_UTF8_PERIOD;
1078       }
1079
1080   /* The search can be in single byte locale.  */
1081   dfa->mb_cur_max = 1;
1082   dfa->is_utf8 = 0;
1083   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1084 }
1085 #endif
1086 \f
1087 /* Analyze the structure tree, and calculate "first", "next", "edest",
1088    "eclosure", and "inveclosure".  */
1089
1090 static reg_errcode_t
1091 analyze (regex_t *preg)
1092 {
1093   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1094   reg_errcode_t ret;
1095
1096   /* Allocate arrays.  */
1097   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1098   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1099   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1100   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1101   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1102           || dfa->eclosures == NULL, 0))
1103     return REG_ESPACE;
1104
1105   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1106   if (dfa->subexp_map != NULL)
1107     {
1108       int i;
1109       for (i = 0; i < preg->re_nsub; i++)
1110         dfa->subexp_map[i] = i;
1111       preorder (dfa->str_tree, optimize_subexps, dfa);
1112       for (i = 0; i < preg->re_nsub; i++)
1113         if (dfa->subexp_map[i] != i)
1114           break;
1115       if (i == preg->re_nsub)
1116         {
1117           free (dfa->subexp_map);
1118           dfa->subexp_map = NULL;
1119         }
1120     }
1121
1122   ret = postorder (dfa->str_tree, lower_subexps, preg);
1123   if (BE (ret != REG_NOERROR, 0))
1124     return ret;
1125   ret = postorder (dfa->str_tree, calc_first, dfa);
1126   if (BE (ret != REG_NOERROR, 0))
1127     return ret;
1128   preorder (dfa->str_tree, calc_next, dfa);
1129   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1130   if (BE (ret != REG_NOERROR, 0))
1131     return ret;
1132   ret = calc_eclosure (dfa);
1133   if (BE (ret != REG_NOERROR, 0))
1134     return ret;
1135
1136   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1137      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1138   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1139       || dfa->nbackref)
1140     {
1141       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1142       if (BE (dfa->inveclosures == NULL, 0))
1143         return REG_ESPACE;
1144       ret = calc_inveclosure (dfa);
1145     }
1146
1147   return ret;
1148 }
1149
1150 /* Our parse trees are very unbalanced, so we cannot use a stack to
1151    implement parse tree visits.  Instead, we use parent pointers and
1152    some hairy code in these two functions.  */
1153 static reg_errcode_t
1154 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1155            void *extra)
1156 {
1157   bin_tree_t *node, *prev;
1158
1159   for (node = root; ; )
1160     {
1161       /* Descend down the tree, preferably to the left (or to the right
1162          if that's the only child).  */
1163       while (node->left || node->right)
1164         if (node->left)
1165           node = node->left;
1166         else
1167           node = node->right;
1168
1169       do
1170         {
1171           reg_errcode_t err = fn (extra, node);
1172           if (BE (err != REG_NOERROR, 0))
1173             return err;
1174           if (node->parent == NULL)
1175             return REG_NOERROR;
1176           prev = node;
1177           node = node->parent;
1178         }
1179       /* Go up while we have a node that is reached from the right.  */
1180       while (node->right == prev || node->right == NULL);
1181       node = node->right;
1182     }
1183 }
1184
1185 static reg_errcode_t
1186 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1187           void *extra)
1188 {
1189   bin_tree_t *node;
1190
1191   for (node = root; ; )
1192     {
1193       reg_errcode_t err = fn (extra, node);
1194       if (BE (err != REG_NOERROR, 0))
1195         return err;
1196
1197       /* Go to the left node, or up and to the right.  */
1198       if (node->left)
1199         node = node->left;
1200       else
1201         {
1202           bin_tree_t *prev = NULL;
1203           while (node->right == prev || node->right == NULL)
1204             {
1205               prev = node;
1206               node = node->parent;
1207               if (!node)
1208                 return REG_NOERROR;
1209             }
1210           node = node->right;
1211         }
1212     }
1213 }
1214
1215 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1216    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1217    backreferences as well.  Requires a preorder visit.  */
1218 static reg_errcode_t
1219 optimize_subexps (void *extra, bin_tree_t *node)
1220 {
1221   re_dfa_t *dfa = (re_dfa_t *) extra;
1222
1223   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1224     {
1225       int idx = node->token.opr.idx;
1226       node->token.opr.idx = dfa->subexp_map[idx];
1227       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1228     }
1229
1230   else if (node->token.type == SUBEXP
1231            && node->left && node->left->token.type == SUBEXP)
1232     {
1233       int other_idx = node->left->token.opr.idx;
1234
1235       node->left = node->left->left;
1236       if (node->left)
1237         node->left->parent = node;
1238
1239       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1240       if (other_idx < BITSET_WORD_BITS)
1241           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1242     }
1243
1244   return REG_NOERROR;
1245 }
1246
1247 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1248    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1249 static reg_errcode_t
1250 lower_subexps (void *extra, bin_tree_t *node)
1251 {
1252   regex_t *preg = (regex_t *) extra;
1253   reg_errcode_t err = REG_NOERROR;
1254
1255   if (node->left && node->left->token.type == SUBEXP)
1256     {
1257       node->left = lower_subexp (&err, preg, node->left);
1258       if (node->left)
1259         node->left->parent = node;
1260     }
1261   if (node->right && node->right->token.type == SUBEXP)
1262     {
1263       node->right = lower_subexp (&err, preg, node->right);
1264       if (node->right)
1265         node->right->parent = node;
1266     }
1267
1268   return err;
1269 }
1270
1271 static bin_tree_t *
1272 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1273 {
1274   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1275   bin_tree_t *body = node->left;
1276   bin_tree_t *op, *cls, *tree1, *tree;
1277
1278   if (preg->no_sub
1279       /* We do not optimize empty subexpressions, because otherwise we may
1280          have bad CONCAT nodes with NULL children.  This is obviously not
1281          very common, so we do not lose much.  An example that triggers
1282          this case is the sed "script" /\(\)/x.  */
1283       && node->left != NULL
1284       && (node->token.opr.idx >= BITSET_WORD_BITS
1285           || !(dfa->used_bkref_map
1286                & ((bitset_word_t) 1 << node->token.opr.idx))))
1287     return node->left;
1288
1289   /* Convert the SUBEXP node to the concatenation of an
1290      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1291   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1292   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1293   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1294   tree = create_tree (dfa, op, tree1, CONCAT);
1295   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1296     {
1297       *err = REG_ESPACE;
1298       return NULL;
1299     }
1300
1301   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1302   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1303   return tree;
1304 }
1305
1306 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1307    nodes.  Requires a postorder visit.  */
1308 static reg_errcode_t
1309 calc_first (void *extra, bin_tree_t *node)
1310 {
1311   re_dfa_t *dfa = (re_dfa_t *) extra;
1312   if (node->token.type == CONCAT)
1313     {
1314       node->first = node->left->first;
1315       node->node_idx = node->left->node_idx;
1316     }
1317   else
1318     {
1319       node->first = node;
1320       node->node_idx = re_dfa_add_node (dfa, node->token);
1321       if (BE (node->node_idx == -1, 0))
1322         return REG_ESPACE;
1323       if (node->token.type == ANCHOR)
1324         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1325     }
1326   return REG_NOERROR;
1327 }
1328
1329 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1330 static reg_errcode_t
1331 calc_next (void *extra, bin_tree_t *node)
1332 {
1333   switch (node->token.type)
1334     {
1335     case OP_DUP_ASTERISK:
1336       node->left->next = node;
1337       break;
1338     case CONCAT:
1339       node->left->next = node->right->first;
1340       node->right->next = node->next;
1341       break;
1342     default:
1343       if (node->left)
1344         node->left->next = node->next;
1345       if (node->right)
1346         node->right->next = node->next;
1347       break;
1348     }
1349   return REG_NOERROR;
1350 }
1351
1352 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1353 static reg_errcode_t
1354 link_nfa_nodes (void *extra, bin_tree_t *node)
1355 {
1356   re_dfa_t *dfa = (re_dfa_t *) extra;
1357   int idx = node->node_idx;
1358   reg_errcode_t err = REG_NOERROR;
1359
1360   switch (node->token.type)
1361     {
1362     case CONCAT:
1363       break;
1364
1365     case END_OF_RE:
1366       assert (node->next == NULL);
1367       break;
1368
1369     case OP_DUP_ASTERISK:
1370     case OP_ALT:
1371       {
1372         int left, right;
1373         dfa->has_plural_match = 1;
1374         if (node->left != NULL)
1375           left = node->left->first->node_idx;
1376         else
1377           left = node->next->node_idx;
1378         if (node->right != NULL)
1379           right = node->right->first->node_idx;
1380         else
1381           right = node->next->node_idx;
1382         assert (left > -1);
1383         assert (right > -1);
1384         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1385       }
1386       break;
1387
1388     case ANCHOR:
1389     case OP_OPEN_SUBEXP:
1390     case OP_CLOSE_SUBEXP:
1391       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1392       break;
1393
1394     case OP_BACK_REF:
1395       dfa->nexts[idx] = node->next->node_idx;
1396       if (node->token.type == OP_BACK_REF)
1397         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1398       break;
1399
1400     default:
1401       assert (!IS_EPSILON_NODE (node->token.type));
1402       dfa->nexts[idx] = node->next->node_idx;
1403       break;
1404     }
1405
1406   return err;
1407 }
1408
1409 /* Duplicate the epsilon closure of the node ROOT_NODE.
1410    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1411    to their own constraint.  */
1412
1413 static reg_errcode_t
1414 internal_function
1415 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1416                         int root_node, unsigned int init_constraint)
1417 {
1418   int org_node, clone_node, ret;
1419   unsigned int constraint = init_constraint;
1420   for (org_node = top_org_node, clone_node = top_clone_node;;)
1421     {
1422       int org_dest, clone_dest;
1423       if (dfa->nodes[org_node].type == OP_BACK_REF)
1424         {
1425           /* If the back reference epsilon-transit, its destination must
1426              also have the constraint.  Then duplicate the epsilon closure
1427              of the destination of the back reference, and store it in
1428              edests of the back reference.  */
1429           org_dest = dfa->nexts[org_node];
1430           re_node_set_empty (dfa->edests + clone_node);
1431           clone_dest = duplicate_node (dfa, org_dest, constraint);
1432           if (BE (clone_dest == -1, 0))
1433             return REG_ESPACE;
1434           dfa->nexts[clone_node] = dfa->nexts[org_node];
1435           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436           if (BE (ret < 0, 0))
1437             return REG_ESPACE;
1438         }
1439       else if (dfa->edests[org_node].nelem == 0)
1440         {
1441           /* In case of the node can't epsilon-transit, don't duplicate the
1442              destination and store the original destination as the
1443              destination of the node.  */
1444           dfa->nexts[clone_node] = dfa->nexts[org_node];
1445           break;
1446         }
1447       else if (dfa->edests[org_node].nelem == 1)
1448         {
1449           /* In case of the node can epsilon-transit, and it has only one
1450              destination.  */
1451           org_dest = dfa->edests[org_node].elems[0];
1452           re_node_set_empty (dfa->edests + clone_node);
1453           /* If the node is root_node itself, it means the epsilon clsoure
1454              has a loop.   Then tie it to the destination of the root_node.  */
1455           if (org_node == root_node && clone_node != org_node)
1456             {
1457               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1458               if (BE (ret < 0, 0))
1459                 return REG_ESPACE;
1460               break;
1461             }
1462           /* In case of the node has another constraint, add it.  */
1463           constraint |= dfa->nodes[org_node].constraint;
1464           clone_dest = duplicate_node (dfa, org_dest, constraint);
1465           if (BE (clone_dest == -1, 0))
1466             return REG_ESPACE;
1467           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1468           if (BE (ret < 0, 0))
1469             return REG_ESPACE;
1470         }
1471       else /* dfa->edests[org_node].nelem == 2 */
1472         {
1473           /* In case of the node can epsilon-transit, and it has two
1474              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1475           org_dest = dfa->edests[org_node].elems[0];
1476           re_node_set_empty (dfa->edests + clone_node);
1477           /* Search for a duplicated node which satisfies the constraint.  */
1478           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1479           if (clone_dest == -1)
1480             {
1481               /* There is no such duplicated node, create a new one.  */
1482               reg_errcode_t err;
1483               clone_dest = duplicate_node (dfa, org_dest, constraint);
1484               if (BE (clone_dest == -1, 0))
1485                 return REG_ESPACE;
1486               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487               if (BE (ret < 0, 0))
1488                 return REG_ESPACE;
1489               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1490                                             root_node, constraint);
1491               if (BE (err != REG_NOERROR, 0))
1492                 return err;
1493             }
1494           else
1495             {
1496               /* There is a duplicated node which satisfies the constraint,
1497                  use it to avoid infinite loop.  */
1498               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1499               if (BE (ret < 0, 0))
1500                 return REG_ESPACE;
1501             }
1502
1503           org_dest = dfa->edests[org_node].elems[1];
1504           clone_dest = duplicate_node (dfa, org_dest, constraint);
1505           if (BE (clone_dest == -1, 0))
1506             return REG_ESPACE;
1507           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508           if (BE (ret < 0, 0))
1509             return REG_ESPACE;
1510         }
1511       org_node = org_dest;
1512       clone_node = clone_dest;
1513     }
1514   return REG_NOERROR;
1515 }
1516
1517 /* Search for a node which is duplicated from the node ORG_NODE, and
1518    satisfies the constraint CONSTRAINT.  */
1519
1520 static int
1521 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1522                         unsigned int constraint)
1523 {
1524   int idx;
1525   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1526     {
1527       if (org_node == dfa->org_indices[idx]
1528           && constraint == dfa->nodes[idx].constraint)
1529         return idx; /* Found.  */
1530     }
1531   return -1; /* Not found.  */
1532 }
1533
1534 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1535    Return the index of the new node, or -1 if insufficient storage is
1536    available.  */
1537
1538 static int
1539 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1540 {
1541   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1542   if (BE (dup_idx != -1, 1))
1543     {
1544       dfa->nodes[dup_idx].constraint = constraint;
1545       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1546       dfa->nodes[dup_idx].duplicated = 1;
1547
1548       /* Store the index of the original node.  */
1549       dfa->org_indices[dup_idx] = org_idx;
1550     }
1551   return dup_idx;
1552 }
1553
1554 static reg_errcode_t
1555 calc_inveclosure (re_dfa_t *dfa)
1556 {
1557   int src, idx, ret;
1558   for (idx = 0; idx < dfa->nodes_len; ++idx)
1559     re_node_set_init_empty (dfa->inveclosures + idx);
1560
1561   for (src = 0; src < dfa->nodes_len; ++src)
1562     {
1563       int *elems = dfa->eclosures[src].elems;
1564       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1565         {
1566           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1567           if (BE (ret == -1, 0))
1568             return REG_ESPACE;
1569         }
1570     }
1571
1572   return REG_NOERROR;
1573 }
1574
1575 /* Calculate "eclosure" for all the node in DFA.  */
1576
1577 static reg_errcode_t
1578 calc_eclosure (re_dfa_t *dfa)
1579 {
1580   int node_idx, incomplete;
1581 #ifdef DEBUG
1582   assert (dfa->nodes_len > 0);
1583 #endif
1584   incomplete = 0;
1585   /* For each nodes, calculate epsilon closure.  */
1586   for (node_idx = 0; ; ++node_idx)
1587     {
1588       reg_errcode_t err;
1589       re_node_set eclosure_elem;
1590       if (node_idx == dfa->nodes_len)
1591         {
1592           if (!incomplete)
1593             break;
1594           incomplete = 0;
1595           node_idx = 0;
1596         }
1597
1598 #ifdef DEBUG
1599       assert (dfa->eclosures[node_idx].nelem != -1);
1600 #endif
1601
1602       /* If we have already calculated, skip it.  */
1603       if (dfa->eclosures[node_idx].nelem != 0)
1604         continue;
1605       /* Calculate epsilon closure of `node_idx'.  */
1606       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1607       if (BE (err != REG_NOERROR, 0))
1608         return err;
1609
1610       if (dfa->eclosures[node_idx].nelem == 0)
1611         {
1612           incomplete = 1;
1613           re_node_set_free (&eclosure_elem);
1614         }
1615     }
1616   return REG_NOERROR;
1617 }
1618
1619 /* Calculate epsilon closure of NODE.  */
1620
1621 static reg_errcode_t
1622 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1623 {
1624   reg_errcode_t err;
1625   int i, incomplete;
1626   re_node_set eclosure;
1627   incomplete = 0;
1628   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1629   if (BE (err != REG_NOERROR, 0))
1630     return err;
1631
1632   /* This indicates that we are calculating this node now.
1633      We reference this value to avoid infinite loop.  */
1634   dfa->eclosures[node].nelem = -1;
1635
1636   /* If the current node has constraints, duplicate all nodes
1637      since they must inherit the constraints.  */
1638   if (dfa->nodes[node].constraint
1639       && dfa->edests[node].nelem
1640       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1641     {
1642       err = duplicate_node_closure (dfa, node, node, node,
1643                                     dfa->nodes[node].constraint);
1644       if (BE (err != REG_NOERROR, 0))
1645         return err;
1646     }
1647
1648   /* Expand each epsilon destination nodes.  */
1649   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1650     for (i = 0; i < dfa->edests[node].nelem; ++i)
1651       {
1652         re_node_set eclosure_elem;
1653         int edest = dfa->edests[node].elems[i];
1654         /* If calculating the epsilon closure of `edest' is in progress,
1655            return intermediate result.  */
1656         if (dfa->eclosures[edest].nelem == -1)
1657           {
1658             incomplete = 1;
1659             continue;
1660           }
1661         /* If we haven't calculated the epsilon closure of `edest' yet,
1662            calculate now. Otherwise use calculated epsilon closure.  */
1663         if (dfa->eclosures[edest].nelem == 0)
1664           {
1665             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1666             if (BE (err != REG_NOERROR, 0))
1667               return err;
1668           }
1669         else
1670           eclosure_elem = dfa->eclosures[edest];
1671         /* Merge the epsilon closure of `edest'.  */
1672         re_node_set_merge (&eclosure, &eclosure_elem);
1673         /* If the epsilon closure of `edest' is incomplete,
1674            the epsilon closure of this node is also incomplete.  */
1675         if (dfa->eclosures[edest].nelem == 0)
1676           {
1677             incomplete = 1;
1678             re_node_set_free (&eclosure_elem);
1679           }
1680       }
1681
1682   /* Epsilon closures include itself.  */
1683   re_node_set_insert (&eclosure, node);
1684   if (incomplete && !root)
1685     dfa->eclosures[node].nelem = 0;
1686   else
1687     dfa->eclosures[node] = eclosure;
1688   *new_set = eclosure;
1689   return REG_NOERROR;
1690 }
1691 \f
1692 /* Functions for token which are used in the parser.  */
1693
1694 /* Fetch a token from INPUT.
1695    We must not use this function inside bracket expressions.  */
1696
1697 static void
1698 internal_function
1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1700 {
1701   re_string_skip_bytes (input, peek_token (result, input, syntax));
1702 }
1703
1704 /* Peek a token from INPUT, and return the length of the token.
1705    We must not use this function inside bracket expressions.  */
1706
1707 static int
1708 internal_function
1709 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1710 {
1711   unsigned char c;
1712
1713   if (re_string_eoi (input))
1714     {
1715       token->type = END_OF_RE;
1716       return 0;
1717     }
1718
1719   c = re_string_peek_byte (input, 0);
1720   token->opr.c = c;
1721
1722   token->word_char = 0;
1723 #ifdef RE_ENABLE_I18N
1724   token->mb_partial = 0;
1725   if (input->mb_cur_max > 1 &&
1726       !re_string_first_byte (input, re_string_cur_idx (input)))
1727     {
1728       token->type = CHARACTER;
1729       token->mb_partial = 1;
1730       return 1;
1731     }
1732 #endif
1733   if (c == '\\')
1734     {
1735       unsigned char c2;
1736       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1737         {
1738           token->type = BACK_SLASH;
1739           return 1;
1740         }
1741
1742       c2 = re_string_peek_byte_case (input, 1);
1743       token->opr.c = c2;
1744       token->type = CHARACTER;
1745 #ifdef RE_ENABLE_I18N
1746       if (input->mb_cur_max > 1)
1747         {
1748           wint_t wc = re_string_wchar_at (input,
1749                                           re_string_cur_idx (input) + 1);
1750           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1751         }
1752       else
1753 #endif
1754         token->word_char = IS_WORD_CHAR (c2) != 0;
1755
1756       switch (c2)
1757         {
1758         case '|':
1759           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1760             token->type = OP_ALT;
1761           break;
1762         case '1': case '2': case '3': case '4': case '5':
1763         case '6': case '7': case '8': case '9':
1764           if (!(syntax & RE_NO_BK_REFS))
1765             {
1766               token->type = OP_BACK_REF;
1767               token->opr.idx = c2 - '1';
1768             }
1769           break;
1770         case '<':
1771           if (!(syntax & RE_NO_GNU_OPS))
1772             {
1773               token->type = ANCHOR;
1774               token->opr.ctx_type = WORD_FIRST;
1775             }
1776           break;
1777         case '>':
1778           if (!(syntax & RE_NO_GNU_OPS))
1779             {
1780               token->type = ANCHOR;
1781               token->opr.ctx_type = WORD_LAST;
1782             }
1783           break;
1784         case 'b':
1785           if (!(syntax & RE_NO_GNU_OPS))
1786             {
1787               token->type = ANCHOR;
1788               token->opr.ctx_type = WORD_DELIM;
1789             }
1790           break;
1791         case 'B':
1792           if (!(syntax & RE_NO_GNU_OPS))
1793             {
1794               token->type = ANCHOR;
1795               token->opr.ctx_type = NOT_WORD_DELIM;
1796             }
1797           break;
1798         case 'w':
1799           if (!(syntax & RE_NO_GNU_OPS))
1800             token->type = OP_WORD;
1801           break;
1802         case 'W':
1803           if (!(syntax & RE_NO_GNU_OPS))
1804             token->type = OP_NOTWORD;
1805           break;
1806         case 's':
1807           if (!(syntax & RE_NO_GNU_OPS))
1808             token->type = OP_SPACE;
1809           break;
1810         case 'S':
1811           if (!(syntax & RE_NO_GNU_OPS))
1812             token->type = OP_NOTSPACE;
1813           break;
1814         case '`':
1815           if (!(syntax & RE_NO_GNU_OPS))
1816             {
1817               token->type = ANCHOR;
1818               token->opr.ctx_type = BUF_FIRST;
1819             }
1820           break;
1821         case '\'':
1822           if (!(syntax & RE_NO_GNU_OPS))
1823             {
1824               token->type = ANCHOR;
1825               token->opr.ctx_type = BUF_LAST;
1826             }
1827           break;
1828         case '(':
1829           if (!(syntax & RE_NO_BK_PARENS))
1830             token->type = OP_OPEN_SUBEXP;
1831           break;
1832         case ')':
1833           if (!(syntax & RE_NO_BK_PARENS))
1834             token->type = OP_CLOSE_SUBEXP;
1835           break;
1836         case '+':
1837           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1838             token->type = OP_DUP_PLUS;
1839           break;
1840         case '?':
1841           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1842             token->type = OP_DUP_QUESTION;
1843           break;
1844         case '{':
1845           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1846             token->type = OP_OPEN_DUP_NUM;
1847           break;
1848         case '}':
1849           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1850             token->type = OP_CLOSE_DUP_NUM;
1851           break;
1852         default:
1853           break;
1854         }
1855       return 2;
1856     }
1857
1858   token->type = CHARACTER;
1859 #ifdef RE_ENABLE_I18N
1860   if (input->mb_cur_max > 1)
1861     {
1862       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1863       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1864     }
1865   else
1866 #endif
1867     token->word_char = IS_WORD_CHAR (token->opr.c);
1868
1869   switch (c)
1870     {
1871     case '\n':
1872       if (syntax & RE_NEWLINE_ALT)
1873         token->type = OP_ALT;
1874       break;
1875     case '|':
1876       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1877         token->type = OP_ALT;
1878       break;
1879     case '*':
1880       token->type = OP_DUP_ASTERISK;
1881       break;
1882     case '+':
1883       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1884         token->type = OP_DUP_PLUS;
1885       break;
1886     case '?':
1887       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1888         token->type = OP_DUP_QUESTION;
1889       break;
1890     case '{':
1891       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1892         token->type = OP_OPEN_DUP_NUM;
1893       break;
1894     case '}':
1895       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1896         token->type = OP_CLOSE_DUP_NUM;
1897       break;
1898     case '(':
1899       if (syntax & RE_NO_BK_PARENS)
1900         token->type = OP_OPEN_SUBEXP;
1901       break;
1902     case ')':
1903       if (syntax & RE_NO_BK_PARENS)
1904         token->type = OP_CLOSE_SUBEXP;
1905       break;
1906     case '[':
1907       token->type = OP_OPEN_BRACKET;
1908       break;
1909     case '.':
1910       token->type = OP_PERIOD;
1911       break;
1912     case '^':
1913       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1914           re_string_cur_idx (input) != 0)
1915         {
1916           char prev = re_string_peek_byte (input, -1);
1917           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1918             break;
1919         }
1920       token->type = ANCHOR;
1921       token->opr.ctx_type = LINE_FIRST;
1922       break;
1923     case '$':
1924       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1925           re_string_cur_idx (input) + 1 != re_string_length (input))
1926         {
1927           re_token_t next;
1928           re_string_skip_bytes (input, 1);
1929           peek_token (&next, input, syntax);
1930           re_string_skip_bytes (input, -1);
1931           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1932             break;
1933         }
1934       token->type = ANCHOR;
1935       token->opr.ctx_type = LINE_LAST;
1936       break;
1937     default:
1938       break;
1939     }
1940   return 1;
1941 }
1942
1943 /* Peek a token from INPUT, and return the length of the token.
1944    We must not use this function out of bracket expressions.  */
1945
1946 static int
1947 internal_function
1948 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1949 {
1950   unsigned char c;
1951   if (re_string_eoi (input))
1952     {
1953       token->type = END_OF_RE;
1954       return 0;
1955     }
1956   c = re_string_peek_byte (input, 0);
1957   token->opr.c = c;
1958
1959 #ifdef RE_ENABLE_I18N
1960   if (input->mb_cur_max > 1 &&
1961       !re_string_first_byte (input, re_string_cur_idx (input)))
1962     {
1963       token->type = CHARACTER;
1964       return 1;
1965     }
1966 #endif /* RE_ENABLE_I18N */
1967
1968   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1969       && re_string_cur_idx (input) + 1 < re_string_length (input))
1970     {
1971       /* In this case, '\' escape a character.  */
1972       unsigned char c2;
1973       re_string_skip_bytes (input, 1);
1974       c2 = re_string_peek_byte (input, 0);
1975       token->opr.c = c2;
1976       token->type = CHARACTER;
1977       return 1;
1978     }
1979   if (c == '[') /* '[' is a special char in a bracket exps.  */
1980     {
1981       unsigned char c2;
1982       int token_len;
1983       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1984         c2 = re_string_peek_byte (input, 1);
1985       else
1986         c2 = 0;
1987       token->opr.c = c2;
1988       token_len = 2;
1989       switch (c2)
1990         {
1991         case '.':
1992           token->type = OP_OPEN_COLL_ELEM;
1993           break;
1994         case '=':
1995           token->type = OP_OPEN_EQUIV_CLASS;
1996           break;
1997         case ':':
1998           if (syntax & RE_CHAR_CLASSES)
1999             {
2000               token->type = OP_OPEN_CHAR_CLASS;
2001               break;
2002             }
2003           /* else fall through.  */
2004         default:
2005           token->type = CHARACTER;
2006           token->opr.c = c;
2007           token_len = 1;
2008           break;
2009         }
2010       return token_len;
2011     }
2012   switch (c)
2013     {
2014     case '-':
2015       token->type = OP_CHARSET_RANGE;
2016       break;
2017     case ']':
2018       token->type = OP_CLOSE_BRACKET;
2019       break;
2020     case '^':
2021       token->type = OP_NON_MATCH_LIST;
2022       break;
2023     default:
2024       token->type = CHARACTER;
2025     }
2026   return 1;
2027 }
2028 \f
2029 /* Functions for parser.  */
2030
2031 /* Entry point of the parser.
2032    Parse the regular expression REGEXP and return the structure tree.
2033    If an error is occured, ERR is set by error code, and return NULL.
2034    This function build the following tree, from regular expression <reg_exp>:
2035            CAT
2036            / \
2037           /   \
2038    <reg_exp>  EOR
2039
2040    CAT means concatenation.
2041    EOR means end of regular expression.  */
2042
2043 static bin_tree_t *
2044 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2045        reg_errcode_t *err)
2046 {
2047   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2048   bin_tree_t *tree, *eor, *root;
2049   re_token_t current_token;
2050   dfa->syntax = syntax;
2051   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2052   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2053   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2054     return NULL;
2055   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2056   if (tree != NULL)
2057     root = create_tree (dfa, tree, eor, CONCAT);
2058   else
2059     root = eor;
2060   if (BE (eor == NULL || root == NULL, 0))
2061     {
2062       *err = REG_ESPACE;
2063       return NULL;
2064     }
2065   return root;
2066 }
2067
2068 /* This function build the following tree, from regular expression
2069    <branch1>|<branch2>:
2070            ALT
2071            / \
2072           /   \
2073    <branch1> <branch2>
2074
2075    ALT means alternative, which represents the operator `|'.  */
2076
2077 static bin_tree_t *
2078 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2079                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2080 {
2081   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2082   bin_tree_t *tree, *branch = NULL;
2083   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2084   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2085     return NULL;
2086
2087   while (token->type == OP_ALT)
2088     {
2089       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2090       if (token->type != OP_ALT && token->type != END_OF_RE
2091           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2092         {
2093           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2094           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2095             return NULL;
2096         }
2097       else
2098         branch = NULL;
2099       tree = create_tree (dfa, tree, branch, OP_ALT);
2100       if (BE (tree == NULL, 0))
2101         {
2102           *err = REG_ESPACE;
2103           return NULL;
2104         }
2105     }
2106   return tree;
2107 }
2108
2109 /* This function build the following tree, from regular expression
2110    <exp1><exp2>:
2111         CAT
2112         / \
2113        /   \
2114    <exp1> <exp2>
2115
2116    CAT means concatenation.  */
2117
2118 static bin_tree_t *
2119 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2120               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2121 {
2122   bin_tree_t *tree, *exp;
2123   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2124   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2125   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126     return NULL;
2127
2128   while (token->type != OP_ALT && token->type != END_OF_RE
2129          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130     {
2131       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2132       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2133         {
2134           return NULL;
2135         }
2136       if (tree != NULL && exp != NULL)
2137         {
2138           tree = create_tree (dfa, tree, exp, CONCAT);
2139           if (tree == NULL)
2140             {
2141               *err = REG_ESPACE;
2142               return NULL;
2143             }
2144         }
2145       else if (tree == NULL)
2146         tree = exp;
2147       /* Otherwise exp == NULL, we don't need to create new tree.  */
2148     }
2149   return tree;
2150 }
2151
2152 /* This function build the following tree, from regular expression a*:
2153          *
2154          |
2155          a
2156 */
2157
2158 static bin_tree_t *
2159 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2160                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2161 {
2162   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2163   bin_tree_t *tree;
2164   switch (token->type)
2165     {
2166     case CHARACTER:
2167       tree = create_token_tree (dfa, NULL, NULL, token);
2168       if (BE (tree == NULL, 0))
2169         {
2170           *err = REG_ESPACE;
2171           return NULL;
2172         }
2173 #ifdef RE_ENABLE_I18N
2174       if (dfa->mb_cur_max > 1)
2175         {
2176           while (!re_string_eoi (regexp)
2177                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2178             {
2179               bin_tree_t *mbc_remain;
2180               fetch_token (token, regexp, syntax);
2181               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2182               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2183               if (BE (mbc_remain == NULL || tree == NULL, 0))
2184                 {
2185                   *err = REG_ESPACE;
2186                   return NULL;
2187                 }
2188             }
2189         }
2190 #endif
2191       break;
2192     case OP_OPEN_SUBEXP:
2193       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2194       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195         return NULL;
2196       break;
2197     case OP_OPEN_BRACKET:
2198       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2199       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2200         return NULL;
2201       break;
2202     case OP_BACK_REF:
2203       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2204         {
2205           *err = REG_ESUBREG;
2206           return NULL;
2207         }
2208       dfa->used_bkref_map |= 1 << token->opr.idx;
2209       tree = create_token_tree (dfa, NULL, NULL, token);
2210       if (BE (tree == NULL, 0))
2211         {
2212           *err = REG_ESPACE;
2213           return NULL;
2214         }
2215       ++dfa->nbackref;
2216       dfa->has_mb_node = 1;
2217       break;
2218     case OP_OPEN_DUP_NUM:
2219       if (syntax & RE_CONTEXT_INVALID_DUP)
2220         {
2221           *err = REG_BADRPT;
2222           return NULL;
2223         }
2224       /* FALLTHROUGH */
2225     case OP_DUP_ASTERISK:
2226     case OP_DUP_PLUS:
2227     case OP_DUP_QUESTION:
2228       if (syntax & RE_CONTEXT_INVALID_OPS)
2229         {
2230           *err = REG_BADRPT;
2231           return NULL;
2232         }
2233       else if (syntax & RE_CONTEXT_INDEP_OPS)
2234         {
2235           fetch_token (token, regexp, syntax);
2236           return parse_expression (regexp, preg, token, syntax, nest, err);
2237         }
2238       /* else fall through  */
2239     case OP_CLOSE_SUBEXP:
2240       if ((token->type == OP_CLOSE_SUBEXP) &&
2241           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2242         {
2243           *err = REG_ERPAREN;
2244           return NULL;
2245         }
2246       /* else fall through  */
2247     case OP_CLOSE_DUP_NUM:
2248       /* We treat it as a normal character.  */
2249
2250       /* Then we can these characters as normal characters.  */
2251       token->type = CHARACTER;
2252       /* mb_partial and word_char bits should be initialized already
2253          by peek_token.  */
2254       tree = create_token_tree (dfa, NULL, NULL, token);
2255       if (BE (tree == NULL, 0))
2256         {
2257           *err = REG_ESPACE;
2258           return NULL;
2259         }
2260       break;
2261     case ANCHOR:
2262       if ((token->opr.ctx_type
2263            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2264           && dfa->word_ops_used == 0)
2265         init_word_char (dfa);
2266       if (token->opr.ctx_type == WORD_DELIM
2267           || token->opr.ctx_type == NOT_WORD_DELIM)
2268         {
2269           bin_tree_t *tree_first, *tree_last;
2270           if (token->opr.ctx_type == WORD_DELIM)
2271             {
2272               token->opr.ctx_type = WORD_FIRST;
2273               tree_first = create_token_tree (dfa, NULL, NULL, token);
2274               token->opr.ctx_type = WORD_LAST;
2275             }
2276           else
2277             {
2278               token->opr.ctx_type = INSIDE_WORD;
2279               tree_first = create_token_tree (dfa, NULL, NULL, token);
2280               token->opr.ctx_type = INSIDE_NOTWORD;
2281             }
2282           tree_last = create_token_tree (dfa, NULL, NULL, token);
2283           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2284           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2285             {
2286               *err = REG_ESPACE;
2287               return NULL;
2288             }
2289         }
2290       else
2291         {
2292           tree = create_token_tree (dfa, NULL, NULL, token);
2293           if (BE (tree == NULL, 0))
2294             {
2295               *err = REG_ESPACE;
2296               return NULL;
2297             }
2298         }
2299       /* We must return here, since ANCHORs can't be followed
2300          by repetition operators.
2301          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2302              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2303       fetch_token (token, regexp, syntax);
2304       return tree;
2305     case OP_PERIOD:
2306       tree = create_token_tree (dfa, NULL, NULL, token);
2307       if (BE (tree == NULL, 0))
2308         {
2309           *err = REG_ESPACE;
2310           return NULL;
2311         }
2312       if (dfa->mb_cur_max > 1)
2313         dfa->has_mb_node = 1;
2314       break;
2315     case OP_WORD:
2316     case OP_NOTWORD:
2317       tree = build_charclass_op (dfa, regexp->trans,
2318                                  (const unsigned char *) "alnum",
2319                                  (const unsigned char *) "_",
2320                                  token->type == OP_NOTWORD, err);
2321       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2322         return NULL;
2323       break;
2324     case OP_SPACE:
2325     case OP_NOTSPACE:
2326       tree = build_charclass_op (dfa, regexp->trans,
2327                                  (const unsigned char *) "space",
2328                                  (const unsigned char *) "",
2329                                  token->type == OP_NOTSPACE, err);
2330       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2331         return NULL;
2332       break;
2333     case OP_ALT:
2334     case END_OF_RE:
2335       return NULL;
2336     case BACK_SLASH:
2337       *err = REG_EESCAPE;
2338       return NULL;
2339     default:
2340       /* Must not happen?  */
2341 #ifdef DEBUG
2342       assert (0);
2343 #endif
2344       return NULL;
2345     }
2346   fetch_token (token, regexp, syntax);
2347
2348   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2349          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2350     {
2351       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2352       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2353         return NULL;
2354       /* In BRE consecutive duplications are not allowed.  */
2355       if ((syntax & RE_CONTEXT_INVALID_DUP)
2356           && (token->type == OP_DUP_ASTERISK
2357               || token->type == OP_OPEN_DUP_NUM))
2358         {
2359           *err = REG_BADRPT;
2360           return NULL;
2361         }
2362     }
2363
2364   return tree;
2365 }
2366
2367 /* This function build the following tree, from regular expression
2368    (<reg_exp>):
2369          SUBEXP
2370             |
2371         <reg_exp>
2372 */
2373
2374 static bin_tree_t *
2375 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2376                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2377 {
2378   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2379   bin_tree_t *tree;
2380   size_t cur_nsub;
2381   cur_nsub = preg->re_nsub++;
2382
2383   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2384
2385   /* The subexpression may be a null string.  */
2386   if (token->type == OP_CLOSE_SUBEXP)
2387     tree = NULL;
2388   else
2389     {
2390       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2391       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2392         *err = REG_EPAREN;
2393       if (BE (*err != REG_NOERROR, 0))
2394         return NULL;
2395     }
2396
2397   if (cur_nsub <= '9' - '1')
2398     dfa->completed_bkref_map |= 1 << cur_nsub;
2399
2400   tree = create_tree (dfa, tree, NULL, SUBEXP);
2401   if (BE (tree == NULL, 0))
2402     {
2403       *err = REG_ESPACE;
2404       return NULL;
2405     }
2406   tree->token.opr.idx = cur_nsub;
2407   return tree;
2408 }
2409
2410 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2411
2412 static bin_tree_t *
2413 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2414               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2415 {
2416   bin_tree_t *tree = NULL, *old_tree = NULL;
2417   int i, start, end, start_idx = re_string_cur_idx (regexp);
2418   re_token_t start_token = *token;
2419
2420   if (token->type == OP_OPEN_DUP_NUM)
2421     {
2422       end = 0;
2423       start = fetch_number (regexp, token, syntax);
2424       if (start == -1)
2425         {
2426           if (token->type == CHARACTER && token->opr.c == ',')
2427             start = 0; /* We treat "{,m}" as "{0,m}".  */
2428           else
2429             {
2430               *err = REG_BADBR; /* <re>{} is invalid.  */
2431               return NULL;
2432             }
2433         }
2434       if (BE (start != -2, 1))
2435         {
2436           /* We treat "{n}" as "{n,n}".  */
2437           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2438                  : ((token->type == CHARACTER && token->opr.c == ',')
2439                     ? fetch_number (regexp, token, syntax) : -2));
2440         }
2441       if (BE (start == -2 || end == -2, 0))
2442         {
2443           /* Invalid sequence.  */
2444           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2445             {
2446               if (token->type == END_OF_RE)
2447                 *err = REG_EBRACE;
2448               else
2449                 *err = REG_BADBR;
2450
2451               return NULL;
2452             }
2453
2454           /* If the syntax bit is set, rollback.  */
2455           re_string_set_index (regexp, start_idx);
2456           *token = start_token;
2457           token->type = CHARACTER;
2458           /* mb_partial and word_char bits should be already initialized by
2459              peek_token.  */
2460           return elem;
2461         }
2462
2463       if (BE (end != -1 && start > end, 0))
2464         {
2465           /* First number greater than second.  */
2466           *err = REG_BADBR;
2467           return NULL;
2468         }
2469     }
2470   else
2471     {
2472       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2473       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2474     }
2475
2476   fetch_token (token, regexp, syntax);
2477
2478   if (BE (elem == NULL, 0))
2479     return NULL;
2480   if (BE (start == 0 && end == 0, 0))
2481     {
2482       postorder (elem, free_tree, NULL);
2483       return NULL;
2484     }
2485
2486   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2487   if (BE (start > 0, 0))
2488     {
2489       tree = elem;
2490       for (i = 2; i <= start; ++i)
2491         {
2492           elem = duplicate_tree (elem, dfa);
2493           tree = create_tree (dfa, tree, elem, CONCAT);
2494           if (BE (elem == NULL || tree == NULL, 0))
2495             goto parse_dup_op_espace;
2496         }
2497
2498       if (start == end)
2499         return tree;
2500
2501       /* Duplicate ELEM before it is marked optional.  */
2502       elem = duplicate_tree (elem, dfa);
2503       old_tree = tree;
2504     }
2505   else
2506     old_tree = NULL;
2507
2508   if (elem->token.type == SUBEXP)
2509     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2510
2511   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2512   if (BE (tree == NULL, 0))
2513     goto parse_dup_op_espace;
2514
2515   /* This loop is actually executed only when end != -1,
2516      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2517      already created the start+1-th copy.  */
2518   for (i = start + 2; i <= end; ++i)
2519     {
2520       elem = duplicate_tree (elem, dfa);
2521       tree = create_tree (dfa, tree, elem, CONCAT);
2522       if (BE (elem == NULL || tree == NULL, 0))
2523         goto parse_dup_op_espace;
2524
2525       tree = create_tree (dfa, tree, NULL, OP_ALT);
2526       if (BE (tree == NULL, 0))
2527         goto parse_dup_op_espace;
2528     }
2529
2530   if (old_tree)
2531     tree = create_tree (dfa, old_tree, tree, CONCAT);
2532
2533   return tree;
2534
2535  parse_dup_op_espace:
2536   *err = REG_ESPACE;
2537   return NULL;
2538 }
2539
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541    I'm not sure, but maybe enough.  */
2542 #define BRACKET_NAME_BUF_SIZE 32
2543
2544 #ifndef _LIBC
2545   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546      Build the range expression which starts from START_ELEM, and ends
2547      at END_ELEM.  The result are written to MBCSET and SBCSET.
2548      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549      mbcset->range_ends, is a pointer argument sinse we may
2550      update it.  */
2551
2552 static reg_errcode_t
2553 internal_function
2554 # ifdef RE_ENABLE_I18N
2555 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2556                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2557 # else /* not RE_ENABLE_I18N */
2558 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2559                  bracket_elem_t *end_elem)
2560 # endif /* not RE_ENABLE_I18N */
2561 {
2562   unsigned int start_ch, end_ch;
2563   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2564   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2565           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2566           0))
2567     return REG_ERANGE;
2568
2569   /* We can handle no multi character collating elements without libc
2570      support.  */
2571   if (BE ((start_elem->type == COLL_SYM
2572            && strlen ((char *) start_elem->opr.name) > 1)
2573           || (end_elem->type == COLL_SYM
2574               && strlen ((char *) end_elem->opr.name) > 1), 0))
2575     return REG_ECOLLATE;
2576
2577 # ifdef RE_ENABLE_I18N
2578   {
2579     wchar_t wc;
2580     wint_t start_wc;
2581     wint_t end_wc;
2582     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2583
2584     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2585                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2586                    : 0));
2587     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2588               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2589                  : 0));
2590     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2591                 ? __btowc (start_ch) : start_elem->opr.wch);
2592     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2593               ? __btowc (end_ch) : end_elem->opr.wch);
2594     if (start_wc == WEOF || end_wc == WEOF)
2595       return REG_ECOLLATE;
2596     cmp_buf[0] = start_wc;
2597     cmp_buf[4] = end_wc;
2598     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2599       return REG_ERANGE;
2600
2601     /* Got valid collation sequence values, add them as a new entry.
2602        However, for !_LIBC we have no collation elements: if the
2603        character set is single byte, the single byte character set
2604        that we build below suffices.  parse_bracket_exp passes
2605        no MBCSET if dfa->mb_cur_max == 1.  */
2606     if (mbcset)
2607       {
2608         /* Check the space of the arrays.  */
2609         if (BE (*range_alloc == mbcset->nranges, 0))
2610           {
2611             /* There is not enough space, need realloc.  */
2612             wchar_t *new_array_start, *new_array_end;
2613             int new_nranges;
2614
2615             /* +1 in case of mbcset->nranges is 0.  */
2616             new_nranges = 2 * mbcset->nranges + 1;
2617             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2618                are NULL if *range_alloc == 0.  */
2619             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2620                                           new_nranges);
2621             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2622                                         new_nranges);
2623
2624             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2625               return REG_ESPACE;
2626
2627             mbcset->range_starts = new_array_start;
2628             mbcset->range_ends = new_array_end;
2629             *range_alloc = new_nranges;
2630           }
2631
2632         mbcset->range_starts[mbcset->nranges] = start_wc;
2633         mbcset->range_ends[mbcset->nranges++] = end_wc;
2634       }
2635
2636     /* Build the table for single byte characters.  */
2637     for (wc = 0; wc < SBC_MAX; ++wc)
2638       {
2639         cmp_buf[2] = wc;
2640         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2641             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2642           bitset_set (sbcset, wc);
2643       }
2644   }
2645 # else /* not RE_ENABLE_I18N */
2646   {
2647     unsigned int ch;
2648     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2649                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2650                    : 0));
2651     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2652               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2653                  : 0));
2654     if (start_ch > end_ch)
2655       return REG_ERANGE;
2656     /* Build the table for single byte characters.  */
2657     for (ch = 0; ch < SBC_MAX; ++ch)
2658       if (start_ch <= ch  && ch <= end_ch)
2659         bitset_set (sbcset, ch);
2660   }
2661 # endif /* not RE_ENABLE_I18N */
2662   return REG_NOERROR;
2663 }
2664 #endif /* not _LIBC */
2665
2666 #ifndef _LIBC
2667 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2668    Build the collating element which is represented by NAME.
2669    The result are written to MBCSET and SBCSET.
2670    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2671    pointer argument since we may update it.  */
2672
2673 static reg_errcode_t
2674 internal_function
2675 # ifdef RE_ENABLE_I18N
2676 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2677                         int *coll_sym_alloc, const unsigned char *name)
2678 # else /* not RE_ENABLE_I18N */
2679 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2680 # endif /* not RE_ENABLE_I18N */
2681 {
2682   size_t name_len = strlen ((const char *) name);
2683   if (BE (name_len != 1, 0))
2684     return REG_ECOLLATE;
2685   else
2686     {
2687       bitset_set (sbcset, name[0]);
2688       return REG_NOERROR;
2689     }
2690 }
2691 #endif /* not _LIBC */
2692
2693 /* This function parse bracket expression like "[abc]", "[a-c]",
2694    "[[.a-a.]]" etc.  */
2695
2696 static bin_tree_t *
2697 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2698                    reg_syntax_t syntax, reg_errcode_t *err)
2699 {
2700 #ifdef _LIBC
2701   const unsigned char *collseqmb;
2702   const char *collseqwc;
2703   uint32_t nrules;
2704   int32_t table_size;
2705   const int32_t *symb_table;
2706   const unsigned char *extra;
2707
2708   /* Local function for parse_bracket_exp used in _LIBC environement.
2709      Seek the collating symbol entry correspondings to NAME.
2710      Return the index of the symbol in the SYMB_TABLE.  */
2711
2712   auto inline int32_t
2713   __attribute ((always_inline))
2714   seek_collating_symbol_entry (name, name_len)
2715          const unsigned char *name;
2716          size_t name_len;
2717     {
2718       int32_t hash = elem_hash ((const char *) name, name_len);
2719       int32_t elem = hash % table_size;
2720       if (symb_table[2 * elem] != 0)
2721         {
2722           int32_t second = hash % (table_size - 2) + 1;
2723
2724           do
2725             {
2726               /* First compare the hashing value.  */
2727               if (symb_table[2 * elem] == hash
2728                   /* Compare the length of the name.  */
2729                   && name_len == extra[symb_table[2 * elem + 1]]
2730                   /* Compare the name.  */
2731                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2732                              name_len) == 0)
2733                 {
2734                   /* Yep, this is the entry.  */
2735                   break;
2736                 }
2737
2738               /* Next entry.  */
2739               elem += second;
2740             }
2741           while (symb_table[2 * elem] != 0);
2742         }
2743       return elem;
2744     }
2745
2746   /* Local function for parse_bracket_exp used in _LIBC environment.
2747      Look up the collation sequence value of BR_ELEM.
2748      Return the value if succeeded, UINT_MAX otherwise.  */
2749
2750   auto inline unsigned int
2751   __attribute ((always_inline))
2752   lookup_collation_sequence_value (br_elem)
2753          bracket_elem_t *br_elem;
2754     {
2755       if (br_elem->type == SB_CHAR)
2756         {
2757           /*
2758           if (MB_CUR_MAX == 1)
2759           */
2760           if (nrules == 0)
2761             return collseqmb[br_elem->opr.ch];
2762           else
2763             {
2764               wint_t wc = __btowc (br_elem->opr.ch);
2765               return __collseq_table_lookup (collseqwc, wc);
2766             }
2767         }
2768       else if (br_elem->type == MB_CHAR)
2769         {
2770           if (nrules != 0)
2771             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2772         }
2773       else if (br_elem->type == COLL_SYM)
2774         {
2775           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2776           if (nrules != 0)
2777             {
2778               int32_t elem, idx;
2779               elem = seek_collating_symbol_entry (br_elem->opr.name,
2780                                                   sym_name_len);
2781               if (symb_table[2 * elem] != 0)
2782                 {
2783                   /* We found the entry.  */
2784                   idx = symb_table[2 * elem + 1];
2785                   /* Skip the name of collating element name.  */
2786                   idx += 1 + extra[idx];
2787                   /* Skip the byte sequence of the collating element.  */
2788                   idx += 1 + extra[idx];
2789                   /* Adjust for the alignment.  */
2790                   idx = (idx + 3) & ~3;
2791                   /* Skip the multibyte collation sequence value.  */
2792                   idx += sizeof (unsigned int);
2793                   /* Skip the wide char sequence of the collating element.  */
2794                   idx += sizeof (unsigned int) *
2795                     (1 + *(unsigned int *) (extra + idx));
2796                   /* Return the collation sequence value.  */
2797                   return *(unsigned int *) (extra + idx);
2798                 }
2799               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2800                 {
2801                   /* No valid character.  Match it as a single byte
2802                      character.  */
2803                   return collseqmb[br_elem->opr.name[0]];
2804                 }
2805             }
2806           else if (sym_name_len == 1)
2807             return collseqmb[br_elem->opr.name[0]];
2808         }
2809       return UINT_MAX;
2810     }
2811
2812   /* Local function for parse_bracket_exp used in _LIBC environement.
2813      Build the range expression which starts from START_ELEM, and ends
2814      at END_ELEM.  The result are written to MBCSET and SBCSET.
2815      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2816      mbcset->range_ends, is a pointer argument sinse we may
2817      update it.  */
2818
2819   auto inline reg_errcode_t
2820   __attribute ((always_inline))
2821   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2822          re_charset_t *mbcset;
2823          int *range_alloc;
2824          bitset_t sbcset;
2825          bracket_elem_t *start_elem, *end_elem;
2826     {
2827       unsigned int ch;
2828       uint32_t start_collseq;
2829       uint32_t end_collseq;
2830
2831       /* Equivalence Classes and Character Classes can't be a range
2832          start/end.  */
2833       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2834               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2835               0))
2836         return REG_ERANGE;
2837
2838       start_collseq = lookup_collation_sequence_value (start_elem);
2839       end_collseq = lookup_collation_sequence_value (end_elem);
2840       /* Check start/end collation sequence values.  */
2841       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2842         return REG_ECOLLATE;
2843       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2844         return REG_ERANGE;
2845
2846       /* Got valid collation sequence values, add them as a new entry.
2847          However, if we have no collation elements, and the character set
2848          is single byte, the single byte character set that we
2849          build below suffices. */
2850       if (nrules > 0 || dfa->mb_cur_max > 1)
2851         {
2852           /* Check the space of the arrays.  */
2853           if (BE (*range_alloc == mbcset->nranges, 0))
2854             {
2855               /* There is not enough space, need realloc.  */
2856               uint32_t *new_array_start;
2857               uint32_t *new_array_end;
2858               int new_nranges;
2859
2860               /* +1 in case of mbcset->nranges is 0.  */
2861               new_nranges = 2 * mbcset->nranges + 1;
2862               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2863                                             new_nranges);
2864               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2865                                           new_nranges);
2866
2867               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2868                 return REG_ESPACE;
2869
2870               mbcset->range_starts = new_array_start;
2871               mbcset->range_ends = new_array_end;
2872               *range_alloc = new_nranges;
2873             }
2874
2875           mbcset->range_starts[mbcset->nranges] = start_collseq;
2876           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2877         }
2878
2879       /* Build the table for single byte characters.  */
2880       for (ch = 0; ch < SBC_MAX; ch++)
2881         {
2882           uint32_t ch_collseq;
2883           /*
2884           if (MB_CUR_MAX == 1)
2885           */
2886           if (nrules == 0)
2887             ch_collseq = collseqmb[ch];
2888           else
2889             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2890           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2891             bitset_set (sbcset, ch);
2892         }
2893       return REG_NOERROR;
2894     }
2895
2896   /* Local function for parse_bracket_exp used in _LIBC environement.
2897      Build the collating element which is represented by NAME.
2898      The result are written to MBCSET and SBCSET.
2899      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2900      pointer argument sinse we may update it.  */
2901
2902   auto inline reg_errcode_t
2903   __attribute ((always_inline))
2904   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2905          re_charset_t *mbcset;
2906          int *coll_sym_alloc;
2907          bitset_t sbcset;
2908          const unsigned char *name;
2909     {
2910       int32_t elem, idx;
2911       size_t name_len = strlen ((const char *) name);
2912       if (nrules != 0)
2913         {
2914           elem = seek_collating_symbol_entry (name, name_len);
2915           if (symb_table[2 * elem] != 0)
2916             {
2917               /* We found the entry.  */
2918               idx = symb_table[2 * elem + 1];
2919               /* Skip the name of collating element name.  */
2920               idx += 1 + extra[idx];
2921             }
2922           else if (symb_table[2 * elem] == 0 && name_len == 1)
2923             {
2924               /* No valid character, treat it as a normal
2925                  character.  */
2926               bitset_set (sbcset, name[0]);
2927               return REG_NOERROR;
2928             }
2929           else
2930             return REG_ECOLLATE;
2931
2932           /* Got valid collation sequence, add it as a new entry.  */
2933           /* Check the space of the arrays.  */
2934           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2935             {
2936               /* Not enough, realloc it.  */
2937               /* +1 in case of mbcset->ncoll_syms is 0.  */
2938               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2939               /* Use realloc since mbcset->coll_syms is NULL
2940                  if *alloc == 0.  */
2941               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2942                                                    new_coll_sym_alloc);
2943               if (BE (new_coll_syms == NULL, 0))
2944                 return REG_ESPACE;
2945               mbcset->coll_syms = new_coll_syms;
2946               *coll_sym_alloc = new_coll_sym_alloc;
2947             }
2948           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2949           return REG_NOERROR;
2950         }
2951       else
2952         {
2953           if (BE (name_len != 1, 0))
2954             return REG_ECOLLATE;
2955           else
2956             {
2957               bitset_set (sbcset, name[0]);
2958               return REG_NOERROR;
2959             }
2960         }
2961     }
2962 #endif
2963
2964   re_token_t br_token;
2965   re_bitset_ptr_t sbcset;
2966 #ifdef RE_ENABLE_I18N
2967   re_charset_t *mbcset;
2968   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2969   int equiv_class_alloc = 0, char_class_alloc = 0;
2970 #endif /* not RE_ENABLE_I18N */
2971   int non_match = 0;
2972   bin_tree_t *work_tree;
2973   int token_len;
2974   int first_round = 1;
2975 #ifdef _LIBC
2976   collseqmb = (const unsigned char *)
2977     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2978   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2979   if (nrules)
2980     {
2981       /*
2982       if (MB_CUR_MAX > 1)
2983       */
2984       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2985       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2986       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2987                                                   _NL_COLLATE_SYMB_TABLEMB);
2988       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2989                                                    _NL_COLLATE_SYMB_EXTRAMB);
2990     }
2991 #endif
2992   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2993 #ifdef RE_ENABLE_I18N
2994   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2995 #endif /* RE_ENABLE_I18N */
2996 #ifdef RE_ENABLE_I18N
2997   if (BE (sbcset == NULL || mbcset == NULL, 0))
2998 #else
2999   if (BE (sbcset == NULL, 0))
3000 #endif /* RE_ENABLE_I18N */
3001     {
3002       *err = REG_ESPACE;
3003       return NULL;
3004     }
3005
3006   token_len = peek_token_bracket (token, regexp, syntax);
3007   if (BE (token->type == END_OF_RE, 0))
3008     {
3009       *err = REG_BADPAT;
3010       goto parse_bracket_exp_free_return;
3011     }
3012   if (token->type == OP_NON_MATCH_LIST)
3013     {
3014 #ifdef RE_ENABLE_I18N
3015       mbcset->non_match = 1;
3016 #endif /* not RE_ENABLE_I18N */
3017       non_match = 1;
3018       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3019         bitset_set (sbcset, '\n');
3020       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3021       token_len = peek_token_bracket (token, regexp, syntax);
3022       if (BE (token->type == END_OF_RE, 0))
3023         {
3024           *err = REG_BADPAT;
3025           goto parse_bracket_exp_free_return;
3026         }
3027     }
3028
3029   /* We treat the first ']' as a normal character.  */
3030   if (token->type == OP_CLOSE_BRACKET)
3031     token->type = CHARACTER;
3032
3033   while (1)
3034     {
3035       bracket_elem_t start_elem, end_elem;
3036       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3037       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3038       reg_errcode_t ret;
3039       int token_len2 = 0, is_range_exp = 0;
3040       re_token_t token2;
3041
3042       start_elem.opr.name = start_name_buf;
3043       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3044                                    syntax, first_round);
3045       if (BE (ret != REG_NOERROR, 0))
3046         {
3047           *err = ret;
3048           goto parse_bracket_exp_free_return;
3049         }
3050       first_round = 0;
3051
3052       /* Get information about the next token.  We need it in any case.  */
3053       token_len = peek_token_bracket (token, regexp, syntax);
3054
3055       /* Do not check for ranges if we know they are not allowed.  */
3056       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3057         {
3058           if (BE (token->type == END_OF_RE, 0))
3059             {
3060               *err = REG_EBRACK;
3061               goto parse_bracket_exp_free_return;
3062             }
3063           if (token->type == OP_CHARSET_RANGE)
3064             {
3065               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3066               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3067               if (BE (token2.type == END_OF_RE, 0))
3068                 {
3069                   *err = REG_EBRACK;
3070                   goto parse_bracket_exp_free_return;
3071                 }
3072               if (token2.type == OP_CLOSE_BRACKET)
3073                 {
3074                   /* We treat the last '-' as a normal character.  */
3075                   re_string_skip_bytes (regexp, -token_len);
3076                   token->type = CHARACTER;
3077                 }
3078               else
3079                 is_range_exp = 1;
3080             }
3081         }
3082
3083       if (is_range_exp == 1)
3084         {
3085           end_elem.opr.name = end_name_buf;
3086           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3087                                        dfa, syntax, 1);
3088           if (BE (ret != REG_NOERROR, 0))
3089             {
3090               *err = ret;
3091               goto parse_bracket_exp_free_return;
3092             }
3093
3094           token_len = peek_token_bracket (token, regexp, syntax);
3095
3096 #ifdef _LIBC
3097           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3098                                   &start_elem, &end_elem);
3099 #else
3100 # ifdef RE_ENABLE_I18N
3101           *err = build_range_exp (sbcset,
3102                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3103                                   &range_alloc, &start_elem, &end_elem);
3104 # else
3105           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3106 # endif
3107 #endif /* RE_ENABLE_I18N */
3108           if (BE (*err != REG_NOERROR, 0))
3109             goto parse_bracket_exp_free_return;
3110         }
3111       else
3112         {
3113           switch (start_elem.type)
3114             {
3115             case SB_CHAR:
3116               bitset_set (sbcset, start_elem.opr.ch);
3117               break;
3118 #ifdef RE_ENABLE_I18N
3119             case MB_CHAR:
3120               /* Check whether the array has enough space.  */
3121               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3122                 {
3123                   wchar_t *new_mbchars;
3124                   /* Not enough, realloc it.  */
3125                   /* +1 in case of mbcset->nmbchars is 0.  */
3126                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3127                   /* Use realloc since array is NULL if *alloc == 0.  */
3128                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3129                                             mbchar_alloc);
3130                   if (BE (new_mbchars == NULL, 0))
3131                     goto parse_bracket_exp_espace;
3132                   mbcset->mbchars = new_mbchars;
3133                 }
3134               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3135               break;
3136 #endif /* RE_ENABLE_I18N */
3137             case EQUIV_CLASS:
3138               *err = build_equiv_class (sbcset,
3139 #ifdef RE_ENABLE_I18N
3140                                         mbcset, &equiv_class_alloc,
3141 #endif /* RE_ENABLE_I18N */
3142                                         start_elem.opr.name);
3143               if (BE (*err != REG_NOERROR, 0))
3144                 goto parse_bracket_exp_free_return;
3145               break;
3146             case COLL_SYM:
3147               *err = build_collating_symbol (sbcset,
3148 #ifdef RE_ENABLE_I18N
3149                                              mbcset, &coll_sym_alloc,
3150 #endif /* RE_ENABLE_I18N */
3151                                              start_elem.opr.name);
3152               if (BE (*err != REG_NOERROR, 0))
3153                 goto parse_bracket_exp_free_return;
3154               break;
3155             case CHAR_CLASS:
3156               *err = build_charclass (regexp->trans, sbcset,
3157 #ifdef RE_ENABLE_I18N
3158                                       mbcset, &char_class_alloc,
3159 #endif /* RE_ENABLE_I18N */
3160                                       start_elem.opr.name, syntax);
3161               if (BE (*err != REG_NOERROR, 0))
3162                goto parse_bracket_exp_free_return;
3163               break;
3164             default:
3165               assert (0);
3166               break;
3167             }
3168         }
3169       if (BE (token->type == END_OF_RE, 0))
3170         {
3171           *err = REG_EBRACK;
3172           goto parse_bracket_exp_free_return;
3173         }
3174       if (token->type == OP_CLOSE_BRACKET)
3175         break;
3176     }
3177
3178   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3179
3180   /* If it is non-matching list.  */
3181   if (non_match)
3182     bitset_not (sbcset);
3183
3184 #ifdef RE_ENABLE_I18N
3185   /* Ensure only single byte characters are set.  */
3186   if (dfa->mb_cur_max > 1)
3187     bitset_mask (sbcset, dfa->sb_char);
3188
3189   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3190       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3191                                                      || mbcset->non_match)))
3192     {
3193       bin_tree_t *mbc_tree;
3194       int sbc_idx;
3195       /* Build a tree for complex bracket.  */
3196       dfa->has_mb_node = 1;
3197       br_token.type = COMPLEX_BRACKET;
3198       br_token.opr.mbcset = mbcset;
3199       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3200       if (BE (mbc_tree == NULL, 0))
3201         goto parse_bracket_exp_espace;
3202       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3203         if (sbcset[sbc_idx])
3204           break;
3205       /* If there are no bits set in sbcset, there is no point
3206          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3207       if (sbc_idx < BITSET_WORDS)
3208         {
3209           /* Build a tree for simple bracket.  */
3210           br_token.type = SIMPLE_BRACKET;
3211           br_token.opr.sbcset = sbcset;
3212           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3213           if (BE (work_tree == NULL, 0))
3214             goto parse_bracket_exp_espace;
3215
3216           /* Then join them by ALT node.  */
3217           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3218           if (BE (work_tree == NULL, 0))
3219             goto parse_bracket_exp_espace;
3220         }
3221       else
3222         {
3223           re_free (sbcset);
3224           work_tree = mbc_tree;
3225         }
3226     }
3227   else
3228 #endif /* not RE_ENABLE_I18N */
3229     {
3230 #ifdef RE_ENABLE_I18N
3231       free_charset (mbcset);
3232 #endif
3233       /* Build a tree for simple bracket.  */
3234       br_token.type = SIMPLE_BRACKET;
3235       br_token.opr.sbcset = sbcset;
3236       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237       if (BE (work_tree == NULL, 0))
3238         goto parse_bracket_exp_espace;
3239     }
3240   return work_tree;
3241
3242  parse_bracket_exp_espace:
3243   *err = REG_ESPACE;
3244  parse_bracket_exp_free_return:
3245   re_free (sbcset);
3246 #ifdef RE_ENABLE_I18N
3247   free_charset (mbcset);
3248 #endif /* RE_ENABLE_I18N */
3249   return NULL;
3250 }
3251
3252 /* Parse an element in the bracket expression.  */
3253
3254 static reg_errcode_t
3255 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3256                        re_token_t *token, int token_len, re_dfa_t *dfa,
3257                        reg_syntax_t syntax, int accept_hyphen)
3258 {
3259 #ifdef RE_ENABLE_I18N
3260   int cur_char_size;
3261   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3262   if (cur_char_size > 1)
3263     {
3264       elem->type = MB_CHAR;
3265       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3266       re_string_skip_bytes (regexp, cur_char_size);
3267       return REG_NOERROR;
3268     }
3269 #endif /* RE_ENABLE_I18N */
3270   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3271   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3272       || token->type == OP_OPEN_EQUIV_CLASS)
3273     return parse_bracket_symbol (elem, regexp, token);
3274   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3275     {
3276       /* A '-' must only appear as anything but a range indicator before
3277          the closing bracket.  Everything else is an error.  */
3278       re_token_t token2;
3279       (void) peek_token_bracket (&token2, regexp, syntax);
3280       if (token2.type != OP_CLOSE_BRACKET)
3281         /* The actual error value is not standardized since this whole
3282            case is undefined.  But ERANGE makes good sense.  */
3283         return REG_ERANGE;
3284     }
3285   elem->type = SB_CHAR;
3286   elem->opr.ch = token->opr.c;
3287   return REG_NOERROR;
3288 }
3289
3290 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3291    such as [:<character_class>:], [.<collating_element>.], and
3292    [=<equivalent_class>=].  */
3293
3294 static reg_errcode_t
3295 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3296                       re_token_t *token)
3297 {
3298   unsigned char ch, delim = token->opr.c;
3299   int i = 0;
3300   if (re_string_eoi(regexp))
3301     return REG_EBRACK;
3302   for (;; ++i)
3303     {
3304       if (i >= BRACKET_NAME_BUF_SIZE)
3305         return REG_EBRACK;
3306       if (token->type == OP_OPEN_CHAR_CLASS)
3307         ch = re_string_fetch_byte_case (regexp);
3308       else
3309         ch = re_string_fetch_byte (regexp);
3310       if (re_string_eoi(regexp))
3311         return REG_EBRACK;
3312       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3313         break;
3314       elem->opr.name[i] = ch;
3315     }
3316   re_string_skip_bytes (regexp, 1);
3317   elem->opr.name[i] = '\0';
3318   switch (token->type)
3319     {
3320     case OP_OPEN_COLL_ELEM:
3321       elem->type = COLL_SYM;
3322       break;
3323     case OP_OPEN_EQUIV_CLASS:
3324       elem->type = EQUIV_CLASS;
3325       break;
3326     case OP_OPEN_CHAR_CLASS:
3327       elem->type = CHAR_CLASS;
3328       break;
3329     default:
3330       break;
3331     }
3332   return REG_NOERROR;
3333 }
3334
3335   /* Helper function for parse_bracket_exp.
3336      Build the equivalence class which is represented by NAME.
3337      The result are written to MBCSET and SBCSET.
3338      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3339      is a pointer argument sinse we may update it.  */
3340
3341 static reg_errcode_t
3342 #ifdef RE_ENABLE_I18N
3343 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3344                    int *equiv_class_alloc, const unsigned char *name)
3345 #else /* not RE_ENABLE_I18N */
3346 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3347 #endif /* not RE_ENABLE_I18N */
3348 {
3349 #ifdef _LIBC
3350   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3351   if (nrules != 0)
3352     {
3353       const int32_t *table, *indirect;
3354       const unsigned char *weights, *extra, *cp;
3355       unsigned char char_buf[2];
3356       int32_t idx1, idx2;
3357       unsigned int ch;
3358       size_t len;
3359       /* This #include defines a local function!  */
3360 # include <locale/weight.h>
3361       /* Calculate the index for equivalence class.  */
3362       cp = name;
3363       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3364       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3365                                                _NL_COLLATE_WEIGHTMB);
3366       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3367                                                    _NL_COLLATE_EXTRAMB);
3368       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3369                                                 _NL_COLLATE_INDIRECTMB);
3370       idx1 = findidx (&cp);
3371       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3372         /* This isn't a valid character.  */
3373         return REG_ECOLLATE;
3374
3375       /* Build single byte matcing table for this equivalence class.  */
3376       char_buf[1] = (unsigned char) '\0';
3377       len = weights[idx1 & 0xffffff];
3378       for (ch = 0; ch < SBC_MAX; ++ch)
3379         {
3380           char_buf[0] = ch;
3381           cp = char_buf;
3382           idx2 = findidx (&cp);
3383 /*
3384           idx2 = table[ch];
3385 */
3386           if (idx2 == 0)
3387             /* This isn't a valid character.  */
3388             continue;
3389           /* Compare only if the length matches and the collation rule
3390              index is the same.  */
3391           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3392             {
3393               int cnt = 0;
3394
3395               while (cnt <= len &&
3396                      weights[(idx1 & 0xffffff) + 1 + cnt]
3397                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3398                 ++cnt;
3399
3400               if (cnt > len)
3401                 bitset_set (sbcset, ch);
3402             }
3403         }
3404       /* Check whether the array has enough space.  */
3405       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3406         {
3407           /* Not enough, realloc it.  */
3408           /* +1 in case of mbcset->nequiv_classes is 0.  */
3409           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3410           /* Use realloc since the array is NULL if *alloc == 0.  */
3411           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3412                                                    int32_t,
3413                                                    new_equiv_class_alloc);
3414           if (BE (new_equiv_classes == NULL, 0))
3415             return REG_ESPACE;
3416           mbcset->equiv_classes = new_equiv_classes;
3417           *equiv_class_alloc = new_equiv_class_alloc;
3418         }
3419       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3420     }
3421   else
3422 #endif /* _LIBC */
3423     {
3424       if (BE (strlen ((const char *) name) != 1, 0))
3425         return REG_ECOLLATE;
3426       bitset_set (sbcset, *name);
3427     }
3428   return REG_NOERROR;
3429 }
3430
3431   /* Helper function for parse_bracket_exp.
3432      Build the character class which is represented by NAME.
3433      The result are written to MBCSET and SBCSET.
3434      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3435      is a pointer argument sinse we may update it.  */
3436
3437 static reg_errcode_t
3438 #ifdef RE_ENABLE_I18N
3439 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3440                  re_charset_t *mbcset, int *char_class_alloc,
3441                  const unsigned char *class_name, reg_syntax_t syntax)
3442 #else /* not RE_ENABLE_I18N */
3443 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3444                  const unsigned char *class_name, reg_syntax_t syntax)
3445 #endif /* not RE_ENABLE_I18N */
3446 {
3447   int i;
3448   const char *name = (const char *) class_name;
3449
3450   /* In case of REG_ICASE "upper" and "lower" match the both of
3451      upper and lower cases.  */
3452   if ((syntax & RE_ICASE)
3453       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3454     name = "alpha";
3455
3456 #ifdef RE_ENABLE_I18N
3457   /* Check the space of the arrays.  */
3458   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3459     {
3460       /* Not enough, realloc it.  */
3461       /* +1 in case of mbcset->nchar_classes is 0.  */
3462       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3463       /* Use realloc since array is NULL if *alloc == 0.  */
3464       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3465                                                new_char_class_alloc);
3466       if (BE (new_char_classes == NULL, 0))
3467         return REG_ESPACE;
3468       mbcset->char_classes = new_char_classes;
3469       *char_class_alloc = new_char_class_alloc;
3470     }
3471   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3472 #endif /* RE_ENABLE_I18N */
3473
3474 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3475   do {                                          \
3476     if (BE (trans != NULL, 0))                  \
3477       {                                         \
3478         for (i = 0; i < SBC_MAX; ++i)           \
3479           if (ctype_func (i))                   \
3480             bitset_set (sbcset, trans[i]);      \
3481       }                                         \
3482     else                                        \
3483       {                                         \
3484         for (i = 0; i < SBC_MAX; ++i)           \
3485           if (ctype_func (i))                   \
3486             bitset_set (sbcset, i);             \
3487       }                                         \
3488   } while (0)
3489
3490   if (strcmp (name, "alnum") == 0)
3491     BUILD_CHARCLASS_LOOP (isalnum);
3492   else if (strcmp (name, "cntrl") == 0)
3493     BUILD_CHARCLASS_LOOP (iscntrl);
3494   else if (strcmp (name, "lower") == 0)
3495     BUILD_CHARCLASS_LOOP (islower);
3496   else if (strcmp (name, "space") == 0)
3497     BUILD_CHARCLASS_LOOP (isspace);
3498   else if (strcmp (name, "alpha") == 0)
3499     BUILD_CHARCLASS_LOOP (isalpha);
3500   else if (strcmp (name, "digit") == 0)
3501     BUILD_CHARCLASS_LOOP (isdigit);
3502   else if (strcmp (name, "print") == 0)
3503     BUILD_CHARCLASS_LOOP (isprint);
3504   else if (strcmp (name, "upper") == 0)
3505     BUILD_CHARCLASS_LOOP (isupper);
3506   else if (strcmp (name, "blank") == 0)
3507     BUILD_CHARCLASS_LOOP (isblank);
3508   else if (strcmp (name, "graph") == 0)
3509     BUILD_CHARCLASS_LOOP (isgraph);
3510   else if (strcmp (name, "punct") == 0)
3511     BUILD_CHARCLASS_LOOP (ispunct);
3512   else if (strcmp (name, "xdigit") == 0)
3513     BUILD_CHARCLASS_LOOP (isxdigit);
3514   else
3515     return REG_ECTYPE;
3516
3517   return REG_NOERROR;
3518 }
3519
3520 static bin_tree_t *
3521 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3522                     const unsigned char *class_name,
3523                     const unsigned char *extra, int non_match,
3524                     reg_errcode_t *err)
3525 {
3526   re_bitset_ptr_t sbcset;
3527 #ifdef RE_ENABLE_I18N
3528   re_charset_t *mbcset;
3529   int alloc = 0;
3530 #endif /* not RE_ENABLE_I18N */
3531   reg_errcode_t ret;
3532   re_token_t br_token;
3533   bin_tree_t *tree;
3534
3535   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3536 #ifdef RE_ENABLE_I18N
3537   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3538 #endif /* RE_ENABLE_I18N */
3539
3540 #ifdef RE_ENABLE_I18N
3541   if (BE (sbcset == NULL || mbcset == NULL, 0))
3542 #else /* not RE_ENABLE_I18N */
3543   if (BE (sbcset == NULL, 0))
3544 #endif /* not RE_ENABLE_I18N */
3545     {
3546       *err = REG_ESPACE;
3547       return NULL;
3548     }
3549
3550   if (non_match)
3551     {
3552 #ifdef RE_ENABLE_I18N
3553       mbcset->non_match = 1;
3554 #endif /* not RE_ENABLE_I18N */
3555     }
3556
3557   /* We don't care the syntax in this case.  */
3558   ret = build_charclass (trans, sbcset,
3559 #ifdef RE_ENABLE_I18N
3560                          mbcset, &alloc,
3561 #endif /* RE_ENABLE_I18N */
3562                          class_name, 0);
3563
3564   if (BE (ret != REG_NOERROR, 0))
3565     {
3566       re_free (sbcset);
3567 #ifdef RE_ENABLE_I18N
3568       free_charset (mbcset);
3569 #endif /* RE_ENABLE_I18N */
3570       *err = ret;
3571       return NULL;
3572     }
3573   /* \w match '_' also.  */
3574   for (; *extra; extra++)
3575     bitset_set (sbcset, *extra);
3576
3577   /* If it is non-matching list.  */
3578   if (non_match)
3579     bitset_not (sbcset);
3580
3581 #ifdef RE_ENABLE_I18N
3582   /* Ensure only single byte characters are set.  */
3583   if (dfa->mb_cur_max > 1)
3584     bitset_mask (sbcset, dfa->sb_char);
3585 #endif
3586
3587   /* Build a tree for simple bracket.  */
3588   br_token.type = SIMPLE_BRACKET;
3589   br_token.opr.sbcset = sbcset;
3590   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591   if (BE (tree == NULL, 0))
3592     goto build_word_op_espace;
3593
3594 #ifdef RE_ENABLE_I18N
3595   if (dfa->mb_cur_max > 1)
3596     {
3597       bin_tree_t *mbc_tree;
3598       /* Build a tree for complex bracket.  */
3599       br_token.type = COMPLEX_BRACKET;
3600       br_token.opr.mbcset = mbcset;
3601       dfa->has_mb_node = 1;
3602       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3603       if (BE (mbc_tree == NULL, 0))
3604         goto build_word_op_espace;
3605       /* Then join them by ALT node.  */
3606       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3607       if (BE (mbc_tree != NULL, 1))
3608         return tree;
3609     }
3610   else
3611     {
3612       free_charset (mbcset);
3613       return tree;
3614     }
3615 #else /* not RE_ENABLE_I18N */
3616   return tree;
3617 #endif /* not RE_ENABLE_I18N */
3618
3619  build_word_op_espace:
3620   re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622   free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624   *err = REG_ESPACE;
3625   return NULL;
3626 }
3627
3628 /* This is intended for the expressions like "a{1,3}".
3629    Fetch a number from `input', and return the number.
3630    Return -1, if the number field is empty like "{,1}".
3631    Return -2, If an error is occured.  */
3632
3633 static int
3634 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3635 {
3636   int num = -1;
3637   unsigned char c;
3638   while (1)
3639     {
3640       fetch_token (token, input, syntax);
3641       c = token->opr.c;
3642       if (BE (token->type == END_OF_RE, 0))
3643         return -2;
3644       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3645         break;
3646       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3647              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3648       num = (num > RE_DUP_MAX) ? -2 : num;
3649     }
3650   return num;
3651 }
3652 \f
3653 #ifdef RE_ENABLE_I18N
3654 static void
3655 free_charset (re_charset_t *cset)
3656 {
3657   re_free (cset->mbchars);
3658 # ifdef _LIBC
3659   re_free (cset->coll_syms);
3660   re_free (cset->equiv_classes);
3661   re_free (cset->range_starts);
3662   re_free (cset->range_ends);
3663 # endif
3664   re_free (cset->char_classes);
3665   re_free (cset);
3666 }
3667 #endif /* RE_ENABLE_I18N */
3668 \f
3669 /* Functions for binary tree operation.  */
3670
3671 /* Create a tree node.  */
3672
3673 static bin_tree_t *
3674 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3675              re_token_type_t type)
3676 {
3677   re_token_t t;
3678   t.type = type;
3679   return create_token_tree (dfa, left, right, &t);
3680 }
3681
3682 static bin_tree_t *
3683 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3684                    const re_token_t *token)
3685 {
3686   bin_tree_t *tree;
3687   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3688     {
3689       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3690
3691       if (storage == NULL)
3692         return NULL;
3693       storage->next = dfa->str_tree_storage;
3694       dfa->str_tree_storage = storage;
3695       dfa->str_tree_storage_idx = 0;
3696     }
3697   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3698
3699   tree->parent = NULL;
3700   tree->left = left;
3701   tree->right = right;
3702   tree->token = *token;
3703   tree->token.duplicated = 0;
3704   tree->token.opt_subexp = 0;
3705   tree->first = NULL;
3706   tree->next = NULL;
3707   tree->node_idx = -1;
3708
3709   if (left != NULL)
3710     left->parent = tree;
3711   if (right != NULL)
3712     right->parent = tree;
3713   return tree;
3714 }
3715
3716 /* Mark the tree SRC as an optional subexpression.
3717    To be called from preorder or postorder.  */
3718
3719 static reg_errcode_t
3720 mark_opt_subexp (void *extra, bin_tree_t *node)
3721 {
3722   int idx = (int) (long) extra;
3723   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3724     node->token.opt_subexp = 1;
3725
3726   return REG_NOERROR;
3727 }
3728
3729 /* Free the allocated memory inside NODE. */
3730
3731 static void
3732 free_token (re_token_t *node)
3733 {
3734 #ifdef RE_ENABLE_I18N
3735   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3736     free_charset (node->opr.mbcset);
3737   else
3738 #endif /* RE_ENABLE_I18N */
3739     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3740       re_free (node->opr.sbcset);
3741 }
3742
3743 /* Worker function for tree walking.  Free the allocated memory inside NODE
3744    and its children. */
3745
3746 static reg_errcode_t
3747 free_tree (void *extra, bin_tree_t *node)
3748 {
3749   free_token (&node->token);
3750   return REG_NOERROR;
3751 }
3752
3753
3754 /* Duplicate the node SRC, and return new node.  This is a preorder
3755    visit similar to the one implemented by the generic visitor, but
3756    we need more infrastructure to maintain two parallel trees --- so,
3757    it's easier to duplicate.  */
3758
3759 static bin_tree_t *
3760 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3761 {
3762   const bin_tree_t *node;
3763   bin_tree_t *dup_root;
3764   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3765
3766   for (node = root; ; )
3767     {
3768       /* Create a new tree and link it back to the current parent.  */
3769       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3770       if (*p_new == NULL)
3771         return NULL;
3772       (*p_new)->parent = dup_node;
3773       (*p_new)->token.duplicated = 1;
3774       dup_node = *p_new;
3775
3776       /* Go to the left node, or up and to the right.  */
3777       if (node->left)
3778         {
3779           node = node->left;
3780           p_new = &dup_node->left;
3781         }
3782       else
3783         {
3784           const bin_tree_t *prev = NULL;
3785           while (node->right == prev || node->right == NULL)
3786             {
3787               prev = node;
3788               node = node->parent;
3789               dup_node = dup_node->parent;
3790               if (!node)
3791                 return dup_root;
3792             }
3793           node = node->right;
3794           p_new = &dup_node->right;
3795         }
3796     }
3797 }