posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 {
 219   reg_errcode_t ret;
 220
 221   /* And GNU code determines whether or not to get register information
 222      by passing null for the REGS argument to re_match, etc., not by
 223      setting no_sub, unless RE_NO_SUB is set.  */
 224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 225
 226   /* Match anchors at newline.  */
 227   bufp->newline_anchor = 1;
 228
 229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 230
 231   if (!ret)
 232     return NULL;
 233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 234 }
 235 #ifdef _LIBC
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237 #endif
 238
 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 240    also be assigned to arbitrarily: each pattern buffer stores its own
 241    syntax, so it can be changed between regex compilations.  */
 242 /* This has no initializer because initialized variables in Emacs
 243    become read-only after dumping.  */
 244 reg_syntax_t re_syntax_options;
 245
 246
 247 /* Specify the precise syntax of regexps for compilation.  This provides
 248    for compatibility for various utilities which historically have
 249    different, incompatible syntaxes.
 250
 251    The argument SYNTAX is a bit mask comprised of the various bits
 252    defined in regex.h.  We return the old syntax.  */
 253
 254 reg_syntax_t
 255 re_set_syntax (syntax)
 256     reg_syntax_t syntax;
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (bufp)
 269     struct re_pattern_buffer *bufp;
 270 {
 271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 272   char *fastmap = bufp->fastmap;
 273
 274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 276   if (dfa->init_state != dfa->init_state_word)
 277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 278   if (dfa->init_state != dfa->init_state_nl)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 280   if (dfa->init_state != dfa->init_state_begbuf)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 282   bufp->fastmap_accurate = 1;
 283   return 0;
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 287 #endif
 288
 289 static inline void
 290 __attribute ((always_inline))
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 303                          char *fastmap)
 304 {
 305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 306   int node_cnt;
 307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 309     {
 310       int node = init_state->nodes.elems[node_cnt];
 311       re_token_type_t type = dfa->nodes[node].type;
 312
 313       if (type == CHARACTER)
 314         {
 315           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 316 #ifdef RE_ENABLE_I18N
 317           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 318             {
 319               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (mbrtowc (&wc, (const char *) buf, p - buf,
 331                            &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, 0, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           int i;
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 356               || cset->nranges || cset->nchar_classes)
 357             {
 358 # ifdef _LIBC
 359               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 360                 {
 361                   /* In this case we want to catch the bytes which are
 362                      the first byte of any collation elements.
 363                      e.g. In da_DK, we want to catch 'a' since "aa"
 364                           is a valid collation element, and don't catch
 365                           'b' since 'b' is the only collation element
 366                           which starts from 'b'.  */
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # else
 374               if (dfa->mb_cur_max > 1)
 375                 for (i = 0; i < SBC_MAX; ++i)
 376                   if (__btowc (i) == WEOF)
 377                     re_set_fastmap (fastmap, icase, i);
 378 # endif /* not _LIBC */
 379             }
 380           for (i = 0; i < cset->nmbchars; ++i)
 381             {
 382               char buf[256];
 383               mbstate_t state;
 384               memset (&state, '\0', sizeof (state));
 385               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 386                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 387               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 388                 {
 389                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 390                       != (size_t) -1)
 391                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 392                 }
 393             }
 394         }
 395 #endif /* RE_ENABLE_I18N */
 396       else if (type == OP_PERIOD
 397 #ifdef RE_ENABLE_I18N
 398                || type == OP_UTF8_PERIOD
 399 #endif /* RE_ENABLE_I18N */
 400                || type == END_OF_RE)
 401         {
 402           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 403           if (type == END_OF_RE)
 404             bufp->can_be_null = 1;
 405           return;
 406         }
 407     }
 408 }
 409 \f
 410 /* Entry point for POSIX code.  */
 411 /* regcomp takes a regular expression as a string and compiles it.
 412
 413    PREG is a regex_t *.  We do not expect any fields to be initialized,
 414    since POSIX says we shouldn't.  Thus, we set
 415
 416      `buffer' to the compiled pattern;
 417      `used' to the length of the compiled pattern;
 418      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 419        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 420        RE_SYNTAX_POSIX_BASIC;
 421      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 422      `fastmap' to an allocated space for the fastmap;
 423      `fastmap_accurate' to zero;
 424      `re_nsub' to the number of subexpressions in PATTERN.
 425
 426    PATTERN is the address of the pattern string.
 427
 428    CFLAGS is a series of bits which affect compilation.
 429
 430      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 431      use POSIX basic syntax.
 432
 433      If REG_NEWLINE is set, then . and [^...] don't match newline.
 434      Also, regexec will try a match beginning after every newline.
 435
 436      If REG_ICASE is set, then we considers upper- and lowercase
 437      versions of letters to be equivalent when matching.
 438
 439      If REG_NOSUB is set, then when PREG is passed to regexec, that
 440      routine will report only success or failure, and nothing about the
 441      registers.
 442
 443    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 444    the return codes and their meanings.)  */
 445
 446 int
 447 regcomp (preg, pattern, cflags)
 448     regex_t *__restrict preg;
 449     const char *__restrict pattern;
 450     int cflags;
 451 {
 452   reg_errcode_t ret;
 453   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 454                          : RE_SYNTAX_POSIX_BASIC);
 455
 456   preg->buffer = NULL;
 457   preg->allocated = 0;
 458   preg->used = 0;
 459
 460   /* Try to allocate space for the fastmap.  */
 461   preg->fastmap = re_malloc (char, SBC_MAX);
 462   if (BE (preg->fastmap == NULL, 0))
 463     return REG_ESPACE;
 464
 465   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 466
 467   /* If REG_NEWLINE is set, newlines are treated differently.  */
 468   if (cflags & REG_NEWLINE)
 469     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 470       syntax &= ~RE_DOT_NEWLINE;
 471       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 472       /* It also changes the matching behavior.  */
 473       preg->newline_anchor = 1;
 474     }
 475   else
 476     preg->newline_anchor = 0;
 477   preg->no_sub = !!(cflags & REG_NOSUB);
 478   preg->translate = NULL;
 479
 480   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 481
 482   /* POSIX doesn't distinguish between an unmatched open-group and an
 483      unmatched close-group: both are REG_EPAREN.  */
 484   if (ret == REG_ERPAREN)
 485     ret = REG_EPAREN;
 486
 487   /* We have already checked preg->fastmap != NULL.  */
 488   if (BE (ret == REG_NOERROR, 1))
 489     /* Compute the fastmap now, since regexec cannot modify the pattern
 490        buffer.  This function never fails in this implementation.  */
 491     (void) re_compile_fastmap (preg);
 492   else
 493     {
 494       /* Some error occurred while compiling the expression.  */
 495       re_free (preg->fastmap);
 496       preg->fastmap = NULL;
 497     }
 498
 499   return (int) ret;
 500 }
 501 #ifdef _LIBC
 502 weak_alias (__regcomp, regcomp)
 503 #endif
 504
 505 /* Returns a message corresponding to an error code, ERRCODE, returned
 506    from either regcomp or regexec.   We don't use PREG here.  */
 507
 508 size_t
 509 regerror (errcode, preg, errbuf, errbuf_size)
 510     int errcode;
 511     const regex_t *__restrict preg;
 512     char *__restrict errbuf;
 513     size_t errbuf_size;
 514 {
 515   const char *msg;
 516   size_t msg_size;
 517
 518   if (BE (errcode < 0
 519           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 520                                / sizeof (__re_error_msgid_idx[0])), 0))
 521     /* Only error codes returned by the rest of the code should be passed
 522        to this routine.  If we are given anything else, or if other regex
 523        code generates an invalid error code, then the program has a bug.
 524        Dump core so we can fix it.  */
 525     abort ();
 526
 527   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 528
 529   msg_size = strlen (msg) + 1; /* Includes the null.  */
 530
 531   if (BE (errbuf_size != 0, 1))
 532     {
 533       if (BE (msg_size > errbuf_size, 0))
 534         {
 535 #if defined HAVE_MEMPCPY || defined _LIBC
 536           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 537 #else
 538           memcpy (errbuf, msg, errbuf_size - 1);
 539           errbuf[errbuf_size - 1] = 0;
 540 #endif
 541         }
 542       else
 543         memcpy (errbuf, msg, msg_size);
 544     }
 545
 546   return msg_size;
 547 }
 548 #ifdef _LIBC
 549 weak_alias (__regerror, regerror)
 550 #endif
 551
 552
 553 #ifdef RE_ENABLE_I18N
 554 /* This static array is used for the map to single-byte characters when
 555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 556    it the same all the time.  UTF-8 is the preferred encoding so this is
 557    a worthwhile optimization.  */
 558 static const bitset_t utf8_sb_map =
 559 {
 560   /* Set the first 128 bits.  */
 561   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 562 };
 563 #endif
 564
 565
 566 static void
 567 free_dfa_content (re_dfa_t *dfa)
 568 {
 569   int i, j;
 570
 571   if (dfa->nodes)
 572     for (i = 0; i < dfa->nodes_len; ++i)
 573       free_token (dfa->nodes + i);
 574   re_free (dfa->nexts);
 575   for (i = 0; i < dfa->nodes_len; ++i)
 576     {
 577       if (dfa->eclosures != NULL)
 578         re_node_set_free (dfa->eclosures + i);
 579       if (dfa->inveclosures != NULL)
 580         re_node_set_free (dfa->inveclosures + i);
 581       if (dfa->edests != NULL)
 582         re_node_set_free (dfa->edests + i);
 583     }
 584   re_free (dfa->edests);
 585   re_free (dfa->eclosures);
 586   re_free (dfa->inveclosures);
 587   re_free (dfa->nodes);
 588
 589   if (dfa->state_table)
 590     for (i = 0; i <= dfa->state_hash_mask; ++i)
 591       {
 592         struct re_state_table_entry *entry = dfa->state_table + i;
 593         for (j = 0; j < entry->num; ++j)
 594           {
 595             re_dfastate_t *state = entry->array[j];
 596             free_state (state);
 597           }
 598         re_free (entry->array);
 599       }
 600   re_free (dfa->state_table);
 601 #ifdef RE_ENABLE_I18N
 602   if (dfa->sb_char != utf8_sb_map)
 603     re_free (dfa->sb_char);
 604 #endif
 605   re_free (dfa->subexp_map);
 606 #ifdef DEBUG
 607   re_free (dfa->re_str);
 608 #endif
 609
 610   re_free (dfa);
 611 }
 612
 613
 614 /* Free dynamically allocated space used by PREG.  */
 615
 616 void
 617 regfree (preg)
 618     regex_t *preg;
 619 {
 620   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 621   if (BE (dfa != NULL, 1))
 622     free_dfa_content (dfa);
 623   preg->buffer = NULL;
 624   preg->allocated = 0;
 625
 626   re_free (preg->fastmap);
 627   preg->fastmap = NULL;
 628
 629   re_free (preg->translate);
 630   preg->translate = NULL;
 631 }
 632 #ifdef _LIBC
 633 weak_alias (__regfree, regfree)
 634 #endif
 635 \f
 636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 637    them unless specifically requested.  */
 638
 639 #if defined _REGEX_RE_COMP || defined _LIBC
 640
 641 /* BSD has one and only one pattern buffer.  */
 642 static struct re_pattern_buffer re_comp_buf;
 643
 644 char *
 645 # ifdef _LIBC
 646 /* Make these definitions weak in libc, so POSIX programs can redefine
 647    these names if they don't use our functions, and still use
 648    regcomp/regexec above without link errors.  */
 649 weak_function
 650 # endif
 651 re_comp (s)
 652      const char *s;
 653 {
 654   reg_errcode_t ret;
 655   char *fastmap;
 656
 657   if (!s)
 658     {
 659       if (!re_comp_buf.buffer)
 660         return gettext ("No previous regular expression");
 661       return 0;
 662     }
 663
 664   if (re_comp_buf.buffer)
 665     {
 666       fastmap = re_comp_buf.fastmap;
 667       re_comp_buf.fastmap = NULL;
 668       __regfree (&re_comp_buf);
 669       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 670       re_comp_buf.fastmap = fastmap;
 671     }
 672
 673   if (re_comp_buf.fastmap == NULL)
 674     {
 675       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 676       if (re_comp_buf.fastmap == NULL)
 677         return (char *) gettext (__re_error_msgid
 678                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 679     }
 680
 681   /* Since `re_exec' always passes NULL for the `regs' argument, we
 682      don't need to initialize the pattern buffer fields which affect it.  */
 683
 684   /* Match anchors at newlines.  */
 685   re_comp_buf.newline_anchor = 1;
 686
 687   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 688
 689   if (!ret)
 690     return NULL;
 691
 692   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 693   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 694 }
 695
 696 #ifdef _LIBC
 697 libc_freeres_fn (free_mem)
 698 {
 699   __regfree (&re_comp_buf);
 700 }
 701 #endif
 702
 703 #endif /* _REGEX_RE_COMP */
 704 \f
 705 /* Internal entry point.
 706    Compile the regular expression PATTERN, whose length is LENGTH.
 707    SYNTAX indicate regular expression's syntax.  */
 708
 709 static reg_errcode_t
 710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 711                      reg_syntax_t syntax)
 712 {
 713   reg_errcode_t err = REG_NOERROR;
 714   re_dfa_t *dfa;
 715   re_string_t regexp;
 716
 717   /* Initialize the pattern buffer.  */
 718   preg->fastmap_accurate = 0;
 719   preg->syntax = syntax;
 720   preg->not_bol = preg->not_eol = 0;
 721   preg->used = 0;
 722   preg->re_nsub = 0;
 723   preg->can_be_null = 0;
 724   preg->regs_allocated = REGS_UNALLOCATED;
 725
 726   /* Initialize the dfa.  */
 727   dfa = (re_dfa_t *) preg->buffer;
 728   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 729     {
 730       /* If zero allocated, but buffer is non-null, try to realloc
 731          enough space.  This loses if buffer's address is bogus, but
 732          that is the user's responsibility.  If ->buffer is NULL this
 733          is a simple allocation.  */
 734       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 735       if (dfa == NULL)
 736         return REG_ESPACE;
 737       preg->allocated = sizeof (re_dfa_t);
 738       preg->buffer = (unsigned char *) dfa;
 739     }
 740   preg->used = sizeof (re_dfa_t);
 741
 742   err = init_dfa (dfa, length);
 743   if (BE (err != REG_NOERROR, 0))
 744     {
 745       free_dfa_content (dfa);
 746       preg->buffer = NULL;
 747       preg->allocated = 0;
 748       return err;
 749     }
 750 #ifdef DEBUG
 751   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 752   dfa->re_str = re_malloc (char, length + 1);
 753   strncpy (dfa->re_str, pattern, length + 1);
 754 #endif
 755
 756   __libc_lock_init (dfa->lock);
 757
 758   err = re_string_construct (&regexp, pattern, length, preg->translate,
 759                              syntax & RE_ICASE, dfa);
 760   if (BE (err != REG_NOERROR, 0))
 761     {
 762     re_compile_internal_free_return:
 763       free_workarea_compile (preg);
 764       re_string_destruct (&regexp);
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770
 771   /* Parse the regular expression, and build a structure tree.  */
 772   preg->re_nsub = 0;
 773   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 774   if (BE (dfa->str_tree == NULL, 0))
 775     goto re_compile_internal_free_return;
 776
 777   /* Analyze the tree and create the nfa.  */
 778   err = analyze (preg);
 779   if (BE (err != REG_NOERROR, 0))
 780     goto re_compile_internal_free_return;
 781
 782 #ifdef RE_ENABLE_I18N
 783   /* If possible, do searching in single byte encoding to speed things up.  */
 784   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 785     optimize_utf8 (dfa);
 786 #endif
 787
 788   /* Then create the initial state of the dfa.  */
 789   err = create_initial_state (dfa);
 790
 791   /* Release work areas.  */
 792   free_workarea_compile (preg);
 793   re_string_destruct (&regexp);
 794
 795   if (BE (err != REG_NOERROR, 0))
 796     {
 797       free_dfa_content (dfa);
 798       preg->buffer = NULL;
 799       preg->allocated = 0;
 800     }
 801
 802   return err;
 803 }
 804
 805 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 806    as the initial length of some arrays.  */
 807
 808 static reg_errcode_t
 809 init_dfa (re_dfa_t *dfa, size_t pat_len)
 810 {
 811   unsigned int table_size;
 812 #ifndef _LIBC
 813   char *codeset_name;
 814 #endif
 815
 816   memset (dfa, '\0', sizeof (re_dfa_t));
 817
 818   /* Force allocation of str_tree_storage the first time.  */
 819   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 820
 821   /* Avoid overflows.  */
 822   if (pat_len == SIZE_MAX)
 823     return REG_ESPACE;
 824
 825   dfa->nodes_alloc = pat_len + 1;
 826   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 827
 828   /*  table_size = 2 ^ ceil(log pat_len) */
 829   for (table_size = 1; ; table_size <<= 1)
 830     if (table_size > pat_len)
 831       break;
 832
 833   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 834   dfa->state_hash_mask = table_size - 1;
 835
 836   dfa->mb_cur_max = MB_CUR_MAX;
 837 #ifdef _LIBC
 838   if (dfa->mb_cur_max == 6
 839       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 840     dfa->is_utf8 = 1;
 841   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 842                        != 0);
 843 #else
 844 # ifdef HAVE_LANGINFO_CODESET
 845   codeset_name = nl_langinfo (CODESET);
 846 # else
 847   codeset_name = getenv ("LC_ALL");
 848   if (codeset_name == NULL || codeset_name[0] == '\0')
 849     codeset_name = getenv ("LC_CTYPE");
 850   if (codeset_name == NULL || codeset_name[0] == '\0')
 851     codeset_name = getenv ("LANG");
 852   if (codeset_name == NULL)
 853     codeset_name = "";
 854   else if (strchr (codeset_name, '.') !=  NULL)
 855     codeset_name = strchr (codeset_name, '.') + 1;
 856 # endif
 857
 858   if (strcasecmp (codeset_name, "UTF-8") == 0
 859       || strcasecmp (codeset_name, "UTF8") == 0)
 860     dfa->is_utf8 = 1;
 861
 862   /* We check exhaustively in the loop below if this charset is a
 863      superset of ASCII.  */
 864   dfa->map_notascii = 0;
 865 #endif
 866
 867 #ifdef RE_ENABLE_I18N
 868   if (dfa->mb_cur_max > 1)
 869     {
 870       if (dfa->is_utf8)
 871         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 872       else
 873         {
 874           int i, j, ch;
 875
 876           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 877           if (BE (dfa->sb_char == NULL, 0))
 878             return REG_ESPACE;
 879
 880           /* Set the bits corresponding to single byte chars.  */
 881           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 882             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 883               {
 884                 wint_t wch = __btowc (ch);
 885                 if (wch != WEOF)
 886                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 887 # ifndef _LIBC
 888                 if (isascii (ch) && wch != ch)
 889                   dfa->map_notascii = 1;
 890 # endif
 891               }
 892         }
 893     }
 894 #endif
 895
 896   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 897     return REG_ESPACE;
 898   return REG_NOERROR;
 899 }
 900
 901 /* Initialize WORD_CHAR table, which indicate which character is
 902    "word".  In this case "word" means that it is the word construction
 903    character used by some operators like "\<", "\>", etc.  */
 904
 905 static void
 906 internal_function
 907 init_word_char (re_dfa_t *dfa)
 908 {
 909   int i, j, ch;
 910   dfa->word_ops_used = 1;
 911   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 912     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 913       if (isalnum (ch) || ch == '_')
 914         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 915 }
 916
 917 /* Free the work area which are only used while compiling.  */
 918
 919 static void
 920 free_workarea_compile (regex_t *preg)
 921 {
 922   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 923   bin_tree_storage_t *storage, *next;
 924   for (storage = dfa->str_tree_storage; storage; storage = next)
 925     {
 926       next = storage->next;
 927       re_free (storage);
 928     }
 929   dfa->str_tree_storage = NULL;
 930   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 931   dfa->str_tree = NULL;
 932   re_free (dfa->org_indices);
 933   dfa->org_indices = NULL;
 934 }
 935
 936 /* Create initial states for all contexts.  */
 937
 938 static reg_errcode_t
 939 create_initial_state (re_dfa_t *dfa)
 940 {
 941   int first, i;
 942   reg_errcode_t err;
 943   re_node_set init_nodes;
 944
 945   /* Initial states have the epsilon closure of the node which is
 946      the first node of the regular expression.  */
 947   first = dfa->str_tree->first->node_idx;
 948   dfa->init_node = first;
 949   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 950   if (BE (err != REG_NOERROR, 0))
 951     return err;
 952
 953   /* The back-references which are in initial states can epsilon transit,
 954      since in this case all of the subexpressions can be null.
 955      Then we add epsilon closures of the nodes which are the next nodes of
 956      the back-references.  */
 957   if (dfa->nbackref > 0)
 958     for (i = 0; i < init_nodes.nelem; ++i)
 959       {
 960         int node_idx = init_nodes.elems[i];
 961         re_token_type_t type = dfa->nodes[node_idx].type;
 962
 963         int clexp_idx;
 964         if (type != OP_BACK_REF)
 965           continue;
 966         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 967           {
 968             re_token_t *clexp_node;
 969             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 970             if (clexp_node->type == OP_CLOSE_SUBEXP
 971                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 972               break;
 973           }
 974         if (clexp_idx == init_nodes.nelem)
 975           continue;
 976
 977         if (type == OP_BACK_REF)
 978           {
 979             int dest_idx = dfa->edests[node_idx].elems[0];
 980             if (!re_node_set_contains (&init_nodes, dest_idx))
 981               {
 982                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 983                 i = 0;
 984               }
 985           }
 986       }
 987
 988   /* It must be the first time to invoke acquire_state.  */
 989   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 990   /* We don't check ERR here, since the initial state must not be NULL.  */
 991   if (BE (dfa->init_state == NULL, 0))
 992     return err;
 993   if (dfa->init_state->has_constraint)
 994     {
 995       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 996                                                        CONTEXT_WORD);
 997       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 998                                                      CONTEXT_NEWLINE);
 999       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1000                                                          &init_nodes,
1001                                                          CONTEXT_NEWLINE
1002                                                          | CONTEXT_BEGBUF);
1003       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1004               || dfa->init_state_begbuf == NULL, 0))
1005         return err;
1006     }
1007   else
1008     dfa->init_state_word = dfa->init_state_nl
1009       = dfa->init_state_begbuf = dfa->init_state;
1010
1011   re_node_set_free (&init_nodes);
1012   return REG_NOERROR;
1013 }
1014 \f
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018    DFA nodes where needed.  */
1019
1020 static void
1021 optimize_utf8 (re_dfa_t *dfa)
1022 {
1023   int node, i, mb_chars = 0, has_period = 0;
1024
1025   for (node = 0; node < dfa->nodes_len; ++node)
1026     switch (dfa->nodes[node].type)
1027       {
1028       case CHARACTER:
1029         if (dfa->nodes[node].opr.c >= 0x80)
1030           mb_chars = 1;
1031         break;
1032       case ANCHOR:
1033         switch (dfa->nodes[node].opr.idx)
1034           {
1035           case LINE_FIRST:
1036           case LINE_LAST:
1037           case BUF_FIRST:
1038           case BUF_LAST:
1039             break;
1040           default:
1041             /* Word anchors etc. cannot be handled.  */
1042             return;
1043           }
1044         break;
1045       case OP_PERIOD:
1046         has_period = 1;
1047         break;
1048       case OP_BACK_REF:
1049       case OP_ALT:
1050       case END_OF_RE:
1051       case OP_DUP_ASTERISK:
1052       case OP_OPEN_SUBEXP:
1053       case OP_CLOSE_SUBEXP:
1054         break;
1055       case COMPLEX_BRACKET:
1056         return;
1057       case SIMPLE_BRACKET:
1058         /* Just double check.  The non-ASCII range starts at 0x80.  */
1059         assert (0x80 % BITSET_WORD_BITS == 0);
1060         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061           if (dfa->nodes[node].opr.sbcset[i])
1062             return;
1063         break;
1064       default:
1065         abort ();
1066       }
1067
1068   if (mb_chars || has_period)
1069     for (node = 0; node < dfa->nodes_len; ++node)
1070       {
1071         if (dfa->nodes[node].type == CHARACTER
1072             && dfa->nodes[node].opr.c >= 0x80)
1073           dfa->nodes[node].mb_partial = 0;
1074         else if (dfa->nodes[node].type == OP_PERIOD)
1075           dfa->nodes[node].type = OP_UTF8_PERIOD;
1076       }
1077
1078   /* The search can be in single byte locale.  */
1079   dfa->mb_cur_max = 1;
1080   dfa->is_utf8 = 0;
1081   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1082 }
1083 #endif
1084 \f
1085 /* Analyze the structure tree, and calculate "first", "next", "edest",
1086    "eclosure", and "inveclosure".  */
1087
1088 static reg_errcode_t
1089 analyze (regex_t *preg)
1090 {
1091   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1092   reg_errcode_t ret;
1093
1094   /* Allocate arrays.  */
1095   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1096   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1097   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1098   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1099   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1100           || dfa->eclosures == NULL, 0))
1101     return REG_ESPACE;
1102
1103   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1104   if (dfa->subexp_map != NULL)
1105     {
1106       int i;
1107       for (i = 0; i < preg->re_nsub; i++)
1108         dfa->subexp_map[i] = i;
1109       preorder (dfa->str_tree, optimize_subexps, dfa);
1110       for (i = 0; i < preg->re_nsub; i++)
1111         if (dfa->subexp_map[i] != i)
1112           break;
1113       if (i == preg->re_nsub)
1114         {
1115           free (dfa->subexp_map);
1116           dfa->subexp_map = NULL;
1117         }
1118     }
1119
1120   ret = postorder (dfa->str_tree, lower_subexps, preg);
1121   if (BE (ret != REG_NOERROR, 0))
1122     return ret;
1123   ret = postorder (dfa->str_tree, calc_first, dfa);
1124   if (BE (ret != REG_NOERROR, 0))
1125     return ret;
1126   preorder (dfa->str_tree, calc_next, dfa);
1127   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1128   if (BE (ret != REG_NOERROR, 0))
1129     return ret;
1130   ret = calc_eclosure (dfa);
1131   if (BE (ret != REG_NOERROR, 0))
1132     return ret;
1133
1134   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1135      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1136   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1137       || dfa->nbackref)
1138     {
1139       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1140       if (BE (dfa->inveclosures == NULL, 0))
1141         return REG_ESPACE;
1142       ret = calc_inveclosure (dfa);
1143     }
1144
1145   return ret;
1146 }
1147
1148 /* Our parse trees are very unbalanced, so we cannot use a stack to
1149    implement parse tree visits.  Instead, we use parent pointers and
1150    some hairy code in these two functions.  */
1151 static reg_errcode_t
1152 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1153            void *extra)
1154 {
1155   bin_tree_t *node, *prev;
1156
1157   for (node = root; ; )
1158     {
1159       /* Descend down the tree, preferably to the left (or to the right
1160          if that's the only child).  */
1161       while (node->left || node->right)
1162         if (node->left)
1163           node = node->left;
1164         else
1165           node = node->right;
1166
1167       do
1168         {
1169           reg_errcode_t err = fn (extra, node);
1170           if (BE (err != REG_NOERROR, 0))
1171             return err;
1172           if (node->parent == NULL)
1173             return REG_NOERROR;
1174           prev = node;
1175           node = node->parent;
1176         }
1177       /* Go up while we have a node that is reached from the right.  */
1178       while (node->right == prev || node->right == NULL);
1179       node = node->right;
1180     }
1181 }
1182
1183 static reg_errcode_t
1184 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1185           void *extra)
1186 {
1187   bin_tree_t *node;
1188
1189   for (node = root; ; )
1190     {
1191       reg_errcode_t err = fn (extra, node);
1192       if (BE (err != REG_NOERROR, 0))
1193         return err;
1194
1195       /* Go to the left node, or up and to the right.  */
1196       if (node->left)
1197         node = node->left;
1198       else
1199         {
1200           bin_tree_t *prev = NULL;
1201           while (node->right == prev || node->right == NULL)
1202             {
1203               prev = node;
1204               node = node->parent;
1205               if (!node)
1206                 return REG_NOERROR;
1207             }
1208           node = node->right;
1209         }
1210     }
1211 }
1212
1213 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1214    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1215    backreferences as well.  Requires a preorder visit.  */
1216 static reg_errcode_t
1217 optimize_subexps (void *extra, bin_tree_t *node)
1218 {
1219   re_dfa_t *dfa = (re_dfa_t *) extra;
1220
1221   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1222     {
1223       int idx = node->token.opr.idx;
1224       node->token.opr.idx = dfa->subexp_map[idx];
1225       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1226     }
1227
1228   else if (node->token.type == SUBEXP
1229            && node->left && node->left->token.type == SUBEXP)
1230     {
1231       int other_idx = node->left->token.opr.idx;
1232
1233       node->left = node->left->left;
1234       if (node->left)
1235         node->left->parent = node;
1236
1237       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1238       if (other_idx < BITSET_WORD_BITS)
1239           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1240     }
1241
1242   return REG_NOERROR;
1243 }
1244
1245 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1246    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1247 static reg_errcode_t
1248 lower_subexps (void *extra, bin_tree_t *node)
1249 {
1250   regex_t *preg = (regex_t *) extra;
1251   reg_errcode_t err = REG_NOERROR;
1252
1253   if (node->left && node->left->token.type == SUBEXP)
1254     {
1255       node->left = lower_subexp (&err, preg, node->left);
1256       if (node->left)
1257         node->left->parent = node;
1258     }
1259   if (node->right && node->right->token.type == SUBEXP)
1260     {
1261       node->right = lower_subexp (&err, preg, node->right);
1262       if (node->right)
1263         node->right->parent = node;
1264     }
1265
1266   return err;
1267 }
1268
1269 static bin_tree_t *
1270 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1271 {
1272   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1273   bin_tree_t *body = node->left;
1274   bin_tree_t *op, *cls, *tree1, *tree;
1275
1276   if (preg->no_sub
1277       /* We do not optimize empty subexpressions, because otherwise we may
1278          have bad CONCAT nodes with NULL children.  This is obviously not
1279          very common, so we do not lose much.  An example that triggers
1280          this case is the sed "script" /\(\)/x.  */
1281       && node->left != NULL
1282       && (node->token.opr.idx >= BITSET_WORD_BITS
1283           || !(dfa->used_bkref_map
1284                & ((bitset_word_t) 1 << node->token.opr.idx))))
1285     return node->left;
1286
1287   /* Convert the SUBEXP node to the concatenation of an
1288      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1289   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1290   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1291   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1292   tree = create_tree (dfa, op, tree1, CONCAT);
1293   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1294     {
1295       *err = REG_ESPACE;
1296       return NULL;
1297     }
1298
1299   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1300   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1301   return tree;
1302 }
1303
1304 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1305    nodes.  Requires a postorder visit.  */
1306 static reg_errcode_t
1307 calc_first (void *extra, bin_tree_t *node)
1308 {
1309   re_dfa_t *dfa = (re_dfa_t *) extra;
1310   if (node->token.type == CONCAT)
1311     {
1312       node->first = node->left->first;
1313       node->node_idx = node->left->node_idx;
1314     }
1315   else
1316     {
1317       node->first = node;
1318       node->node_idx = re_dfa_add_node (dfa, node->token);
1319       if (BE (node->node_idx == -1, 0))
1320         return REG_ESPACE;
1321     }
1322   return REG_NOERROR;
1323 }
1324
1325 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1326 static reg_errcode_t
1327 calc_next (void *extra, bin_tree_t *node)
1328 {
1329   switch (node->token.type)
1330     {
1331     case OP_DUP_ASTERISK:
1332       node->left->next = node;
1333       break;
1334     case CONCAT:
1335       node->left->next = node->right->first;
1336       node->right->next = node->next;
1337       break;
1338     default:
1339       if (node->left)
1340         node->left->next = node->next;
1341       if (node->right)
1342         node->right->next = node->next;
1343       break;
1344     }
1345   return REG_NOERROR;
1346 }
1347
1348 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1349 static reg_errcode_t
1350 link_nfa_nodes (void *extra, bin_tree_t *node)
1351 {
1352   re_dfa_t *dfa = (re_dfa_t *) extra;
1353   int idx = node->node_idx;
1354   reg_errcode_t err = REG_NOERROR;
1355
1356   switch (node->token.type)
1357     {
1358     case CONCAT:
1359       break;
1360
1361     case END_OF_RE:
1362       assert (node->next == NULL);
1363       break;
1364
1365     case OP_DUP_ASTERISK:
1366     case OP_ALT:
1367       {
1368         int left, right;
1369         dfa->has_plural_match = 1;
1370         if (node->left != NULL)
1371           left = node->left->first->node_idx;
1372         else
1373           left = node->next->node_idx;
1374         if (node->right != NULL)
1375           right = node->right->first->node_idx;
1376         else
1377           right = node->next->node_idx;
1378         assert (left > -1);
1379         assert (right > -1);
1380         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1381       }
1382       break;
1383
1384     case ANCHOR:
1385     case OP_OPEN_SUBEXP:
1386     case OP_CLOSE_SUBEXP:
1387       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1388       break;
1389
1390     case OP_BACK_REF:
1391       dfa->nexts[idx] = node->next->node_idx;
1392       if (node->token.type == OP_BACK_REF)
1393         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1394       break;
1395
1396     default:
1397       assert (!IS_EPSILON_NODE (node->token.type));
1398       dfa->nexts[idx] = node->next->node_idx;
1399       break;
1400     }
1401
1402   return err;
1403 }
1404
1405 /* Duplicate the epsilon closure of the node ROOT_NODE.
1406    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1407    to their own constraint.  */
1408
1409 static reg_errcode_t
1410 internal_function
1411 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1412                         int root_node, unsigned int init_constraint)
1413 {
1414   int org_node, clone_node, ret;
1415   unsigned int constraint = init_constraint;
1416   for (org_node = top_org_node, clone_node = top_clone_node;;)
1417     {
1418       int org_dest, clone_dest;
1419       if (dfa->nodes[org_node].type == OP_BACK_REF)
1420         {
1421           /* If the back reference epsilon-transit, its destination must
1422              also have the constraint.  Then duplicate the epsilon closure
1423              of the destination of the back reference, and store it in
1424              edests of the back reference.  */
1425           org_dest = dfa->nexts[org_node];
1426           re_node_set_empty (dfa->edests + clone_node);
1427           clone_dest = duplicate_node (dfa, org_dest, constraint);
1428           if (BE (clone_dest == -1, 0))
1429             return REG_ESPACE;
1430           dfa->nexts[clone_node] = dfa->nexts[org_node];
1431           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1432           if (BE (ret < 0, 0))
1433             return REG_ESPACE;
1434         }
1435       else if (dfa->edests[org_node].nelem == 0)
1436         {
1437           /* In case of the node can't epsilon-transit, don't duplicate the
1438              destination and store the original destination as the
1439              destination of the node.  */
1440           dfa->nexts[clone_node] = dfa->nexts[org_node];
1441           break;
1442         }
1443       else if (dfa->edests[org_node].nelem == 1)
1444         {
1445           /* In case of the node can epsilon-transit, and it has only one
1446              destination.  */
1447           org_dest = dfa->edests[org_node].elems[0];
1448           re_node_set_empty (dfa->edests + clone_node);
1449           if (dfa->nodes[org_node].type == ANCHOR)
1450             {
1451               /* In case of the node has another constraint, append it.  */
1452               if (org_node == root_node && clone_node != org_node)
1453                 {
1454                   /* ...but if the node is root_node itself, it means the
1455                      epsilon closure have a loop, then tie it to the
1456                      destination of the root_node.  */
1457                   ret = re_node_set_insert (dfa->edests + clone_node,
1458                                             org_dest);
1459                   if (BE (ret < 0, 0))
1460                     return REG_ESPACE;
1461                   break;
1462                 }
1463               constraint |= dfa->nodes[org_node].opr.ctx_type;
1464             }
1465           clone_dest = duplicate_node (dfa, org_dest, constraint);
1466           if (BE (clone_dest == -1, 0))
1467             return REG_ESPACE;
1468           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1469           if (BE (ret < 0, 0))
1470             return REG_ESPACE;
1471         }
1472       else /* dfa->edests[org_node].nelem == 2 */
1473         {
1474           /* In case of the node can epsilon-transit, and it has two
1475              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1476           org_dest = dfa->edests[org_node].elems[0];
1477           re_node_set_empty (dfa->edests + clone_node);
1478           /* Search for a duplicated node which satisfies the constraint.  */
1479           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1480           if (clone_dest == -1)
1481             {
1482               /* There are no such a duplicated node, create a new one.  */
1483               reg_errcode_t err;
1484               clone_dest = duplicate_node (dfa, org_dest, constraint);
1485               if (BE (clone_dest == -1, 0))
1486                 return REG_ESPACE;
1487               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488               if (BE (ret < 0, 0))
1489                 return REG_ESPACE;
1490               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1491                                             root_node, constraint);
1492               if (BE (err != REG_NOERROR, 0))
1493                 return err;
1494             }
1495           else
1496             {
1497               /* There are a duplicated node which satisfy the constraint,
1498                  use it to avoid infinite loop.  */
1499               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500               if (BE (ret < 0, 0))
1501                 return REG_ESPACE;
1502             }
1503
1504           org_dest = dfa->edests[org_node].elems[1];
1505           clone_dest = duplicate_node (dfa, org_dest, constraint);
1506           if (BE (clone_dest == -1, 0))
1507             return REG_ESPACE;
1508           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509           if (BE (ret < 0, 0))
1510             return REG_ESPACE;
1511         }
1512       org_node = org_dest;
1513       clone_node = clone_dest;
1514     }
1515   return REG_NOERROR;
1516 }
1517
1518 /* Search for a node which is duplicated from the node ORG_NODE, and
1519    satisfies the constraint CONSTRAINT.  */
1520
1521 static int
1522 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1523                         unsigned int constraint)
1524 {
1525   int idx;
1526   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1527     {
1528       if (org_node == dfa->org_indices[idx]
1529           && constraint == dfa->nodes[idx].constraint)
1530         return idx; /* Found.  */
1531     }
1532   return -1; /* Not found.  */
1533 }
1534
1535 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1536    Return the index of the new node, or -1 if insufficient storage is
1537    available.  */
1538
1539 static int
1540 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1541 {
1542   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1543   if (BE (dup_idx != -1, 1))
1544     {
1545       dfa->nodes[dup_idx].constraint = constraint;
1546       if (dfa->nodes[org_idx].type == ANCHOR)
1547         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1548       dfa->nodes[dup_idx].duplicated = 1;
1549
1550       /* Store the index of the original node.  */
1551       dfa->org_indices[dup_idx] = org_idx;
1552     }
1553   return dup_idx;
1554 }
1555
1556 static reg_errcode_t
1557 calc_inveclosure (re_dfa_t *dfa)
1558 {
1559   int src, idx, ret;
1560   for (idx = 0; idx < dfa->nodes_len; ++idx)
1561     re_node_set_init_empty (dfa->inveclosures + idx);
1562
1563   for (src = 0; src < dfa->nodes_len; ++src)
1564     {
1565       int *elems = dfa->eclosures[src].elems;
1566       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1567         {
1568           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1569           if (BE (ret == -1, 0))
1570             return REG_ESPACE;
1571         }
1572     }
1573
1574   return REG_NOERROR;
1575 }
1576
1577 /* Calculate "eclosure" for all the node in DFA.  */
1578
1579 static reg_errcode_t
1580 calc_eclosure (re_dfa_t *dfa)
1581 {
1582   int node_idx, incomplete;
1583 #ifdef DEBUG
1584   assert (dfa->nodes_len > 0);
1585 #endif
1586   incomplete = 0;
1587   /* For each nodes, calculate epsilon closure.  */
1588   for (node_idx = 0; ; ++node_idx)
1589     {
1590       reg_errcode_t err;
1591       re_node_set eclosure_elem;
1592       if (node_idx == dfa->nodes_len)
1593         {
1594           if (!incomplete)
1595             break;
1596           incomplete = 0;
1597           node_idx = 0;
1598         }
1599
1600 #ifdef DEBUG
1601       assert (dfa->eclosures[node_idx].nelem != -1);
1602 #endif
1603
1604       /* If we have already calculated, skip it.  */
1605       if (dfa->eclosures[node_idx].nelem != 0)
1606         continue;
1607       /* Calculate epsilon closure of `node_idx'.  */
1608       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1609       if (BE (err != REG_NOERROR, 0))
1610         return err;
1611
1612       if (dfa->eclosures[node_idx].nelem == 0)
1613         {
1614           incomplete = 1;
1615           re_node_set_free (&eclosure_elem);
1616         }
1617     }
1618   return REG_NOERROR;
1619 }
1620
1621 /* Calculate epsilon closure of NODE.  */
1622
1623 static reg_errcode_t
1624 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1625 {
1626   reg_errcode_t err;
1627   unsigned int constraint;
1628   int i, incomplete;
1629   re_node_set eclosure;
1630   incomplete = 0;
1631   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1632   if (BE (err != REG_NOERROR, 0))
1633     return err;
1634
1635   /* This indicates that we are calculating this node now.
1636      We reference this value to avoid infinite loop.  */
1637   dfa->eclosures[node].nelem = -1;
1638
1639   constraint = ((dfa->nodes[node].type == ANCHOR)
1640                 ? dfa->nodes[node].opr.ctx_type : 0);
1641   /* If the current node has constraints, duplicate all nodes.
1642      Since they must inherit the constraints.  */
1643   if (constraint
1644       && dfa->edests[node].nelem
1645       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1646     {
1647       int org_node, cur_node;
1648       org_node = cur_node = node;
1649       err = duplicate_node_closure (dfa, node, node, node, constraint);
1650       if (BE (err != REG_NOERROR, 0))
1651         return err;
1652     }
1653
1654   /* Expand each epsilon destination nodes.  */
1655   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1656     for (i = 0; i < dfa->edests[node].nelem; ++i)
1657       {
1658         re_node_set eclosure_elem;
1659         int edest = dfa->edests[node].elems[i];
1660         /* If calculating the epsilon closure of `edest' is in progress,
1661            return intermediate result.  */
1662         if (dfa->eclosures[edest].nelem == -1)
1663           {
1664             incomplete = 1;
1665             continue;
1666           }
1667         /* If we haven't calculated the epsilon closure of `edest' yet,
1668            calculate now. Otherwise use calculated epsilon closure.  */
1669         if (dfa->eclosures[edest].nelem == 0)
1670           {
1671             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1672             if (BE (err != REG_NOERROR, 0))
1673               return err;
1674           }
1675         else
1676           eclosure_elem = dfa->eclosures[edest];
1677         /* Merge the epsilon closure of `edest'.  */
1678         re_node_set_merge (&eclosure, &eclosure_elem);
1679         /* If the epsilon closure of `edest' is incomplete,
1680            the epsilon closure of this node is also incomplete.  */
1681         if (dfa->eclosures[edest].nelem == 0)
1682           {
1683             incomplete = 1;
1684             re_node_set_free (&eclosure_elem);
1685           }
1686       }
1687
1688   /* Epsilon closures include itself.  */
1689   re_node_set_insert (&eclosure, node);
1690   if (incomplete && !root)
1691     dfa->eclosures[node].nelem = 0;
1692   else
1693     dfa->eclosures[node] = eclosure;
1694   *new_set = eclosure;
1695   return REG_NOERROR;
1696 }
1697 \f
1698 /* Functions for token which are used in the parser.  */
1699
1700 /* Fetch a token from INPUT.
1701    We must not use this function inside bracket expressions.  */
1702
1703 static void
1704 internal_function
1705 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1706 {
1707   re_string_skip_bytes (input, peek_token (result, input, syntax));
1708 }
1709
1710 /* Peek a token from INPUT, and return the length of the token.
1711    We must not use this function inside bracket expressions.  */
1712
1713 static int
1714 internal_function
1715 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1716 {
1717   unsigned char c;
1718
1719   if (re_string_eoi (input))
1720     {
1721       token->type = END_OF_RE;
1722       return 0;
1723     }
1724
1725   c = re_string_peek_byte (input, 0);
1726   token->opr.c = c;
1727
1728   token->word_char = 0;
1729 #ifdef RE_ENABLE_I18N
1730   token->mb_partial = 0;
1731   if (input->mb_cur_max > 1 &&
1732       !re_string_first_byte (input, re_string_cur_idx (input)))
1733     {
1734       token->type = CHARACTER;
1735       token->mb_partial = 1;
1736       return 1;
1737     }
1738 #endif
1739   if (c == '\\')
1740     {
1741       unsigned char c2;
1742       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1743         {
1744           token->type = BACK_SLASH;
1745           return 1;
1746         }
1747
1748       c2 = re_string_peek_byte_case (input, 1);
1749       token->opr.c = c2;
1750       token->type = CHARACTER;
1751 #ifdef RE_ENABLE_I18N
1752       if (input->mb_cur_max > 1)
1753         {
1754           wint_t wc = re_string_wchar_at (input,
1755                                           re_string_cur_idx (input) + 1);
1756           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1757         }
1758       else
1759 #endif
1760         token->word_char = IS_WORD_CHAR (c2) != 0;
1761
1762       switch (c2)
1763         {
1764         case '|':
1765           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1766             token->type = OP_ALT;
1767           break;
1768         case '1': case '2': case '3': case '4': case '5':
1769         case '6': case '7': case '8': case '9':
1770           if (!(syntax & RE_NO_BK_REFS))
1771             {
1772               token->type = OP_BACK_REF;
1773               token->opr.idx = c2 - '1';
1774             }
1775           break;
1776         case '<':
1777           if (!(syntax & RE_NO_GNU_OPS))
1778             {
1779               token->type = ANCHOR;
1780               token->opr.ctx_type = WORD_FIRST;
1781             }
1782           break;
1783         case '>':
1784           if (!(syntax & RE_NO_GNU_OPS))
1785             {
1786               token->type = ANCHOR;
1787               token->opr.ctx_type = WORD_LAST;
1788             }
1789           break;
1790         case 'b':
1791           if (!(syntax & RE_NO_GNU_OPS))
1792             {
1793               token->type = ANCHOR;
1794               token->opr.ctx_type = WORD_DELIM;
1795             }
1796           break;
1797         case 'B':
1798           if (!(syntax & RE_NO_GNU_OPS))
1799             {
1800               token->type = ANCHOR;
1801               token->opr.ctx_type = NOT_WORD_DELIM;
1802             }
1803           break;
1804         case 'w':
1805           if (!(syntax & RE_NO_GNU_OPS))
1806             token->type = OP_WORD;
1807           break;
1808         case 'W':
1809           if (!(syntax & RE_NO_GNU_OPS))
1810             token->type = OP_NOTWORD;
1811           break;
1812         case 's':
1813           if (!(syntax & RE_NO_GNU_OPS))
1814             token->type = OP_SPACE;
1815           break;
1816         case 'S':
1817           if (!(syntax & RE_NO_GNU_OPS))
1818             token->type = OP_NOTSPACE;
1819           break;
1820         case '`':
1821           if (!(syntax & RE_NO_GNU_OPS))
1822             {
1823               token->type = ANCHOR;
1824               token->opr.ctx_type = BUF_FIRST;
1825             }
1826           break;
1827         case '\'':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             {
1830               token->type = ANCHOR;
1831               token->opr.ctx_type = BUF_LAST;
1832             }
1833           break;
1834         case '(':
1835           if (!(syntax & RE_NO_BK_PARENS))
1836             token->type = OP_OPEN_SUBEXP;
1837           break;
1838         case ')':
1839           if (!(syntax & RE_NO_BK_PARENS))
1840             token->type = OP_CLOSE_SUBEXP;
1841           break;
1842         case '+':
1843           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1844             token->type = OP_DUP_PLUS;
1845           break;
1846         case '?':
1847           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1848             token->type = OP_DUP_QUESTION;
1849           break;
1850         case '{':
1851           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1852             token->type = OP_OPEN_DUP_NUM;
1853           break;
1854         case '}':
1855           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1856             token->type = OP_CLOSE_DUP_NUM;
1857           break;
1858         default:
1859           break;
1860         }
1861       return 2;
1862     }
1863
1864   token->type = CHARACTER;
1865 #ifdef RE_ENABLE_I18N
1866   if (input->mb_cur_max > 1)
1867     {
1868       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1869       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1870     }
1871   else
1872 #endif
1873     token->word_char = IS_WORD_CHAR (token->opr.c);
1874
1875   switch (c)
1876     {
1877     case '\n':
1878       if (syntax & RE_NEWLINE_ALT)
1879         token->type = OP_ALT;
1880       break;
1881     case '|':
1882       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1883         token->type = OP_ALT;
1884       break;
1885     case '*':
1886       token->type = OP_DUP_ASTERISK;
1887       break;
1888     case '+':
1889       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1890         token->type = OP_DUP_PLUS;
1891       break;
1892     case '?':
1893       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1894         token->type = OP_DUP_QUESTION;
1895       break;
1896     case '{':
1897       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1898         token->type = OP_OPEN_DUP_NUM;
1899       break;
1900     case '}':
1901       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1902         token->type = OP_CLOSE_DUP_NUM;
1903       break;
1904     case '(':
1905       if (syntax & RE_NO_BK_PARENS)
1906         token->type = OP_OPEN_SUBEXP;
1907       break;
1908     case ')':
1909       if (syntax & RE_NO_BK_PARENS)
1910         token->type = OP_CLOSE_SUBEXP;
1911       break;
1912     case '[':
1913       token->type = OP_OPEN_BRACKET;
1914       break;
1915     case '.':
1916       token->type = OP_PERIOD;
1917       break;
1918     case '^':
1919       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1920           re_string_cur_idx (input) != 0)
1921         {
1922           char prev = re_string_peek_byte (input, -1);
1923           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1924             break;
1925         }
1926       token->type = ANCHOR;
1927       token->opr.ctx_type = LINE_FIRST;
1928       break;
1929     case '$':
1930       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1931           re_string_cur_idx (input) + 1 != re_string_length (input))
1932         {
1933           re_token_t next;
1934           re_string_skip_bytes (input, 1);
1935           peek_token (&next, input, syntax);
1936           re_string_skip_bytes (input, -1);
1937           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1938             break;
1939         }
1940       token->type = ANCHOR;
1941       token->opr.ctx_type = LINE_LAST;
1942       break;
1943     default:
1944       break;
1945     }
1946   return 1;
1947 }
1948
1949 /* Peek a token from INPUT, and return the length of the token.
1950    We must not use this function out of bracket expressions.  */
1951
1952 static int
1953 internal_function
1954 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1955 {
1956   unsigned char c;
1957   if (re_string_eoi (input))
1958     {
1959       token->type = END_OF_RE;
1960       return 0;
1961     }
1962   c = re_string_peek_byte (input, 0);
1963   token->opr.c = c;
1964
1965 #ifdef RE_ENABLE_I18N
1966   if (input->mb_cur_max > 1 &&
1967       !re_string_first_byte (input, re_string_cur_idx (input)))
1968     {
1969       token->type = CHARACTER;
1970       return 1;
1971     }
1972 #endif /* RE_ENABLE_I18N */
1973
1974   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1975       && re_string_cur_idx (input) + 1 < re_string_length (input))
1976     {
1977       /* In this case, '\' escape a character.  */
1978       unsigned char c2;
1979       re_string_skip_bytes (input, 1);
1980       c2 = re_string_peek_byte (input, 0);
1981       token->opr.c = c2;
1982       token->type = CHARACTER;
1983       return 1;
1984     }
1985   if (c == '[') /* '[' is a special char in a bracket exps.  */
1986     {
1987       unsigned char c2;
1988       int token_len;
1989       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1990         c2 = re_string_peek_byte (input, 1);
1991       else
1992         c2 = 0;
1993       token->opr.c = c2;
1994       token_len = 2;
1995       switch (c2)
1996         {
1997         case '.':
1998           token->type = OP_OPEN_COLL_ELEM;
1999           break;
2000         case '=':
2001           token->type = OP_OPEN_EQUIV_CLASS;
2002           break;
2003         case ':':
2004           if (syntax & RE_CHAR_CLASSES)
2005             {
2006               token->type = OP_OPEN_CHAR_CLASS;
2007               break;
2008             }
2009           /* else fall through.  */
2010         default:
2011           token->type = CHARACTER;
2012           token->opr.c = c;
2013           token_len = 1;
2014           break;
2015         }
2016       return token_len;
2017     }
2018   switch (c)
2019     {
2020     case '-':
2021       token->type = OP_CHARSET_RANGE;
2022       break;
2023     case ']':
2024       token->type = OP_CLOSE_BRACKET;
2025       break;
2026     case '^':
2027       token->type = OP_NON_MATCH_LIST;
2028       break;
2029     default:
2030       token->type = CHARACTER;
2031     }
2032   return 1;
2033 }
2034 \f
2035 /* Functions for parser.  */
2036
2037 /* Entry point of the parser.
2038    Parse the regular expression REGEXP and return the structure tree.
2039    If an error is occured, ERR is set by error code, and return NULL.
2040    This function build the following tree, from regular expression <reg_exp>:
2041            CAT
2042            / \
2043           /   \
2044    <reg_exp>  EOR
2045
2046    CAT means concatenation.
2047    EOR means end of regular expression.  */
2048
2049 static bin_tree_t *
2050 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2051        reg_errcode_t *err)
2052 {
2053   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2054   bin_tree_t *tree, *eor, *root;
2055   re_token_t current_token;
2056   dfa->syntax = syntax;
2057   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2058   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2059   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2060     return NULL;
2061   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2062   if (tree != NULL)
2063     root = create_tree (dfa, tree, eor, CONCAT);
2064   else
2065     root = eor;
2066   if (BE (eor == NULL || root == NULL, 0))
2067     {
2068       *err = REG_ESPACE;
2069       return NULL;
2070     }
2071   return root;
2072 }
2073
2074 /* This function build the following tree, from regular expression
2075    <branch1>|<branch2>:
2076            ALT
2077            / \
2078           /   \
2079    <branch1> <branch2>
2080
2081    ALT means alternative, which represents the operator `|'.  */
2082
2083 static bin_tree_t *
2084 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2085                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2086 {
2087   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2088   bin_tree_t *tree, *branch = NULL;
2089   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2090   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2091     return NULL;
2092
2093   while (token->type == OP_ALT)
2094     {
2095       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2096       if (token->type != OP_ALT && token->type != END_OF_RE
2097           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2098         {
2099           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2100           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2101             return NULL;
2102         }
2103       else
2104         branch = NULL;
2105       tree = create_tree (dfa, tree, branch, OP_ALT);
2106       if (BE (tree == NULL, 0))
2107         {
2108           *err = REG_ESPACE;
2109           return NULL;
2110         }
2111     }
2112   return tree;
2113 }
2114
2115 /* This function build the following tree, from regular expression
2116    <exp1><exp2>:
2117         CAT
2118         / \
2119        /   \
2120    <exp1> <exp2>
2121
2122    CAT means concatenation.  */
2123
2124 static bin_tree_t *
2125 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2126               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2127 {
2128   bin_tree_t *tree, *exp;
2129   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2131   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2132     return NULL;
2133
2134   while (token->type != OP_ALT && token->type != END_OF_RE
2135          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2136     {
2137       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2138       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2139         {
2140           return NULL;
2141         }
2142       if (tree != NULL && exp != NULL)
2143         {
2144           tree = create_tree (dfa, tree, exp, CONCAT);
2145           if (tree == NULL)
2146             {
2147               *err = REG_ESPACE;
2148               return NULL;
2149             }
2150         }
2151       else if (tree == NULL)
2152         tree = exp;
2153       /* Otherwise exp == NULL, we don't need to create new tree.  */
2154     }
2155   return tree;
2156 }
2157
2158 /* This function build the following tree, from regular expression a*:
2159          *
2160          |
2161          a
2162 */
2163
2164 static bin_tree_t *
2165 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2166                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2167 {
2168   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2169   bin_tree_t *tree;
2170   switch (token->type)
2171     {
2172     case CHARACTER:
2173       tree = create_token_tree (dfa, NULL, NULL, token);
2174       if (BE (tree == NULL, 0))
2175         {
2176           *err = REG_ESPACE;
2177           return NULL;
2178         }
2179 #ifdef RE_ENABLE_I18N
2180       if (dfa->mb_cur_max > 1)
2181         {
2182           while (!re_string_eoi (regexp)
2183                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2184             {
2185               bin_tree_t *mbc_remain;
2186               fetch_token (token, regexp, syntax);
2187               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2188               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2189               if (BE (mbc_remain == NULL || tree == NULL, 0))
2190                 {
2191                   *err = REG_ESPACE;
2192                   return NULL;
2193                 }
2194             }
2195         }
2196 #endif
2197       break;
2198     case OP_OPEN_SUBEXP:
2199       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2200       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2201         return NULL;
2202       break;
2203     case OP_OPEN_BRACKET:
2204       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2205       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2206         return NULL;
2207       break;
2208     case OP_BACK_REF:
2209       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2210         {
2211           *err = REG_ESUBREG;
2212           return NULL;
2213         }
2214       dfa->used_bkref_map |= 1 << token->opr.idx;
2215       tree = create_token_tree (dfa, NULL, NULL, token);
2216       if (BE (tree == NULL, 0))
2217         {
2218           *err = REG_ESPACE;
2219           return NULL;
2220         }
2221       ++dfa->nbackref;
2222       dfa->has_mb_node = 1;
2223       break;
2224     case OP_OPEN_DUP_NUM:
2225       if (syntax & RE_CONTEXT_INVALID_DUP)
2226         {
2227           *err = REG_BADRPT;
2228           return NULL;
2229         }
2230       /* FALLTHROUGH */
2231     case OP_DUP_ASTERISK:
2232     case OP_DUP_PLUS:
2233     case OP_DUP_QUESTION:
2234       if (syntax & RE_CONTEXT_INVALID_OPS)
2235         {
2236           *err = REG_BADRPT;
2237           return NULL;
2238         }
2239       else if (syntax & RE_CONTEXT_INDEP_OPS)
2240         {
2241           fetch_token (token, regexp, syntax);
2242           return parse_expression (regexp, preg, token, syntax, nest, err);
2243         }
2244       /* else fall through  */
2245     case OP_CLOSE_SUBEXP:
2246       if ((token->type == OP_CLOSE_SUBEXP) &&
2247           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2248         {
2249           *err = REG_ERPAREN;
2250           return NULL;
2251         }
2252       /* else fall through  */
2253     case OP_CLOSE_DUP_NUM:
2254       /* We treat it as a normal character.  */
2255
2256       /* Then we can these characters as normal characters.  */
2257       token->type = CHARACTER;
2258       /* mb_partial and word_char bits should be initialized already
2259          by peek_token.  */
2260       tree = create_token_tree (dfa, NULL, NULL, token);
2261       if (BE (tree == NULL, 0))
2262         {
2263           *err = REG_ESPACE;
2264           return NULL;
2265         }
2266       break;
2267     case ANCHOR:
2268       if ((token->opr.ctx_type
2269            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2270           && dfa->word_ops_used == 0)
2271         init_word_char (dfa);
2272       if (token->opr.ctx_type == WORD_DELIM
2273           || token->opr.ctx_type == NOT_WORD_DELIM)
2274         {
2275           bin_tree_t *tree_first, *tree_last;
2276           if (token->opr.ctx_type == WORD_DELIM)
2277             {
2278               token->opr.ctx_type = WORD_FIRST;
2279               tree_first = create_token_tree (dfa, NULL, NULL, token);
2280               token->opr.ctx_type = WORD_LAST;
2281             }
2282           else
2283             {
2284               token->opr.ctx_type = INSIDE_WORD;
2285               tree_first = create_token_tree (dfa, NULL, NULL, token);
2286               token->opr.ctx_type = INSIDE_NOTWORD;
2287             }
2288           tree_last = create_token_tree (dfa, NULL, NULL, token);
2289           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2290           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2291             {
2292               *err = REG_ESPACE;
2293               return NULL;
2294             }
2295         }
2296       else
2297         {
2298           tree = create_token_tree (dfa, NULL, NULL, token);
2299           if (BE (tree == NULL, 0))
2300             {
2301               *err = REG_ESPACE;
2302               return NULL;
2303             }
2304         }
2305       /* We must return here, since ANCHORs can't be followed
2306          by repetition operators.
2307          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2308              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2309       fetch_token (token, regexp, syntax);
2310       return tree;
2311     case OP_PERIOD:
2312       tree = create_token_tree (dfa, NULL, NULL, token);
2313       if (BE (tree == NULL, 0))
2314         {
2315           *err = REG_ESPACE;
2316           return NULL;
2317         }
2318       if (dfa->mb_cur_max > 1)
2319         dfa->has_mb_node = 1;
2320       break;
2321     case OP_WORD:
2322     case OP_NOTWORD:
2323       tree = build_charclass_op (dfa, regexp->trans,
2324                                  (const unsigned char *) "alnum",
2325                                  (const unsigned char *) "_",
2326                                  token->type == OP_NOTWORD, err);
2327       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2328         return NULL;
2329       break;
2330     case OP_SPACE:
2331     case OP_NOTSPACE:
2332       tree = build_charclass_op (dfa, regexp->trans,
2333                                  (const unsigned char *) "space",
2334                                  (const unsigned char *) "",
2335                                  token->type == OP_NOTSPACE, err);
2336       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2337         return NULL;
2338       break;
2339     case OP_ALT:
2340     case END_OF_RE:
2341       return NULL;
2342     case BACK_SLASH:
2343       *err = REG_EESCAPE;
2344       return NULL;
2345     default:
2346       /* Must not happen?  */
2347 #ifdef DEBUG
2348       assert (0);
2349 #endif
2350       return NULL;
2351     }
2352   fetch_token (token, regexp, syntax);
2353
2354   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2355          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2356     {
2357       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2358       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2359         return NULL;
2360       /* In BRE consecutive duplications are not allowed.  */
2361       if ((syntax & RE_CONTEXT_INVALID_DUP)
2362           && (token->type == OP_DUP_ASTERISK
2363               || token->type == OP_OPEN_DUP_NUM))
2364         {
2365           *err = REG_BADRPT;
2366           return NULL;
2367         }
2368     }
2369
2370   return tree;
2371 }
2372
2373 /* This function build the following tree, from regular expression
2374    (<reg_exp>):
2375          SUBEXP
2376             |
2377         <reg_exp>
2378 */
2379
2380 static bin_tree_t *
2381 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2382                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2383 {
2384   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2385   bin_tree_t *tree;
2386   size_t cur_nsub;
2387   cur_nsub = preg->re_nsub++;
2388
2389   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2390
2391   /* The subexpression may be a null string.  */
2392   if (token->type == OP_CLOSE_SUBEXP)
2393     tree = NULL;
2394   else
2395     {
2396       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2397       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2398         *err = REG_EPAREN;
2399       if (BE (*err != REG_NOERROR, 0))
2400         return NULL;
2401     }
2402
2403   if (cur_nsub <= '9' - '1')
2404     dfa->completed_bkref_map |= 1 << cur_nsub;
2405
2406   tree = create_tree (dfa, tree, NULL, SUBEXP);
2407   if (BE (tree == NULL, 0))
2408     {
2409       *err = REG_ESPACE;
2410       return NULL;
2411     }
2412   tree->token.opr.idx = cur_nsub;
2413   return tree;
2414 }
2415
2416 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2417
2418 static bin_tree_t *
2419 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2420               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2421 {
2422   bin_tree_t *tree = NULL, *old_tree = NULL;
2423   int i, start, end, start_idx = re_string_cur_idx (regexp);
2424   re_token_t start_token = *token;
2425
2426   if (token->type == OP_OPEN_DUP_NUM)
2427     {
2428       end = 0;
2429       start = fetch_number (regexp, token, syntax);
2430       if (start == -1)
2431         {
2432           if (token->type == CHARACTER && token->opr.c == ',')
2433             start = 0; /* We treat "{,m}" as "{0,m}".  */
2434           else
2435             {
2436               *err = REG_BADBR; /* <re>{} is invalid.  */
2437               return NULL;
2438             }
2439         }
2440       if (BE (start != -2, 1))
2441         {
2442           /* We treat "{n}" as "{n,n}".  */
2443           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2444                  : ((token->type == CHARACTER && token->opr.c == ',')
2445                     ? fetch_number (regexp, token, syntax) : -2));
2446         }
2447       if (BE (start == -2 || end == -2, 0))
2448         {
2449           /* Invalid sequence.  */
2450           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2451             {
2452               if (token->type == END_OF_RE)
2453                 *err = REG_EBRACE;
2454               else
2455                 *err = REG_BADBR;
2456
2457               return NULL;
2458             }
2459
2460           /* If the syntax bit is set, rollback.  */
2461           re_string_set_index (regexp, start_idx);
2462           *token = start_token;
2463           token->type = CHARACTER;
2464           /* mb_partial and word_char bits should be already initialized by
2465              peek_token.  */
2466           return elem;
2467         }
2468
2469       if (BE (end != -1 && start > end, 0))
2470         {
2471           /* First number greater than second.  */
2472           *err = REG_BADBR;
2473           return NULL;
2474         }
2475     }
2476   else
2477     {
2478       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2479       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2480     }
2481
2482   fetch_token (token, regexp, syntax);
2483
2484   if (BE (elem == NULL, 0))
2485     return NULL;
2486   if (BE (start == 0 && end == 0, 0))
2487     {
2488       postorder (elem, free_tree, NULL);
2489       return NULL;
2490     }
2491
2492   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2493   if (BE (start > 0, 0))
2494     {
2495       tree = elem;
2496       for (i = 2; i <= start; ++i)
2497         {
2498           elem = duplicate_tree (elem, dfa);
2499           tree = create_tree (dfa, tree, elem, CONCAT);
2500           if (BE (elem == NULL || tree == NULL, 0))
2501             goto parse_dup_op_espace;
2502         }
2503
2504       if (start == end)
2505         return tree;
2506
2507       /* Duplicate ELEM before it is marked optional.  */
2508       elem = duplicate_tree (elem, dfa);
2509       old_tree = tree;
2510     }
2511   else
2512     old_tree = NULL;
2513
2514   if (elem->token.type == SUBEXP)
2515     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2516
2517   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2518   if (BE (tree == NULL, 0))
2519     goto parse_dup_op_espace;
2520
2521   /* This loop is actually executed only when end != -1,
2522      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2523      already created the start+1-th copy.  */
2524   for (i = start + 2; i <= end; ++i)
2525     {
2526       elem = duplicate_tree (elem, dfa);
2527       tree = create_tree (dfa, tree, elem, CONCAT);
2528       if (BE (elem == NULL || tree == NULL, 0))
2529         goto parse_dup_op_espace;
2530
2531       tree = create_tree (dfa, tree, NULL, OP_ALT);
2532       if (BE (tree == NULL, 0))
2533         goto parse_dup_op_espace;
2534     }
2535
2536   if (old_tree)
2537     tree = create_tree (dfa, old_tree, tree, CONCAT);
2538
2539   return tree;
2540
2541  parse_dup_op_espace:
2542   *err = REG_ESPACE;
2543   return NULL;
2544 }
2545
2546 /* Size of the names for collating symbol/equivalence_class/character_class.
2547    I'm not sure, but maybe enough.  */
2548 #define BRACKET_NAME_BUF_SIZE 32
2549
2550 #ifndef _LIBC
2551   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2552      Build the range expression which starts from START_ELEM, and ends
2553      at END_ELEM.  The result are written to MBCSET and SBCSET.
2554      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2555      mbcset->range_ends, is a pointer argument sinse we may
2556      update it.  */
2557
2558 static reg_errcode_t
2559 internal_function
2560 # ifdef RE_ENABLE_I18N
2561 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2562                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2563 # else /* not RE_ENABLE_I18N */
2564 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2565                  bracket_elem_t *end_elem)
2566 # endif /* not RE_ENABLE_I18N */
2567 {
2568   unsigned int start_ch, end_ch;
2569   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2570   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2571           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2572           0))
2573     return REG_ERANGE;
2574
2575   /* We can handle no multi character collating elements without libc
2576      support.  */
2577   if (BE ((start_elem->type == COLL_SYM
2578            && strlen ((char *) start_elem->opr.name) > 1)
2579           || (end_elem->type == COLL_SYM
2580               && strlen ((char *) end_elem->opr.name) > 1), 0))
2581     return REG_ECOLLATE;
2582
2583 # ifdef RE_ENABLE_I18N
2584   {
2585     wchar_t wc;
2586     wint_t start_wc;
2587     wint_t end_wc;
2588     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2589
2590     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2591                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2592                    : 0));
2593     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2594               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2595                  : 0));
2596     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2597                 ? __btowc (start_ch) : start_elem->opr.wch);
2598     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2599               ? __btowc (end_ch) : end_elem->opr.wch);
2600     if (start_wc == WEOF || end_wc == WEOF)
2601       return REG_ECOLLATE;
2602     cmp_buf[0] = start_wc;
2603     cmp_buf[4] = end_wc;
2604     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2605       return REG_ERANGE;
2606
2607     /* Got valid collation sequence values, add them as a new entry.
2608        However, for !_LIBC we have no collation elements: if the
2609        character set is single byte, the single byte character set
2610        that we build below suffices.  parse_bracket_exp passes
2611        no MBCSET if dfa->mb_cur_max == 1.  */
2612     if (mbcset)
2613       {
2614         /* Check the space of the arrays.  */
2615         if (BE (*range_alloc == mbcset->nranges, 0))
2616           {
2617             /* There is not enough space, need realloc.  */
2618             wchar_t *new_array_start, *new_array_end;
2619             int new_nranges;
2620
2621             /* +1 in case of mbcset->nranges is 0.  */
2622             new_nranges = 2 * mbcset->nranges + 1;
2623             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2624                are NULL if *range_alloc == 0.  */
2625             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2626                                           new_nranges);
2627             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2628                                         new_nranges);
2629
2630             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2631               return REG_ESPACE;
2632
2633             mbcset->range_starts = new_array_start;
2634             mbcset->range_ends = new_array_end;
2635             *range_alloc = new_nranges;
2636           }
2637
2638         mbcset->range_starts[mbcset->nranges] = start_wc;
2639         mbcset->range_ends[mbcset->nranges++] = end_wc;
2640       }
2641
2642     /* Build the table for single byte characters.  */
2643     for (wc = 0; wc < SBC_MAX; ++wc)
2644       {
2645         cmp_buf[2] = wc;
2646         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2647             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2648           bitset_set (sbcset, wc);
2649       }
2650   }
2651 # else /* not RE_ENABLE_I18N */
2652   {
2653     unsigned int ch;
2654     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2655                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2656                    : 0));
2657     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2658               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2659                  : 0));
2660     if (start_ch > end_ch)
2661       return REG_ERANGE;
2662     /* Build the table for single byte characters.  */
2663     for (ch = 0; ch < SBC_MAX; ++ch)
2664       if (start_ch <= ch  && ch <= end_ch)
2665         bitset_set (sbcset, ch);
2666   }
2667 # endif /* not RE_ENABLE_I18N */
2668   return REG_NOERROR;
2669 }
2670 #endif /* not _LIBC */
2671
2672 #ifndef _LIBC
2673 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2674    Build the collating element which is represented by NAME.
2675    The result are written to MBCSET and SBCSET.
2676    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2677    pointer argument since we may update it.  */
2678
2679 static reg_errcode_t
2680 internal_function
2681 # ifdef RE_ENABLE_I18N
2682 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2683                         int *coll_sym_alloc, const unsigned char *name)
2684 # else /* not RE_ENABLE_I18N */
2685 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2686 # endif /* not RE_ENABLE_I18N */
2687 {
2688   size_t name_len = strlen ((const char *) name);
2689   if (BE (name_len != 1, 0))
2690     return REG_ECOLLATE;
2691   else
2692     {
2693       bitset_set (sbcset, name[0]);
2694       return REG_NOERROR;
2695     }
2696 }
2697 #endif /* not _LIBC */
2698
2699 /* This function parse bracket expression like "[abc]", "[a-c]",
2700    "[[.a-a.]]" etc.  */
2701
2702 static bin_tree_t *
2703 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2704                    reg_syntax_t syntax, reg_errcode_t *err)
2705 {
2706 #ifdef _LIBC
2707   const unsigned char *collseqmb;
2708   const char *collseqwc;
2709   uint32_t nrules;
2710   int32_t table_size;
2711   const int32_t *symb_table;
2712   const unsigned char *extra;
2713
2714   /* Local function for parse_bracket_exp used in _LIBC environement.
2715      Seek the collating symbol entry correspondings to NAME.
2716      Return the index of the symbol in the SYMB_TABLE.  */
2717
2718   auto inline int32_t
2719   __attribute ((always_inline))
2720   seek_collating_symbol_entry (name, name_len)
2721          const unsigned char *name;
2722          size_t name_len;
2723     {
2724       int32_t hash = elem_hash ((const char *) name, name_len);
2725       int32_t elem = hash % table_size;
2726       if (symb_table[2 * elem] != 0)
2727         {
2728           int32_t second = hash % (table_size - 2) + 1;
2729
2730           do
2731             {
2732               /* First compare the hashing value.  */
2733               if (symb_table[2 * elem] == hash
2734                   /* Compare the length of the name.  */
2735                   && name_len == extra[symb_table[2 * elem + 1]]
2736                   /* Compare the name.  */
2737                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2738                              name_len) == 0)
2739                 {
2740                   /* Yep, this is the entry.  */
2741                   break;
2742                 }
2743
2744               /* Next entry.  */
2745               elem += second;
2746             }
2747           while (symb_table[2 * elem] != 0);
2748         }
2749       return elem;
2750     }
2751
2752   /* Local function for parse_bracket_exp used in _LIBC environement.
2753      Look up the collation sequence value of BR_ELEM.
2754      Return the value if succeeded, UINT_MAX otherwise.  */
2755
2756   auto inline unsigned int
2757   __attribute ((always_inline))
2758   lookup_collation_sequence_value (br_elem)
2759          bracket_elem_t *br_elem;
2760     {
2761       if (br_elem->type == SB_CHAR)
2762         {
2763           /*
2764           if (MB_CUR_MAX == 1)
2765           */
2766           if (nrules == 0)
2767             return collseqmb[br_elem->opr.ch];
2768           else
2769             {
2770               wint_t wc = __btowc (br_elem->opr.ch);
2771               return __collseq_table_lookup (collseqwc, wc);
2772             }
2773         }
2774       else if (br_elem->type == MB_CHAR)
2775         {
2776           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2777         }
2778       else if (br_elem->type == COLL_SYM)
2779         {
2780           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2781           if (nrules != 0)
2782             {
2783               int32_t elem, idx;
2784               elem = seek_collating_symbol_entry (br_elem->opr.name,
2785                                                   sym_name_len);
2786               if (symb_table[2 * elem] != 0)
2787                 {
2788                   /* We found the entry.  */
2789                   idx = symb_table[2 * elem + 1];
2790                   /* Skip the name of collating element name.  */
2791                   idx += 1 + extra[idx];
2792                   /* Skip the byte sequence of the collating element.  */
2793                   idx += 1 + extra[idx];
2794                   /* Adjust for the alignment.  */
2795                   idx = (idx + 3) & ~3;
2796                   /* Skip the multibyte collation sequence value.  */
2797                   idx += sizeof (unsigned int);
2798                   /* Skip the wide char sequence of the collating element.  */
2799                   idx += sizeof (unsigned int) *
2800                     (1 + *(unsigned int *) (extra + idx));
2801                   /* Return the collation sequence value.  */
2802                   return *(unsigned int *) (extra + idx);
2803                 }
2804               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2805                 {
2806                   /* No valid character.  Match it as a single byte
2807                      character.  */
2808                   return collseqmb[br_elem->opr.name[0]];
2809                 }
2810             }
2811           else if (sym_name_len == 1)
2812             return collseqmb[br_elem->opr.name[0]];
2813         }
2814       return UINT_MAX;
2815     }
2816
2817   /* Local function for parse_bracket_exp used in _LIBC environement.
2818      Build the range expression which starts from START_ELEM, and ends
2819      at END_ELEM.  The result are written to MBCSET and SBCSET.
2820      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2821      mbcset->range_ends, is a pointer argument sinse we may
2822      update it.  */
2823
2824   auto inline reg_errcode_t
2825   __attribute ((always_inline))
2826   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2827          re_charset_t *mbcset;
2828          int *range_alloc;
2829          bitset_t sbcset;
2830          bracket_elem_t *start_elem, *end_elem;
2831     {
2832       unsigned int ch;
2833       uint32_t start_collseq;
2834       uint32_t end_collseq;
2835
2836       /* Equivalence Classes and Character Classes can't be a range
2837          start/end.  */
2838       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2839               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2840               0))
2841         return REG_ERANGE;
2842
2843       start_collseq = lookup_collation_sequence_value (start_elem);
2844       end_collseq = lookup_collation_sequence_value (end_elem);
2845       /* Check start/end collation sequence values.  */
2846       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2847         return REG_ECOLLATE;
2848       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2849         return REG_ERANGE;
2850
2851       /* Got valid collation sequence values, add them as a new entry.
2852          However, if we have no collation elements, and the character set
2853          is single byte, the single byte character set that we
2854          build below suffices. */
2855       if (nrules > 0 || dfa->mb_cur_max > 1)
2856         {
2857           /* Check the space of the arrays.  */
2858           if (BE (*range_alloc == mbcset->nranges, 0))
2859             {
2860               /* There is not enough space, need realloc.  */
2861               uint32_t *new_array_start;
2862               uint32_t *new_array_end;
2863               int new_nranges;
2864
2865               /* +1 in case of mbcset->nranges is 0.  */
2866               new_nranges = 2 * mbcset->nranges + 1;
2867               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2868                                             new_nranges);
2869               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2870                                           new_nranges);
2871
2872               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2873                 return REG_ESPACE;
2874
2875               mbcset->range_starts = new_array_start;
2876               mbcset->range_ends = new_array_end;
2877               *range_alloc = new_nranges;
2878             }
2879
2880           mbcset->range_starts[mbcset->nranges] = start_collseq;
2881           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2882         }
2883
2884       /* Build the table for single byte characters.  */
2885       for (ch = 0; ch < SBC_MAX; ch++)
2886         {
2887           uint32_t ch_collseq;
2888           /*
2889           if (MB_CUR_MAX == 1)
2890           */
2891           if (nrules == 0)
2892             ch_collseq = collseqmb[ch];
2893           else
2894             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2895           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2896             bitset_set (sbcset, ch);
2897         }
2898       return REG_NOERROR;
2899     }
2900
2901   /* Local function for parse_bracket_exp used in _LIBC environement.
2902      Build the collating element which is represented by NAME.
2903      The result are written to MBCSET and SBCSET.
2904      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2905      pointer argument sinse we may update it.  */
2906
2907   auto inline reg_errcode_t
2908   __attribute ((always_inline))
2909   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2910          re_charset_t *mbcset;
2911          int *coll_sym_alloc;
2912          bitset_t sbcset;
2913          const unsigned char *name;
2914     {
2915       int32_t elem, idx;
2916       size_t name_len = strlen ((const char *) name);
2917       if (nrules != 0)
2918         {
2919           elem = seek_collating_symbol_entry (name, name_len);
2920           if (symb_table[2 * elem] != 0)
2921             {
2922               /* We found the entry.  */
2923               idx = symb_table[2 * elem + 1];
2924               /* Skip the name of collating element name.  */
2925               idx += 1 + extra[idx];
2926             }
2927           else if (symb_table[2 * elem] == 0 && name_len == 1)
2928             {
2929               /* No valid character, treat it as a normal
2930                  character.  */
2931               bitset_set (sbcset, name[0]);
2932               return REG_NOERROR;
2933             }
2934           else
2935             return REG_ECOLLATE;
2936
2937           /* Got valid collation sequence, add it as a new entry.  */
2938           /* Check the space of the arrays.  */
2939           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2940             {
2941               /* Not enough, realloc it.  */
2942               /* +1 in case of mbcset->ncoll_syms is 0.  */
2943               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2944               /* Use realloc since mbcset->coll_syms is NULL
2945                  if *alloc == 0.  */
2946               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2947                                                    new_coll_sym_alloc);
2948               if (BE (new_coll_syms == NULL, 0))
2949                 return REG_ESPACE;
2950               mbcset->coll_syms = new_coll_syms;
2951               *coll_sym_alloc = new_coll_sym_alloc;
2952             }
2953           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2954           return REG_NOERROR;
2955         }
2956       else
2957         {
2958           if (BE (name_len != 1, 0))
2959             return REG_ECOLLATE;
2960           else
2961             {
2962               bitset_set (sbcset, name[0]);
2963               return REG_NOERROR;
2964             }
2965         }
2966     }
2967 #endif
2968
2969   re_token_t br_token;
2970   re_bitset_ptr_t sbcset;
2971 #ifdef RE_ENABLE_I18N
2972   re_charset_t *mbcset;
2973   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2974   int equiv_class_alloc = 0, char_class_alloc = 0;
2975 #endif /* not RE_ENABLE_I18N */
2976   int non_match = 0;
2977   bin_tree_t *work_tree;
2978   int token_len;
2979   int first_round = 1;
2980 #ifdef _LIBC
2981   collseqmb = (const unsigned char *)
2982     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2983   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2984   if (nrules)
2985     {
2986       /*
2987       if (MB_CUR_MAX > 1)
2988       */
2989       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2990       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2991       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2992                                                   _NL_COLLATE_SYMB_TABLEMB);
2993       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2994                                                    _NL_COLLATE_SYMB_EXTRAMB);
2995     }
2996 #endif
2997   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2998 #ifdef RE_ENABLE_I18N
2999   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3000 #endif /* RE_ENABLE_I18N */
3001 #ifdef RE_ENABLE_I18N
3002   if (BE (sbcset == NULL || mbcset == NULL, 0))
3003 #else
3004   if (BE (sbcset == NULL, 0))
3005 #endif /* RE_ENABLE_I18N */
3006     {
3007       *err = REG_ESPACE;
3008       return NULL;
3009     }
3010
3011   token_len = peek_token_bracket (token, regexp, syntax);
3012   if (BE (token->type == END_OF_RE, 0))
3013     {
3014       *err = REG_BADPAT;
3015       goto parse_bracket_exp_free_return;
3016     }
3017   if (token->type == OP_NON_MATCH_LIST)
3018     {
3019 #ifdef RE_ENABLE_I18N
3020       mbcset->non_match = 1;
3021 #endif /* not RE_ENABLE_I18N */
3022       non_match = 1;
3023       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3024         bitset_set (sbcset, '\0');
3025       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3026       token_len = peek_token_bracket (token, regexp, syntax);
3027       if (BE (token->type == END_OF_RE, 0))
3028         {
3029           *err = REG_BADPAT;
3030           goto parse_bracket_exp_free_return;
3031         }
3032     }
3033
3034   /* We treat the first ']' as a normal character.  */
3035   if (token->type == OP_CLOSE_BRACKET)
3036     token->type = CHARACTER;
3037
3038   while (1)
3039     {
3040       bracket_elem_t start_elem, end_elem;
3041       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3042       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3043       reg_errcode_t ret;
3044       int token_len2 = 0, is_range_exp = 0;
3045       re_token_t token2;
3046
3047       start_elem.opr.name = start_name_buf;
3048       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3049                                    syntax, first_round);
3050       if (BE (ret != REG_NOERROR, 0))
3051         {
3052           *err = ret;
3053           goto parse_bracket_exp_free_return;
3054         }
3055       first_round = 0;
3056
3057       /* Get information about the next token.  We need it in any case.  */
3058       token_len = peek_token_bracket (token, regexp, syntax);
3059
3060       /* Do not check for ranges if we know they are not allowed.  */
3061       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3062         {
3063           if (BE (token->type == END_OF_RE, 0))
3064             {
3065               *err = REG_EBRACK;
3066               goto parse_bracket_exp_free_return;
3067             }
3068           if (token->type == OP_CHARSET_RANGE)
3069             {
3070               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3071               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3072               if (BE (token2.type == END_OF_RE, 0))
3073                 {
3074                   *err = REG_EBRACK;
3075                   goto parse_bracket_exp_free_return;
3076                 }
3077               if (token2.type == OP_CLOSE_BRACKET)
3078                 {
3079                   /* We treat the last '-' as a normal character.  */
3080                   re_string_skip_bytes (regexp, -token_len);
3081                   token->type = CHARACTER;
3082                 }
3083               else
3084                 is_range_exp = 1;
3085             }
3086         }
3087
3088       if (is_range_exp == 1)
3089         {
3090           end_elem.opr.name = end_name_buf;
3091           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3092                                        dfa, syntax, 1);
3093           if (BE (ret != REG_NOERROR, 0))
3094             {
3095               *err = ret;
3096               goto parse_bracket_exp_free_return;
3097             }
3098
3099           token_len = peek_token_bracket (token, regexp, syntax);
3100
3101 #ifdef _LIBC
3102           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3103                                   &start_elem, &end_elem);
3104 #else
3105 # ifdef RE_ENABLE_I18N
3106           *err = build_range_exp (sbcset,
3107                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3108                                   &range_alloc, &start_elem, &end_elem);
3109 # else
3110           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3111 # endif
3112 #endif /* RE_ENABLE_I18N */
3113           if (BE (*err != REG_NOERROR, 0))
3114             goto parse_bracket_exp_free_return;
3115         }
3116       else
3117         {
3118           switch (start_elem.type)
3119             {
3120             case SB_CHAR:
3121               bitset_set (sbcset, start_elem.opr.ch);
3122               break;
3123 #ifdef RE_ENABLE_I18N
3124             case MB_CHAR:
3125               /* Check whether the array has enough space.  */
3126               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3127                 {
3128                   wchar_t *new_mbchars;
3129                   /* Not enough, realloc it.  */
3130                   /* +1 in case of mbcset->nmbchars is 0.  */
3131                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3132                   /* Use realloc since array is NULL if *alloc == 0.  */
3133                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3134                                             mbchar_alloc);
3135                   if (BE (new_mbchars == NULL, 0))
3136                     goto parse_bracket_exp_espace;
3137                   mbcset->mbchars = new_mbchars;
3138                 }
3139               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3140               break;
3141 #endif /* RE_ENABLE_I18N */
3142             case EQUIV_CLASS:
3143               *err = build_equiv_class (sbcset,
3144 #ifdef RE_ENABLE_I18N
3145                                         mbcset, &equiv_class_alloc,
3146 #endif /* RE_ENABLE_I18N */
3147                                         start_elem.opr.name);
3148               if (BE (*err != REG_NOERROR, 0))
3149                 goto parse_bracket_exp_free_return;
3150               break;
3151             case COLL_SYM:
3152               *err = build_collating_symbol (sbcset,
3153 #ifdef RE_ENABLE_I18N
3154                                              mbcset, &coll_sym_alloc,
3155 #endif /* RE_ENABLE_I18N */
3156                                              start_elem.opr.name);
3157               if (BE (*err != REG_NOERROR, 0))
3158                 goto parse_bracket_exp_free_return;
3159               break;
3160             case CHAR_CLASS:
3161               *err = build_charclass (regexp->trans, sbcset,
3162 #ifdef RE_ENABLE_I18N
3163                                       mbcset, &char_class_alloc,
3164 #endif /* RE_ENABLE_I18N */
3165                                       start_elem.opr.name, syntax);
3166               if (BE (*err != REG_NOERROR, 0))
3167                goto parse_bracket_exp_free_return;
3168               break;
3169             default:
3170               assert (0);
3171               break;
3172             }
3173         }
3174       if (BE (token->type == END_OF_RE, 0))
3175         {
3176           *err = REG_EBRACK;
3177           goto parse_bracket_exp_free_return;
3178         }
3179       if (token->type == OP_CLOSE_BRACKET)
3180         break;
3181     }
3182
3183   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3184
3185   /* If it is non-matching list.  */
3186   if (non_match)
3187     bitset_not (sbcset);
3188
3189 #ifdef RE_ENABLE_I18N
3190   /* Ensure only single byte characters are set.  */
3191   if (dfa->mb_cur_max > 1)
3192     bitset_mask (sbcset, dfa->sb_char);
3193
3194   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3195       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3196                                                      || mbcset->non_match)))
3197     {
3198       bin_tree_t *mbc_tree;
3199       int sbc_idx;
3200       /* Build a tree for complex bracket.  */
3201       dfa->has_mb_node = 1;
3202       br_token.type = COMPLEX_BRACKET;
3203       br_token.opr.mbcset = mbcset;
3204       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3205       if (BE (mbc_tree == NULL, 0))
3206         goto parse_bracket_exp_espace;
3207       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3208         if (sbcset[sbc_idx])
3209           break;
3210       /* If there are no bits set in sbcset, there is no point
3211          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3212       if (sbc_idx < BITSET_WORDS)
3213         {
3214           /* Build a tree for simple bracket.  */
3215           br_token.type = SIMPLE_BRACKET;
3216           br_token.opr.sbcset = sbcset;
3217           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218           if (BE (work_tree == NULL, 0))
3219             goto parse_bracket_exp_espace;
3220
3221           /* Then join them by ALT node.  */
3222           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3223           if (BE (work_tree == NULL, 0))
3224             goto parse_bracket_exp_espace;
3225         }
3226       else
3227         {
3228           re_free (sbcset);
3229           work_tree = mbc_tree;
3230         }
3231     }
3232   else
3233 #endif /* not RE_ENABLE_I18N */
3234     {
3235 #ifdef RE_ENABLE_I18N
3236       free_charset (mbcset);
3237 #endif
3238       /* Build a tree for simple bracket.  */
3239       br_token.type = SIMPLE_BRACKET;
3240       br_token.opr.sbcset = sbcset;
3241       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3242       if (BE (work_tree == NULL, 0))
3243         goto parse_bracket_exp_espace;
3244     }
3245   return work_tree;
3246
3247  parse_bracket_exp_espace:
3248   *err = REG_ESPACE;
3249  parse_bracket_exp_free_return:
3250   re_free (sbcset);
3251 #ifdef RE_ENABLE_I18N
3252   free_charset (mbcset);
3253 #endif /* RE_ENABLE_I18N */
3254   return NULL;
3255 }
3256
3257 /* Parse an element in the bracket expression.  */
3258
3259 static reg_errcode_t
3260 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3261                        re_token_t *token, int token_len, re_dfa_t *dfa,
3262                        reg_syntax_t syntax, int accept_hyphen)
3263 {
3264 #ifdef RE_ENABLE_I18N
3265   int cur_char_size;
3266   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3267   if (cur_char_size > 1)
3268     {
3269       elem->type = MB_CHAR;
3270       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3271       re_string_skip_bytes (regexp, cur_char_size);
3272       return REG_NOERROR;
3273     }
3274 #endif /* RE_ENABLE_I18N */
3275   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3276   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3277       || token->type == OP_OPEN_EQUIV_CLASS)
3278     return parse_bracket_symbol (elem, regexp, token);
3279   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3280     {
3281       /* A '-' must only appear as anything but a range indicator before
3282          the closing bracket.  Everything else is an error.  */
3283       re_token_t token2;
3284       (void) peek_token_bracket (&token2, regexp, syntax);
3285       if (token2.type != OP_CLOSE_BRACKET)
3286         /* The actual error value is not standardized since this whole
3287            case is undefined.  But ERANGE makes good sense.  */
3288         return REG_ERANGE;
3289     }
3290   elem->type = SB_CHAR;
3291   elem->opr.ch = token->opr.c;
3292   return REG_NOERROR;
3293 }
3294
3295 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3296    such as [:<character_class>:], [.<collating_element>.], and
3297    [=<equivalent_class>=].  */
3298
3299 static reg_errcode_t
3300 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3301                       re_token_t *token)
3302 {
3303   unsigned char ch, delim = token->opr.c;
3304   int i = 0;
3305   if (re_string_eoi(regexp))
3306     return REG_EBRACK;
3307   for (;; ++i)
3308     {
3309       if (i >= BRACKET_NAME_BUF_SIZE)
3310         return REG_EBRACK;
3311       if (token->type == OP_OPEN_CHAR_CLASS)
3312         ch = re_string_fetch_byte_case (regexp);
3313       else
3314         ch = re_string_fetch_byte (regexp);
3315       if (re_string_eoi(regexp))
3316         return REG_EBRACK;
3317       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3318         break;
3319       elem->opr.name[i] = ch;
3320     }
3321   re_string_skip_bytes (regexp, 1);
3322   elem->opr.name[i] = '\0';
3323   switch (token->type)
3324     {
3325     case OP_OPEN_COLL_ELEM:
3326       elem->type = COLL_SYM;
3327       break;
3328     case OP_OPEN_EQUIV_CLASS:
3329       elem->type = EQUIV_CLASS;
3330       break;
3331     case OP_OPEN_CHAR_CLASS:
3332       elem->type = CHAR_CLASS;
3333       break;
3334     default:
3335       break;
3336     }
3337   return REG_NOERROR;
3338 }
3339
3340   /* Helper function for parse_bracket_exp.
3341      Build the equivalence class which is represented by NAME.
3342      The result are written to MBCSET and SBCSET.
3343      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3344      is a pointer argument sinse we may update it.  */
3345
3346 static reg_errcode_t
3347 #ifdef RE_ENABLE_I18N
3348 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3349                    int *equiv_class_alloc, const unsigned char *name)
3350 #else /* not RE_ENABLE_I18N */
3351 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3352 #endif /* not RE_ENABLE_I18N */
3353 {
3354 #ifdef _LIBC
3355   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3356   if (nrules != 0)
3357     {
3358       const int32_t *table, *indirect;
3359       const unsigned char *weights, *extra, *cp;
3360       unsigned char char_buf[2];
3361       int32_t idx1, idx2;
3362       unsigned int ch;
3363       size_t len;
3364       /* This #include defines a local function!  */
3365 # include <locale/weight.h>
3366       /* Calculate the index for equivalence class.  */
3367       cp = name;
3368       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3369       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3370                                                _NL_COLLATE_WEIGHTMB);
3371       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3372                                                    _NL_COLLATE_EXTRAMB);
3373       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3374                                                 _NL_COLLATE_INDIRECTMB);
3375       idx1 = findidx (&cp);
3376       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3377         /* This isn't a valid character.  */
3378         return REG_ECOLLATE;
3379
3380       /* Build single byte matcing table for this equivalence class.  */
3381       char_buf[1] = (unsigned char) '\0';
3382       len = weights[idx1];
3383       for (ch = 0; ch < SBC_MAX; ++ch)
3384         {
3385           char_buf[0] = ch;
3386           cp = char_buf;
3387           idx2 = findidx (&cp);
3388 /*
3389           idx2 = table[ch];
3390 */
3391           if (idx2 == 0)
3392             /* This isn't a valid character.  */
3393             continue;
3394           if (len == weights[idx2])
3395             {
3396               int cnt = 0;
3397               while (cnt <= len &&
3398                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3399                 ++cnt;
3400
3401               if (cnt > len)
3402                 bitset_set (sbcset, ch);
3403             }
3404         }
3405       /* Check whether the array has enough space.  */
3406       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3407         {
3408           /* Not enough, realloc it.  */
3409           /* +1 in case of mbcset->nequiv_classes is 0.  */
3410           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3411           /* Use realloc since the array is NULL if *alloc == 0.  */
3412           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3413                                                    int32_t,
3414                                                    new_equiv_class_alloc);
3415           if (BE (new_equiv_classes == NULL, 0))
3416             return REG_ESPACE;
3417           mbcset->equiv_classes = new_equiv_classes;
3418           *equiv_class_alloc = new_equiv_class_alloc;
3419         }
3420       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3421     }
3422   else
3423 #endif /* _LIBC */
3424     {
3425       if (BE (strlen ((const char *) name) != 1, 0))
3426         return REG_ECOLLATE;
3427       bitset_set (sbcset, *name);
3428     }
3429   return REG_NOERROR;
3430 }
3431
3432   /* Helper function for parse_bracket_exp.
3433      Build the character class which is represented by NAME.
3434      The result are written to MBCSET and SBCSET.
3435      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3436      is a pointer argument sinse we may update it.  */
3437
3438 static reg_errcode_t
3439 #ifdef RE_ENABLE_I18N
3440 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3441                  re_charset_t *mbcset, int *char_class_alloc,
3442                  const unsigned char *class_name, reg_syntax_t syntax)
3443 #else /* not RE_ENABLE_I18N */
3444 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3445                  const unsigned char *class_name, reg_syntax_t syntax)
3446 #endif /* not RE_ENABLE_I18N */
3447 {
3448   int i;
3449   const char *name = (const char *) class_name;
3450
3451   /* In case of REG_ICASE "upper" and "lower" match the both of
3452      upper and lower cases.  */
3453   if ((syntax & RE_ICASE)
3454       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3455     name = "alpha";
3456
3457 #ifdef RE_ENABLE_I18N
3458   /* Check the space of the arrays.  */
3459   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3460     {
3461       /* Not enough, realloc it.  */
3462       /* +1 in case of mbcset->nchar_classes is 0.  */
3463       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3464       /* Use realloc since array is NULL if *alloc == 0.  */
3465       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3466                                                new_char_class_alloc);
3467       if (BE (new_char_classes == NULL, 0))
3468         return REG_ESPACE;
3469       mbcset->char_classes = new_char_classes;
3470       *char_class_alloc = new_char_class_alloc;
3471     }
3472   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3473 #endif /* RE_ENABLE_I18N */
3474
3475 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3476   do {                                          \
3477     if (BE (trans != NULL, 0))                  \
3478       {                                         \
3479         for (i = 0; i < SBC_MAX; ++i)           \
3480           if (ctype_func (i))                   \
3481             bitset_set (sbcset, trans[i]);      \
3482       }                                         \
3483     else                                        \
3484       {                                         \
3485         for (i = 0; i < SBC_MAX; ++i)           \
3486           if (ctype_func (i))                   \
3487             bitset_set (sbcset, i);             \
3488       }                                         \
3489   } while (0)
3490
3491   if (strcmp (name, "alnum") == 0)
3492     BUILD_CHARCLASS_LOOP (isalnum);
3493   else if (strcmp (name, "cntrl") == 0)
3494     BUILD_CHARCLASS_LOOP (iscntrl);
3495   else if (strcmp (name, "lower") == 0)
3496     BUILD_CHARCLASS_LOOP (islower);
3497   else if (strcmp (name, "space") == 0)
3498     BUILD_CHARCLASS_LOOP (isspace);
3499   else if (strcmp (name, "alpha") == 0)
3500     BUILD_CHARCLASS_LOOP (isalpha);
3501   else if (strcmp (name, "digit") == 0)
3502     BUILD_CHARCLASS_LOOP (isdigit);
3503   else if (strcmp (name, "print") == 0)
3504     BUILD_CHARCLASS_LOOP (isprint);
3505   else if (strcmp (name, "upper") == 0)
3506     BUILD_CHARCLASS_LOOP (isupper);
3507   else if (strcmp (name, "blank") == 0)
3508     BUILD_CHARCLASS_LOOP (isblank);
3509   else if (strcmp (name, "graph") == 0)
3510     BUILD_CHARCLASS_LOOP (isgraph);
3511   else if (strcmp (name, "punct") == 0)
3512     BUILD_CHARCLASS_LOOP (ispunct);
3513   else if (strcmp (name, "xdigit") == 0)
3514     BUILD_CHARCLASS_LOOP (isxdigit);
3515   else
3516     return REG_ECTYPE;
3517
3518   return REG_NOERROR;
3519 }
3520
3521 static bin_tree_t *
3522 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3523                     const unsigned char *class_name,
3524                     const unsigned char *extra, int non_match,
3525                     reg_errcode_t *err)
3526 {
3527   re_bitset_ptr_t sbcset;
3528 #ifdef RE_ENABLE_I18N
3529   re_charset_t *mbcset;
3530   int alloc = 0;
3531 #endif /* not RE_ENABLE_I18N */
3532   reg_errcode_t ret;
3533   re_token_t br_token;
3534   bin_tree_t *tree;
3535
3536   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3537 #ifdef RE_ENABLE_I18N
3538   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3539 #endif /* RE_ENABLE_I18N */
3540
3541 #ifdef RE_ENABLE_I18N
3542   if (BE (sbcset == NULL || mbcset == NULL, 0))
3543 #else /* not RE_ENABLE_I18N */
3544   if (BE (sbcset == NULL, 0))
3545 #endif /* not RE_ENABLE_I18N */
3546     {
3547       *err = REG_ESPACE;
3548       return NULL;
3549     }
3550
3551   if (non_match)
3552     {
3553 #ifdef RE_ENABLE_I18N
3554       /*
3555       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3556         bitset_set(cset->sbcset, '\0');
3557       */
3558       mbcset->non_match = 1;
3559 #endif /* not RE_ENABLE_I18N */
3560     }
3561
3562   /* We don't care the syntax in this case.  */
3563   ret = build_charclass (trans, sbcset,
3564 #ifdef RE_ENABLE_I18N
3565                          mbcset, &alloc,
3566 #endif /* RE_ENABLE_I18N */
3567                          class_name, 0);
3568
3569   if (BE (ret != REG_NOERROR, 0))
3570     {
3571       re_free (sbcset);
3572 #ifdef RE_ENABLE_I18N
3573       free_charset (mbcset);
3574 #endif /* RE_ENABLE_I18N */
3575       *err = ret;
3576       return NULL;
3577     }
3578   /* \w match '_' also.  */
3579   for (; *extra; extra++)
3580     bitset_set (sbcset, *extra);
3581
3582   /* If it is non-matching list.  */
3583   if (non_match)
3584     bitset_not (sbcset);
3585
3586 #ifdef RE_ENABLE_I18N
3587   /* Ensure only single byte characters are set.  */
3588   if (dfa->mb_cur_max > 1)
3589     bitset_mask (sbcset, dfa->sb_char);
3590 #endif
3591
3592   /* Build a tree for simple bracket.  */
3593   br_token.type = SIMPLE_BRACKET;
3594   br_token.opr.sbcset = sbcset;
3595   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3596   if (BE (tree == NULL, 0))
3597     goto build_word_op_espace;
3598
3599 #ifdef RE_ENABLE_I18N
3600   if (dfa->mb_cur_max > 1)
3601     {
3602       bin_tree_t *mbc_tree;
3603       /* Build a tree for complex bracket.  */
3604       br_token.type = COMPLEX_BRACKET;
3605       br_token.opr.mbcset = mbcset;
3606       dfa->has_mb_node = 1;
3607       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3608       if (BE (mbc_tree == NULL, 0))
3609         goto build_word_op_espace;
3610       /* Then join them by ALT node.  */
3611       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3612       if (BE (mbc_tree != NULL, 1))
3613         return tree;
3614     }
3615   else
3616     {
3617       free_charset (mbcset);
3618       return tree;
3619     }
3620 #else /* not RE_ENABLE_I18N */
3621   return tree;
3622 #endif /* not RE_ENABLE_I18N */
3623
3624  build_word_op_espace:
3625   re_free (sbcset);
3626 #ifdef RE_ENABLE_I18N
3627   free_charset (mbcset);
3628 #endif /* RE_ENABLE_I18N */
3629   *err = REG_ESPACE;
3630   return NULL;
3631 }
3632
3633 /* This is intended for the expressions like "a{1,3}".
3634    Fetch a number from `input', and return the number.
3635    Return -1, if the number field is empty like "{,1}".
3636    Return -2, If an error is occured.  */
3637
3638 static int
3639 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3640 {
3641   int num = -1;
3642   unsigned char c;
3643   while (1)
3644     {
3645       fetch_token (token, input, syntax);
3646       c = token->opr.c;
3647       if (BE (token->type == END_OF_RE, 0))
3648         return -2;
3649       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3650         break;
3651       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3652              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3653       num = (num > RE_DUP_MAX) ? -2 : num;
3654     }
3655   return num;
3656 }
3657 \f
3658 #ifdef RE_ENABLE_I18N
3659 static void
3660 free_charset (re_charset_t *cset)
3661 {
3662   re_free (cset->mbchars);
3663 # ifdef _LIBC
3664   re_free (cset->coll_syms);
3665   re_free (cset->equiv_classes);
3666   re_free (cset->range_starts);
3667   re_free (cset->range_ends);
3668 # endif
3669   re_free (cset->char_classes);
3670   re_free (cset);
3671 }
3672 #endif /* RE_ENABLE_I18N */
3673 \f
3674 /* Functions for binary tree operation.  */
3675
3676 /* Create a tree node.  */
3677
3678 static bin_tree_t *
3679 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3680              re_token_type_t type)
3681 {
3682   re_token_t t;
3683   t.type = type;
3684   return create_token_tree (dfa, left, right, &t);
3685 }
3686
3687 static bin_tree_t *
3688 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3689                    const re_token_t *token)
3690 {
3691   bin_tree_t *tree;
3692   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3693     {
3694       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3695
3696       if (storage == NULL)
3697         return NULL;
3698       storage->next = dfa->str_tree_storage;
3699       dfa->str_tree_storage = storage;
3700       dfa->str_tree_storage_idx = 0;
3701     }
3702   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3703
3704   tree->parent = NULL;
3705   tree->left = left;
3706   tree->right = right;
3707   tree->token = *token;
3708   tree->token.duplicated = 0;
3709   tree->token.opt_subexp = 0;
3710   tree->first = NULL;
3711   tree->next = NULL;
3712   tree->node_idx = -1;
3713
3714   if (left != NULL)
3715     left->parent = tree;
3716   if (right != NULL)
3717     right->parent = tree;
3718   return tree;
3719 }
3720
3721 /* Mark the tree SRC as an optional subexpression.
3722    To be called from preorder or postorder.  */
3723
3724 static reg_errcode_t
3725 mark_opt_subexp (void *extra, bin_tree_t *node)
3726 {
3727   int idx = (int) (long) extra;
3728   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3729     node->token.opt_subexp = 1;
3730
3731   return REG_NOERROR;
3732 }
3733
3734 /* Free the allocated memory inside NODE. */
3735
3736 static void
3737 free_token (re_token_t *node)
3738 {
3739 #ifdef RE_ENABLE_I18N
3740   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3741     free_charset (node->opr.mbcset);
3742   else
3743 #endif /* RE_ENABLE_I18N */
3744     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3745       re_free (node->opr.sbcset);
3746 }
3747
3748 /* Worker function for tree walking.  Free the allocated memory inside NODE
3749    and its children. */
3750
3751 static reg_errcode_t
3752 free_tree (void *extra, bin_tree_t *node)
3753 {
3754   free_token (&node->token);
3755   return REG_NOERROR;
3756 }
3757
3758
3759 /* Duplicate the node SRC, and return new node.  This is a preorder
3760    visit similar to the one implemented by the generic visitor, but
3761    we need more infrastructure to maintain two parallel trees --- so,
3762    it's easier to duplicate.  */
3763
3764 static bin_tree_t *
3765 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3766 {
3767   const bin_tree_t *node;
3768   bin_tree_t *dup_root;
3769   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3770
3771   for (node = root; ; )
3772     {
3773       /* Create a new tree and link it back to the current parent.  */
3774       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3775       if (*p_new == NULL)
3776         return NULL;
3777       (*p_new)->parent = dup_node;
3778       (*p_new)->token.duplicated = 1;
3779       dup_node = *p_new;
3780
3781       /* Go to the left node, or up and to the right.  */
3782       if (node->left)
3783         {
3784           node = node->left;
3785           p_new = &dup_node->left;
3786         }
3787       else
3788         {
3789           const bin_tree_t *prev = NULL;
3790           while (node->right == prev || node->right == NULL)
3791             {
3792               prev = node;
3793               node = node->parent;
3794               dup_node = dup_node->parent;
3795               if (!node)
3796                 return dup_root;
3797             }
3798           node = node->right;
3799           p_new = &dup_node->right;
3800         }
3801     }
3802 }