posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 {
 218   reg_errcode_t ret;
 219
 220   /* And GNU code determines whether or not to get register information
 221      by passing null for the REGS argument to re_match, etc., not by
 222      setting no_sub, unless RE_NO_SUB is set.  */
 223   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 224
 225   /* Match anchors at newline.  */
 226   bufp->newline_anchor = 1;
 227
 228   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 229
 230   if (!ret)
 231     return NULL;
 232   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 233 }
 234 #ifdef _LIBC
 235 weak_alias (__re_compile_pattern, re_compile_pattern)
 236 #endif
 237
 238 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 239    also be assigned to arbitrarily: each pattern buffer stores its own
 240    syntax, so it can be changed between regex compilations.  */
 241 /* This has no initializer because initialized variables in Emacs
 242    become read-only after dumping.  */
 243 reg_syntax_t re_syntax_options;
 244
 245
 246 /* Specify the precise syntax of regexps for compilation.  This provides
 247    for compatibility for various utilities which historically have
 248    different, incompatible syntaxes.
 249
 250    The argument SYNTAX is a bit mask comprised of the various bits
 251    defined in regex.h.  We return the old syntax.  */
 252
 253 reg_syntax_t
 254 re_set_syntax (syntax)
 255     reg_syntax_t syntax;
 256 {
 257   reg_syntax_t ret = re_syntax_options;
 258
 259   re_syntax_options = syntax;
 260   return ret;
 261 }
 262 #ifdef _LIBC
 263 weak_alias (__re_set_syntax, re_set_syntax)
 264 #endif
 265
 266 int
 267 re_compile_fastmap (bufp)
 268     struct re_pattern_buffer *bufp;
 269 {
 270   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute ((always_inline))
 290 re_set_fastmap (char *fastmap, int icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 305   int node_cnt;
 306   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       int node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 319               wchar_t wc;
 320               mbstate_t state;
 321
 322               p = buf;
 323               *p++ = dfa->nodes[node].opr.c;
 324               while (++node < dfa->nodes_len
 325                      && dfa->nodes[node].type == CHARACTER
 326                      && dfa->nodes[node].mb_partial)
 327                 *p++ = dfa->nodes[node].opr.c;
 328               memset (&state, '\0', sizeof (state));
 329               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 330                              &state) == p - buf
 331                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 332                       != (size_t) -1))
 333                 re_set_fastmap (fastmap, 0, buf[0]);
 334             }
 335 #endif
 336         }
 337       else if (type == SIMPLE_BRACKET)
 338         {
 339           int i, ch;
 340           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 341             {
 342               int j;
 343               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 344               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 345                 if (w & ((bitset_word_t) 1 << j))
 346                   re_set_fastmap (fastmap, icase, ch);
 347             }
 348         }
 349 #ifdef RE_ENABLE_I18N
 350       else if (type == COMPLEX_BRACKET)
 351         {
 352           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 353           int i;
 354
 355 # ifdef _LIBC
 356           /* See if we have to try all bytes which start multiple collation
 357              elements.
 358              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 359                   collation element, and don't catch 'b' since 'b' is
 360                   the only collation element which starts from 'b' (and
 361                   it is caught by SIMPLE_BRACKET).  */
 362               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 363                   && (cset->ncoll_syms || cset->nranges))
 364                 {
 365                   const int32_t *table = (const int32_t *)
 366                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 367                   for (i = 0; i < SBC_MAX; ++i)
 368                     if (table[i] < 0)
 369                       re_set_fastmap (fastmap, icase, i);
 370                 }
 371 # endif /* _LIBC */
 372
 373           /* See if we have to start the match at all multibyte characters,
 374              i.e. where we would not find an invalid sequence.  This only
 375              applies to multibyte character sets; for single byte character
 376              sets, the SIMPLE_BRACKET again suffices.  */
 377           if (dfa->mb_cur_max > 1
 378               && (cset->nchar_classes || cset->non_match || cset->nranges
 379 # ifdef _LIBC
 380                   || cset->nequiv_classes
 381 # endif /* _LIBC */
 382                  ))
 383             {
 384               unsigned char c = 0;
 385               do
 386                 {
 387                   mbstate_t mbs;
 388                   memset (&mbs, 0, sizeof (mbs));
 389                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 390                     re_set_fastmap (fastmap, false, (int) c);
 391                 }
 392               while (++c != 0);
 393             }
 394
 395           else
 396             {
 397               /* ... Else catch all bytes which can start the mbchars.  */
 398               for (i = 0; i < cset->nmbchars; ++i)
 399                 {
 400                   char buf[256];
 401                   mbstate_t state;
 402                   memset (&state, '\0', sizeof (state));
 403                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 404                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 405                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 406                     {
 407                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 408                           != (size_t) -1)
 409                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 410                     }
 411                 }
 412             }
 413         }
 414 #endif /* RE_ENABLE_I18N */
 415       else if (type == OP_PERIOD
 416 #ifdef RE_ENABLE_I18N
 417                || type == OP_UTF8_PERIOD
 418 #endif /* RE_ENABLE_I18N */
 419                || type == END_OF_RE)
 420         {
 421           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 422           if (type == END_OF_RE)
 423             bufp->can_be_null = 1;
 424           return;
 425         }
 426     }
 427 }
 428 \f
 429 /* Entry point for POSIX code.  */
 430 /* regcomp takes a regular expression as a string and compiles it.
 431
 432    PREG is a regex_t *.  We do not expect any fields to be initialized,
 433    since POSIX says we shouldn't.  Thus, we set
 434
 435      `buffer' to the compiled pattern;
 436      `used' to the length of the compiled pattern;
 437      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 438        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 439        RE_SYNTAX_POSIX_BASIC;
 440      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 441      `fastmap' to an allocated space for the fastmap;
 442      `fastmap_accurate' to zero;
 443      `re_nsub' to the number of subexpressions in PATTERN.
 444
 445    PATTERN is the address of the pattern string.
 446
 447    CFLAGS is a series of bits which affect compilation.
 448
 449      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 450      use POSIX basic syntax.
 451
 452      If REG_NEWLINE is set, then . and [^...] don't match newline.
 453      Also, regexec will try a match beginning after every newline.
 454
 455      If REG_ICASE is set, then we considers upper- and lowercase
 456      versions of letters to be equivalent when matching.
 457
 458      If REG_NOSUB is set, then when PREG is passed to regexec, that
 459      routine will report only success or failure, and nothing about the
 460      registers.
 461
 462    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 463    the return codes and their meanings.)  */
 464
 465 int
 466 regcomp (preg, pattern, cflags)
 467     regex_t *__restrict preg;
 468     const char *__restrict pattern;
 469     int cflags;
 470 {
 471   reg_errcode_t ret;
 472   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 473                          : RE_SYNTAX_POSIX_BASIC);
 474
 475   preg->buffer = NULL;
 476   preg->allocated = 0;
 477   preg->used = 0;
 478
 479   /* Try to allocate space for the fastmap.  */
 480   preg->fastmap = re_malloc (char, SBC_MAX);
 481   if (BE (preg->fastmap == NULL, 0))
 482     return REG_ESPACE;
 483
 484   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 485
 486   /* If REG_NEWLINE is set, newlines are treated differently.  */
 487   if (cflags & REG_NEWLINE)
 488     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 489       syntax &= ~RE_DOT_NEWLINE;
 490       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 491       /* It also changes the matching behavior.  */
 492       preg->newline_anchor = 1;
 493     }
 494   else
 495     preg->newline_anchor = 0;
 496   preg->no_sub = !!(cflags & REG_NOSUB);
 497   preg->translate = NULL;
 498
 499   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 500
 501   /* POSIX doesn't distinguish between an unmatched open-group and an
 502      unmatched close-group: both are REG_EPAREN.  */
 503   if (ret == REG_ERPAREN)
 504     ret = REG_EPAREN;
 505
 506   /* We have already checked preg->fastmap != NULL.  */
 507   if (BE (ret == REG_NOERROR, 1))
 508     /* Compute the fastmap now, since regexec cannot modify the pattern
 509        buffer.  This function never fails in this implementation.  */
 510     (void) re_compile_fastmap (preg);
 511   else
 512     {
 513       /* Some error occurred while compiling the expression.  */
 514       re_free (preg->fastmap);
 515       preg->fastmap = NULL;
 516     }
 517
 518   return (int) ret;
 519 }
 520 #ifdef _LIBC
 521 weak_alias (__regcomp, regcomp)
 522 #endif
 523
 524 /* Returns a message corresponding to an error code, ERRCODE, returned
 525    from either regcomp or regexec.   We don't use PREG here.  */
 526
 527 size_t
 528 regerror (errcode, preg, errbuf, errbuf_size)
 529     int errcode;
 530     const regex_t *__restrict preg;
 531     char *__restrict errbuf;
 532     size_t errbuf_size;
 533 {
 534   const char *msg;
 535   size_t msg_size;
 536
 537   if (BE (errcode < 0
 538           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 539                                / sizeof (__re_error_msgid_idx[0])), 0))
 540     /* Only error codes returned by the rest of the code should be passed
 541        to this routine.  If we are given anything else, or if other regex
 542        code generates an invalid error code, then the program has a bug.
 543        Dump core so we can fix it.  */
 544     abort ();
 545
 546   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 547
 548   msg_size = strlen (msg) + 1; /* Includes the null.  */
 549
 550   if (BE (errbuf_size != 0, 1))
 551     {
 552       if (BE (msg_size > errbuf_size, 0))
 553         {
 554 #if defined HAVE_MEMPCPY || defined _LIBC
 555           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 556 #else
 557           memcpy (errbuf, msg, errbuf_size - 1);
 558           errbuf[errbuf_size - 1] = 0;
 559 #endif
 560         }
 561       else
 562         memcpy (errbuf, msg, msg_size);
 563     }
 564
 565   return msg_size;
 566 }
 567 #ifdef _LIBC
 568 weak_alias (__regerror, regerror)
 569 #endif
 570
 571
 572 #ifdef RE_ENABLE_I18N
 573 /* This static array is used for the map to single-byte characters when
 574    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 575    it the same all the time.  UTF-8 is the preferred encoding so this is
 576    a worthwhile optimization.  */
 577 static const bitset_t utf8_sb_map =
 578 {
 579   /* Set the first 128 bits.  */
 580   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 581 };
 582 #endif
 583
 584
 585 static void
 586 free_dfa_content (re_dfa_t *dfa)
 587 {
 588   int i, j;
 589
 590   if (dfa->nodes)
 591     for (i = 0; i < dfa->nodes_len; ++i)
 592       free_token (dfa->nodes + i);
 593   re_free (dfa->nexts);
 594   for (i = 0; i < dfa->nodes_len; ++i)
 595     {
 596       if (dfa->eclosures != NULL)
 597         re_node_set_free (dfa->eclosures + i);
 598       if (dfa->inveclosures != NULL)
 599         re_node_set_free (dfa->inveclosures + i);
 600       if (dfa->edests != NULL)
 601         re_node_set_free (dfa->edests + i);
 602     }
 603   re_free (dfa->edests);
 604   re_free (dfa->eclosures);
 605   re_free (dfa->inveclosures);
 606   re_free (dfa->nodes);
 607
 608   if (dfa->state_table)
 609     for (i = 0; i <= dfa->state_hash_mask; ++i)
 610       {
 611         struct re_state_table_entry *entry = dfa->state_table + i;
 612         for (j = 0; j < entry->num; ++j)
 613           {
 614             re_dfastate_t *state = entry->array[j];
 615             free_state (state);
 616           }
 617         re_free (entry->array);
 618       }
 619   re_free (dfa->state_table);
 620 #ifdef RE_ENABLE_I18N
 621   if (dfa->sb_char != utf8_sb_map)
 622     re_free (dfa->sb_char);
 623 #endif
 624   re_free (dfa->subexp_map);
 625 #ifdef DEBUG
 626   re_free (dfa->re_str);
 627 #endif
 628
 629   re_free (dfa);
 630 }
 631
 632
 633 /* Free dynamically allocated space used by PREG.  */
 634
 635 void
 636 regfree (preg)
 637     regex_t *preg;
 638 {
 639   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 640   if (BE (dfa != NULL, 1))
 641     free_dfa_content (dfa);
 642   preg->buffer = NULL;
 643   preg->allocated = 0;
 644
 645   re_free (preg->fastmap);
 646   preg->fastmap = NULL;
 647
 648   re_free (preg->translate);
 649   preg->translate = NULL;
 650 }
 651 #ifdef _LIBC
 652 weak_alias (__regfree, regfree)
 653 #endif
 654 \f
 655 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 656    them unless specifically requested.  */
 657
 658 #if defined _REGEX_RE_COMP || defined _LIBC
 659
 660 /* BSD has one and only one pattern buffer.  */
 661 static struct re_pattern_buffer re_comp_buf;
 662
 663 char *
 664 # ifdef _LIBC
 665 /* Make these definitions weak in libc, so POSIX programs can redefine
 666    these names if they don't use our functions, and still use
 667    regcomp/regexec above without link errors.  */
 668 weak_function
 669 # endif
 670 re_comp (s)
 671      const char *s;
 672 {
 673   reg_errcode_t ret;
 674   char *fastmap;
 675
 676   if (!s)
 677     {
 678       if (!re_comp_buf.buffer)
 679         return gettext ("No previous regular expression");
 680       return 0;
 681     }
 682
 683   if (re_comp_buf.buffer)
 684     {
 685       fastmap = re_comp_buf.fastmap;
 686       re_comp_buf.fastmap = NULL;
 687       __regfree (&re_comp_buf);
 688       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 689       re_comp_buf.fastmap = fastmap;
 690     }
 691
 692   if (re_comp_buf.fastmap == NULL)
 693     {
 694       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 695       if (re_comp_buf.fastmap == NULL)
 696         return (char *) gettext (__re_error_msgid
 697                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 698     }
 699
 700   /* Since `re_exec' always passes NULL for the `regs' argument, we
 701      don't need to initialize the pattern buffer fields which affect it.  */
 702
 703   /* Match anchors at newlines.  */
 704   re_comp_buf.newline_anchor = 1;
 705
 706   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 707
 708   if (!ret)
 709     return NULL;
 710
 711   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 712   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 713 }
 714
 715 #ifdef _LIBC
 716 libc_freeres_fn (free_mem)
 717 {
 718   __regfree (&re_comp_buf);
 719 }
 720 #endif
 721
 722 #endif /* _REGEX_RE_COMP */
 723 \f
 724 /* Internal entry point.
 725    Compile the regular expression PATTERN, whose length is LENGTH.
 726    SYNTAX indicate regular expression's syntax.  */
 727
 728 static reg_errcode_t
 729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 730                      reg_syntax_t syntax)
 731 {
 732   reg_errcode_t err = REG_NOERROR;
 733   re_dfa_t *dfa;
 734   re_string_t regexp;
 735
 736   /* Initialize the pattern buffer.  */
 737   preg->fastmap_accurate = 0;
 738   preg->syntax = syntax;
 739   preg->not_bol = preg->not_eol = 0;
 740   preg->used = 0;
 741   preg->re_nsub = 0;
 742   preg->can_be_null = 0;
 743   preg->regs_allocated = REGS_UNALLOCATED;
 744
 745   /* Initialize the dfa.  */
 746   dfa = (re_dfa_t *) preg->buffer;
 747   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 748     {
 749       /* If zero allocated, but buffer is non-null, try to realloc
 750          enough space.  This loses if buffer's address is bogus, but
 751          that is the user's responsibility.  If ->buffer is NULL this
 752          is a simple allocation.  */
 753       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 754       if (dfa == NULL)
 755         return REG_ESPACE;
 756       preg->allocated = sizeof (re_dfa_t);
 757       preg->buffer = (unsigned char *) dfa;
 758     }
 759   preg->used = sizeof (re_dfa_t);
 760
 761   err = init_dfa (dfa, length);
 762   if (BE (err != REG_NOERROR, 0))
 763     {
 764       free_dfa_content (dfa);
 765       preg->buffer = NULL;
 766       preg->allocated = 0;
 767       return err;
 768     }
 769 #ifdef DEBUG
 770   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 771   dfa->re_str = re_malloc (char, length + 1);
 772   strncpy (dfa->re_str, pattern, length + 1);
 773 #endif
 774
 775   __libc_lock_init (dfa->lock);
 776
 777   err = re_string_construct (&regexp, pattern, length, preg->translate,
 778                              syntax & RE_ICASE, dfa);
 779   if (BE (err != REG_NOERROR, 0))
 780     {
 781     re_compile_internal_free_return:
 782       free_workarea_compile (preg);
 783       re_string_destruct (&regexp);
 784       free_dfa_content (dfa);
 785       preg->buffer = NULL;
 786       preg->allocated = 0;
 787       return err;
 788     }
 789
 790   /* Parse the regular expression, and build a structure tree.  */
 791   preg->re_nsub = 0;
 792   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 793   if (BE (dfa->str_tree == NULL, 0))
 794     goto re_compile_internal_free_return;
 795
 796   /* Analyze the tree and create the nfa.  */
 797   err = analyze (preg);
 798   if (BE (err != REG_NOERROR, 0))
 799     goto re_compile_internal_free_return;
 800
 801 #ifdef RE_ENABLE_I18N
 802   /* If possible, do searching in single byte encoding to speed things up.  */
 803   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 804     optimize_utf8 (dfa);
 805 #endif
 806
 807   /* Then create the initial state of the dfa.  */
 808   err = create_initial_state (dfa);
 809
 810   /* Release work areas.  */
 811   free_workarea_compile (preg);
 812   re_string_destruct (&regexp);
 813
 814   if (BE (err != REG_NOERROR, 0))
 815     {
 816       free_dfa_content (dfa);
 817       preg->buffer = NULL;
 818       preg->allocated = 0;
 819     }
 820
 821   return err;
 822 }
 823
 824 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 825    as the initial length of some arrays.  */
 826
 827 static reg_errcode_t
 828 init_dfa (re_dfa_t *dfa, size_t pat_len)
 829 {
 830   unsigned int table_size;
 831 #ifndef _LIBC
 832   char *codeset_name;
 833 #endif
 834
 835   memset (dfa, '\0', sizeof (re_dfa_t));
 836
 837   /* Force allocation of str_tree_storage the first time.  */
 838   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 839
 840   /* Avoid overflows.  */
 841   if (pat_len == SIZE_MAX)
 842     return REG_ESPACE;
 843
 844   dfa->nodes_alloc = pat_len + 1;
 845   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 846
 847   /*  table_size = 2 ^ ceil(log pat_len) */
 848   for (table_size = 1; ; table_size <<= 1)
 849     if (table_size > pat_len)
 850       break;
 851
 852   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 853   dfa->state_hash_mask = table_size - 1;
 854
 855   dfa->mb_cur_max = MB_CUR_MAX;
 856 #ifdef _LIBC
 857   if (dfa->mb_cur_max == 6
 858       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 859     dfa->is_utf8 = 1;
 860   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 861                        != 0);
 862 #else
 863 # ifdef HAVE_LANGINFO_CODESET
 864   codeset_name = nl_langinfo (CODESET);
 865 # else
 866   codeset_name = getenv ("LC_ALL");
 867   if (codeset_name == NULL || codeset_name[0] == '\0')
 868     codeset_name = getenv ("LC_CTYPE");
 869   if (codeset_name == NULL || codeset_name[0] == '\0')
 870     codeset_name = getenv ("LANG");
 871   if (codeset_name == NULL)
 872     codeset_name = "";
 873   else if (strchr (codeset_name, '.') !=  NULL)
 874     codeset_name = strchr (codeset_name, '.') + 1;
 875 # endif
 876
 877   if (strcasecmp (codeset_name, "UTF-8") == 0
 878       || strcasecmp (codeset_name, "UTF8") == 0)
 879     dfa->is_utf8 = 1;
 880
 881   /* We check exhaustively in the loop below if this charset is a
 882      superset of ASCII.  */
 883   dfa->map_notascii = 0;
 884 #endif
 885
 886 #ifdef RE_ENABLE_I18N
 887   if (dfa->mb_cur_max > 1)
 888     {
 889       if (dfa->is_utf8)
 890         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 891       else
 892         {
 893           int i, j, ch;
 894
 895           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 896           if (BE (dfa->sb_char == NULL, 0))
 897             return REG_ESPACE;
 898
 899           /* Set the bits corresponding to single byte chars.  */
 900           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 901             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 902               {
 903                 wint_t wch = __btowc (ch);
 904                 if (wch != WEOF)
 905                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 906 # ifndef _LIBC
 907                 if (isascii (ch) && wch != ch)
 908                   dfa->map_notascii = 1;
 909 # endif
 910               }
 911         }
 912     }
 913 #endif
 914
 915   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 916     return REG_ESPACE;
 917   return REG_NOERROR;
 918 }
 919
 920 /* Initialize WORD_CHAR table, which indicate which character is
 921    "word".  In this case "word" means that it is the word construction
 922    character used by some operators like "\<", "\>", etc.  */
 923
 924 static void
 925 internal_function
 926 init_word_char (re_dfa_t *dfa)
 927 {
 928   dfa->word_ops_used = 1;
 929   int i = 0;
 930   int ch = 0;
 931   if (BE (dfa->map_notascii == 0, 1))
 932     {
 933       if (sizeof (dfa->word_char[0]) == 8)
 934         {
 935           dfa->word_char[0] = UINT64_C (0x03ff000000000000);
 936           dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
 937           i = 2;
 938         }
 939       else if (sizeof (dfa->word_char[0]) == 4)
 940         {
 941           dfa->word_char[0] = UINT32_C (0x00000000);
 942           dfa->word_char[1] = UINT32_C (0x03ff0000);
 943           dfa->word_char[2] = UINT32_C (0x87fffffe);
 944           dfa->word_char[3] = UINT32_C (0x07fffffe);
 945           i = 4;
 946         }
 947       else
 948         abort ();
 949       ch = 128;
 950
 951       if (BE (dfa->is_utf8, 1))
 952         {
 953           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 954           return;
 955         }
 956     }
 957
 958   for (; i < BITSET_WORDS; ++i)
 959     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 960       if (isalnum (ch) || ch == '_')
 961         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 962 }
 963
 964 /* Free the work area which are only used while compiling.  */
 965
 966 static void
 967 free_workarea_compile (regex_t *preg)
 968 {
 969   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 970   bin_tree_storage_t *storage, *next;
 971   for (storage = dfa->str_tree_storage; storage; storage = next)
 972     {
 973       next = storage->next;
 974       re_free (storage);
 975     }
 976   dfa->str_tree_storage = NULL;
 977   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 978   dfa->str_tree = NULL;
 979   re_free (dfa->org_indices);
 980   dfa->org_indices = NULL;
 981 }
 982
 983 /* Create initial states for all contexts.  */
 984
 985 static reg_errcode_t
 986 create_initial_state (re_dfa_t *dfa)
 987 {
 988   int first, i;
 989   reg_errcode_t err;
 990   re_node_set init_nodes;
 991
 992   /* Initial states have the epsilon closure of the node which is
 993      the first node of the regular expression.  */
 994   first = dfa->str_tree->first->node_idx;
 995   dfa->init_node = first;
 996   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 997   if (BE (err != REG_NOERROR, 0))
 998     return err;
 999
1000   /* The back-references which are in initial states can epsilon transit,
1001      since in this case all of the subexpressions can be null.
1002      Then we add epsilon closures of the nodes which are the next nodes of
1003      the back-references.  */
1004   if (dfa->nbackref > 0)
1005     for (i = 0; i < init_nodes.nelem; ++i)
1006       {
1007         int node_idx = init_nodes.elems[i];
1008         re_token_type_t type = dfa->nodes[node_idx].type;
1009
1010         int clexp_idx;
1011         if (type != OP_BACK_REF)
1012           continue;
1013         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1014           {
1015             re_token_t *clexp_node;
1016             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1017             if (clexp_node->type == OP_CLOSE_SUBEXP
1018                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1019               break;
1020           }
1021         if (clexp_idx == init_nodes.nelem)
1022           continue;
1023
1024         if (type == OP_BACK_REF)
1025           {
1026             int dest_idx = dfa->edests[node_idx].elems[0];
1027             if (!re_node_set_contains (&init_nodes, dest_idx))
1028               {
1029                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1030                                                        dfa->eclosures
1031                                                        + dest_idx);
1032                 if (err != REG_NOERROR)
1033                   return err;
1034                 i = 0;
1035               }
1036           }
1037       }
1038
1039   /* It must be the first time to invoke acquire_state.  */
1040   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1041   /* We don't check ERR here, since the initial state must not be NULL.  */
1042   if (BE (dfa->init_state == NULL, 0))
1043     return err;
1044   if (dfa->init_state->has_constraint)
1045     {
1046       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1047                                                        CONTEXT_WORD);
1048       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1049                                                      CONTEXT_NEWLINE);
1050       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1051                                                          &init_nodes,
1052                                                          CONTEXT_NEWLINE
1053                                                          | CONTEXT_BEGBUF);
1054       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1055               || dfa->init_state_begbuf == NULL, 0))
1056         return err;
1057     }
1058   else
1059     dfa->init_state_word = dfa->init_state_nl
1060       = dfa->init_state_begbuf = dfa->init_state;
1061
1062   re_node_set_free (&init_nodes);
1063   return REG_NOERROR;
1064 }
1065 \f
1066 #ifdef RE_ENABLE_I18N
1067 /* If it is possible to do searching in single byte encoding instead of UTF-8
1068    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1069    DFA nodes where needed.  */
1070
1071 static void
1072 optimize_utf8 (re_dfa_t *dfa)
1073 {
1074   int node, i, mb_chars = 0, has_period = 0;
1075
1076   for (node = 0; node < dfa->nodes_len; ++node)
1077     switch (dfa->nodes[node].type)
1078       {
1079       case CHARACTER:
1080         if (dfa->nodes[node].opr.c >= 0x80)
1081           mb_chars = 1;
1082         break;
1083       case ANCHOR:
1084         switch (dfa->nodes[node].opr.ctx_type)
1085           {
1086           case LINE_FIRST:
1087           case LINE_LAST:
1088           case BUF_FIRST:
1089           case BUF_LAST:
1090             break;
1091           default:
1092             /* Word anchors etc. cannot be handled.  It's okay to test
1093                opr.ctx_type since constraints (for all DFA nodes) are
1094                created by ORing one or more opr.ctx_type values.  */
1095             return;
1096           }
1097         break;
1098       case OP_PERIOD:
1099         has_period = 1;
1100         break;
1101       case OP_BACK_REF:
1102       case OP_ALT:
1103       case END_OF_RE:
1104       case OP_DUP_ASTERISK:
1105       case OP_OPEN_SUBEXP:
1106       case OP_CLOSE_SUBEXP:
1107         break;
1108       case COMPLEX_BRACKET:
1109         return;
1110       case SIMPLE_BRACKET:
1111         /* Just double check.  The non-ASCII range starts at 0x80.  */
1112         assert (0x80 % BITSET_WORD_BITS == 0);
1113         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1114           if (dfa->nodes[node].opr.sbcset[i])
1115             return;
1116         break;
1117       default:
1118         abort ();
1119       }
1120
1121   if (mb_chars || has_period)
1122     for (node = 0; node < dfa->nodes_len; ++node)
1123       {
1124         if (dfa->nodes[node].type == CHARACTER
1125             && dfa->nodes[node].opr.c >= 0x80)
1126           dfa->nodes[node].mb_partial = 0;
1127         else if (dfa->nodes[node].type == OP_PERIOD)
1128           dfa->nodes[node].type = OP_UTF8_PERIOD;
1129       }
1130
1131   /* The search can be in single byte locale.  */
1132   dfa->mb_cur_max = 1;
1133   dfa->is_utf8 = 0;
1134   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1135 }
1136 #endif
1137 \f
1138 /* Analyze the structure tree, and calculate "first", "next", "edest",
1139    "eclosure", and "inveclosure".  */
1140
1141 static reg_errcode_t
1142 analyze (regex_t *preg)
1143 {
1144   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1145   reg_errcode_t ret;
1146
1147   /* Allocate arrays.  */
1148   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1149   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1150   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1151   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1152   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1153           || dfa->eclosures == NULL, 0))
1154     return REG_ESPACE;
1155
1156   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1157   if (dfa->subexp_map != NULL)
1158     {
1159       int i;
1160       for (i = 0; i < preg->re_nsub; i++)
1161         dfa->subexp_map[i] = i;
1162       preorder (dfa->str_tree, optimize_subexps, dfa);
1163       for (i = 0; i < preg->re_nsub; i++)
1164         if (dfa->subexp_map[i] != i)
1165           break;
1166       if (i == preg->re_nsub)
1167         {
1168           free (dfa->subexp_map);
1169           dfa->subexp_map = NULL;
1170         }
1171     }
1172
1173   ret = postorder (dfa->str_tree, lower_subexps, preg);
1174   if (BE (ret != REG_NOERROR, 0))
1175     return ret;
1176   ret = postorder (dfa->str_tree, calc_first, dfa);
1177   if (BE (ret != REG_NOERROR, 0))
1178     return ret;
1179   preorder (dfa->str_tree, calc_next, dfa);
1180   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1181   if (BE (ret != REG_NOERROR, 0))
1182     return ret;
1183   ret = calc_eclosure (dfa);
1184   if (BE (ret != REG_NOERROR, 0))
1185     return ret;
1186
1187   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1188      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1189   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1190       || dfa->nbackref)
1191     {
1192       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1193       if (BE (dfa->inveclosures == NULL, 0))
1194         return REG_ESPACE;
1195       ret = calc_inveclosure (dfa);
1196     }
1197
1198   return ret;
1199 }
1200
1201 /* Our parse trees are very unbalanced, so we cannot use a stack to
1202    implement parse tree visits.  Instead, we use parent pointers and
1203    some hairy code in these two functions.  */
1204 static reg_errcode_t
1205 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1206            void *extra)
1207 {
1208   bin_tree_t *node, *prev;
1209
1210   for (node = root; ; )
1211     {
1212       /* Descend down the tree, preferably to the left (or to the right
1213          if that's the only child).  */
1214       while (node->left || node->right)
1215         if (node->left)
1216           node = node->left;
1217         else
1218           node = node->right;
1219
1220       do
1221         {
1222           reg_errcode_t err = fn (extra, node);
1223           if (BE (err != REG_NOERROR, 0))
1224             return err;
1225           if (node->parent == NULL)
1226             return REG_NOERROR;
1227           prev = node;
1228           node = node->parent;
1229         }
1230       /* Go up while we have a node that is reached from the right.  */
1231       while (node->right == prev || node->right == NULL);
1232       node = node->right;
1233     }
1234 }
1235
1236 static reg_errcode_t
1237 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238           void *extra)
1239 {
1240   bin_tree_t *node;
1241
1242   for (node = root; ; )
1243     {
1244       reg_errcode_t err = fn (extra, node);
1245       if (BE (err != REG_NOERROR, 0))
1246         return err;
1247
1248       /* Go to the left node, or up and to the right.  */
1249       if (node->left)
1250         node = node->left;
1251       else
1252         {
1253           bin_tree_t *prev = NULL;
1254           while (node->right == prev || node->right == NULL)
1255             {
1256               prev = node;
1257               node = node->parent;
1258               if (!node)
1259                 return REG_NOERROR;
1260             }
1261           node = node->right;
1262         }
1263     }
1264 }
1265
1266 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1267    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1268    backreferences as well.  Requires a preorder visit.  */
1269 static reg_errcode_t
1270 optimize_subexps (void *extra, bin_tree_t *node)
1271 {
1272   re_dfa_t *dfa = (re_dfa_t *) extra;
1273
1274   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1275     {
1276       int idx = node->token.opr.idx;
1277       node->token.opr.idx = dfa->subexp_map[idx];
1278       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1279     }
1280
1281   else if (node->token.type == SUBEXP
1282            && node->left && node->left->token.type == SUBEXP)
1283     {
1284       int other_idx = node->left->token.opr.idx;
1285
1286       node->left = node->left->left;
1287       if (node->left)
1288         node->left->parent = node;
1289
1290       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1291       if (other_idx < BITSET_WORD_BITS)
1292           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1293     }
1294
1295   return REG_NOERROR;
1296 }
1297
1298 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1299    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1300 static reg_errcode_t
1301 lower_subexps (void *extra, bin_tree_t *node)
1302 {
1303   regex_t *preg = (regex_t *) extra;
1304   reg_errcode_t err = REG_NOERROR;
1305
1306   if (node->left && node->left->token.type == SUBEXP)
1307     {
1308       node->left = lower_subexp (&err, preg, node->left);
1309       if (node->left)
1310         node->left->parent = node;
1311     }
1312   if (node->right && node->right->token.type == SUBEXP)
1313     {
1314       node->right = lower_subexp (&err, preg, node->right);
1315       if (node->right)
1316         node->right->parent = node;
1317     }
1318
1319   return err;
1320 }
1321
1322 static bin_tree_t *
1323 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1324 {
1325   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1326   bin_tree_t *body = node->left;
1327   bin_tree_t *op, *cls, *tree1, *tree;
1328
1329   if (preg->no_sub
1330       /* We do not optimize empty subexpressions, because otherwise we may
1331          have bad CONCAT nodes with NULL children.  This is obviously not
1332          very common, so we do not lose much.  An example that triggers
1333          this case is the sed "script" /\(\)/x.  */
1334       && node->left != NULL
1335       && (node->token.opr.idx >= BITSET_WORD_BITS
1336           || !(dfa->used_bkref_map
1337                & ((bitset_word_t) 1 << node->token.opr.idx))))
1338     return node->left;
1339
1340   /* Convert the SUBEXP node to the concatenation of an
1341      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1342   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1343   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1344   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1345   tree = create_tree (dfa, op, tree1, CONCAT);
1346   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1347     {
1348       *err = REG_ESPACE;
1349       return NULL;
1350     }
1351
1352   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1353   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1354   return tree;
1355 }
1356
1357 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1358    nodes.  Requires a postorder visit.  */
1359 static reg_errcode_t
1360 calc_first (void *extra, bin_tree_t *node)
1361 {
1362   re_dfa_t *dfa = (re_dfa_t *) extra;
1363   if (node->token.type == CONCAT)
1364     {
1365       node->first = node->left->first;
1366       node->node_idx = node->left->node_idx;
1367     }
1368   else
1369     {
1370       node->first = node;
1371       node->node_idx = re_dfa_add_node (dfa, node->token);
1372       if (BE (node->node_idx == -1, 0))
1373         return REG_ESPACE;
1374       if (node->token.type == ANCHOR)
1375         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1376     }
1377   return REG_NOERROR;
1378 }
1379
1380 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1381 static reg_errcode_t
1382 calc_next (void *extra, bin_tree_t *node)
1383 {
1384   switch (node->token.type)
1385     {
1386     case OP_DUP_ASTERISK:
1387       node->left->next = node;
1388       break;
1389     case CONCAT:
1390       node->left->next = node->right->first;
1391       node->right->next = node->next;
1392       break;
1393     default:
1394       if (node->left)
1395         node->left->next = node->next;
1396       if (node->right)
1397         node->right->next = node->next;
1398       break;
1399     }
1400   return REG_NOERROR;
1401 }
1402
1403 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1404 static reg_errcode_t
1405 link_nfa_nodes (void *extra, bin_tree_t *node)
1406 {
1407   re_dfa_t *dfa = (re_dfa_t *) extra;
1408   int idx = node->node_idx;
1409   reg_errcode_t err = REG_NOERROR;
1410
1411   switch (node->token.type)
1412     {
1413     case CONCAT:
1414       break;
1415
1416     case END_OF_RE:
1417       assert (node->next == NULL);
1418       break;
1419
1420     case OP_DUP_ASTERISK:
1421     case OP_ALT:
1422       {
1423         int left, right;
1424         dfa->has_plural_match = 1;
1425         if (node->left != NULL)
1426           left = node->left->first->node_idx;
1427         else
1428           left = node->next->node_idx;
1429         if (node->right != NULL)
1430           right = node->right->first->node_idx;
1431         else
1432           right = node->next->node_idx;
1433         assert (left > -1);
1434         assert (right > -1);
1435         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436       }
1437       break;
1438
1439     case ANCHOR:
1440     case OP_OPEN_SUBEXP:
1441     case OP_CLOSE_SUBEXP:
1442       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443       break;
1444
1445     case OP_BACK_REF:
1446       dfa->nexts[idx] = node->next->node_idx;
1447       if (node->token.type == OP_BACK_REF)
1448         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1449       break;
1450
1451     default:
1452       assert (!IS_EPSILON_NODE (node->token.type));
1453       dfa->nexts[idx] = node->next->node_idx;
1454       break;
1455     }
1456
1457   return err;
1458 }
1459
1460 /* Duplicate the epsilon closure of the node ROOT_NODE.
1461    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462    to their own constraint.  */
1463
1464 static reg_errcode_t
1465 internal_function
1466 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1467                         int root_node, unsigned int init_constraint)
1468 {
1469   int org_node, clone_node, ret;
1470   unsigned int constraint = init_constraint;
1471   for (org_node = top_org_node, clone_node = top_clone_node;;)
1472     {
1473       int org_dest, clone_dest;
1474       if (dfa->nodes[org_node].type == OP_BACK_REF)
1475         {
1476           /* If the back reference epsilon-transit, its destination must
1477              also have the constraint.  Then duplicate the epsilon closure
1478              of the destination of the back reference, and store it in
1479              edests of the back reference.  */
1480           org_dest = dfa->nexts[org_node];
1481           re_node_set_empty (dfa->edests + clone_node);
1482           clone_dest = duplicate_node (dfa, org_dest, constraint);
1483           if (BE (clone_dest == -1, 0))
1484             return REG_ESPACE;
1485           dfa->nexts[clone_node] = dfa->nexts[org_node];
1486           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487           if (BE (ret < 0, 0))
1488             return REG_ESPACE;
1489         }
1490       else if (dfa->edests[org_node].nelem == 0)
1491         {
1492           /* In case of the node can't epsilon-transit, don't duplicate the
1493              destination and store the original destination as the
1494              destination of the node.  */
1495           dfa->nexts[clone_node] = dfa->nexts[org_node];
1496           break;
1497         }
1498       else if (dfa->edests[org_node].nelem == 1)
1499         {
1500           /* In case of the node can epsilon-transit, and it has only one
1501              destination.  */
1502           org_dest = dfa->edests[org_node].elems[0];
1503           re_node_set_empty (dfa->edests + clone_node);
1504           /* If the node is root_node itself, it means the epsilon clsoure
1505              has a loop.   Then tie it to the destination of the root_node.  */
1506           if (org_node == root_node && clone_node != org_node)
1507             {
1508               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1509               if (BE (ret < 0, 0))
1510                 return REG_ESPACE;
1511               break;
1512             }
1513           /* In case of the node has another constraint, add it.  */
1514           constraint |= dfa->nodes[org_node].constraint;
1515           clone_dest = duplicate_node (dfa, org_dest, constraint);
1516           if (BE (clone_dest == -1, 0))
1517             return REG_ESPACE;
1518           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519           if (BE (ret < 0, 0))
1520             return REG_ESPACE;
1521         }
1522       else /* dfa->edests[org_node].nelem == 2 */
1523         {
1524           /* In case of the node can epsilon-transit, and it has two
1525              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1526           org_dest = dfa->edests[org_node].elems[0];
1527           re_node_set_empty (dfa->edests + clone_node);
1528           /* Search for a duplicated node which satisfies the constraint.  */
1529           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530           if (clone_dest == -1)
1531             {
1532               /* There is no such duplicated node, create a new one.  */
1533               reg_errcode_t err;
1534               clone_dest = duplicate_node (dfa, org_dest, constraint);
1535               if (BE (clone_dest == -1, 0))
1536                 return REG_ESPACE;
1537               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538               if (BE (ret < 0, 0))
1539                 return REG_ESPACE;
1540               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1541                                             root_node, constraint);
1542               if (BE (err != REG_NOERROR, 0))
1543                 return err;
1544             }
1545           else
1546             {
1547               /* There is a duplicated node which satisfies the constraint,
1548                  use it to avoid infinite loop.  */
1549               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550               if (BE (ret < 0, 0))
1551                 return REG_ESPACE;
1552             }
1553
1554           org_dest = dfa->edests[org_node].elems[1];
1555           clone_dest = duplicate_node (dfa, org_dest, constraint);
1556           if (BE (clone_dest == -1, 0))
1557             return REG_ESPACE;
1558           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559           if (BE (ret < 0, 0))
1560             return REG_ESPACE;
1561         }
1562       org_node = org_dest;
1563       clone_node = clone_dest;
1564     }
1565   return REG_NOERROR;
1566 }
1567
1568 /* Search for a node which is duplicated from the node ORG_NODE, and
1569    satisfies the constraint CONSTRAINT.  */
1570
1571 static int
1572 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1573                         unsigned int constraint)
1574 {
1575   int idx;
1576   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1577     {
1578       if (org_node == dfa->org_indices[idx]
1579           && constraint == dfa->nodes[idx].constraint)
1580         return idx; /* Found.  */
1581     }
1582   return -1; /* Not found.  */
1583 }
1584
1585 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1586    Return the index of the new node, or -1 if insufficient storage is
1587    available.  */
1588
1589 static int
1590 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1591 {
1592   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1593   if (BE (dup_idx != -1, 1))
1594     {
1595       dfa->nodes[dup_idx].constraint = constraint;
1596       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1597       dfa->nodes[dup_idx].duplicated = 1;
1598
1599       /* Store the index of the original node.  */
1600       dfa->org_indices[dup_idx] = org_idx;
1601     }
1602   return dup_idx;
1603 }
1604
1605 static reg_errcode_t
1606 calc_inveclosure (re_dfa_t *dfa)
1607 {
1608   int src, idx, ret;
1609   for (idx = 0; idx < dfa->nodes_len; ++idx)
1610     re_node_set_init_empty (dfa->inveclosures + idx);
1611
1612   for (src = 0; src < dfa->nodes_len; ++src)
1613     {
1614       int *elems = dfa->eclosures[src].elems;
1615       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1616         {
1617           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1618           if (BE (ret == -1, 0))
1619             return REG_ESPACE;
1620         }
1621     }
1622
1623   return REG_NOERROR;
1624 }
1625
1626 /* Calculate "eclosure" for all the node in DFA.  */
1627
1628 static reg_errcode_t
1629 calc_eclosure (re_dfa_t *dfa)
1630 {
1631   int node_idx, incomplete;
1632 #ifdef DEBUG
1633   assert (dfa->nodes_len > 0);
1634 #endif
1635   incomplete = 0;
1636   /* For each nodes, calculate epsilon closure.  */
1637   for (node_idx = 0; ; ++node_idx)
1638     {
1639       reg_errcode_t err;
1640       re_node_set eclosure_elem;
1641       if (node_idx == dfa->nodes_len)
1642         {
1643           if (!incomplete)
1644             break;
1645           incomplete = 0;
1646           node_idx = 0;
1647         }
1648
1649 #ifdef DEBUG
1650       assert (dfa->eclosures[node_idx].nelem != -1);
1651 #endif
1652
1653       /* If we have already calculated, skip it.  */
1654       if (dfa->eclosures[node_idx].nelem != 0)
1655         continue;
1656       /* Calculate epsilon closure of `node_idx'.  */
1657       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1658       if (BE (err != REG_NOERROR, 0))
1659         return err;
1660
1661       if (dfa->eclosures[node_idx].nelem == 0)
1662         {
1663           incomplete = 1;
1664           re_node_set_free (&eclosure_elem);
1665         }
1666     }
1667   return REG_NOERROR;
1668 }
1669
1670 /* Calculate epsilon closure of NODE.  */
1671
1672 static reg_errcode_t
1673 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1674 {
1675   reg_errcode_t err;
1676   int i;
1677   re_node_set eclosure;
1678   int ret;
1679   int incomplete = 0;
1680   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1681   if (BE (err != REG_NOERROR, 0))
1682     return err;
1683
1684   /* This indicates that we are calculating this node now.
1685      We reference this value to avoid infinite loop.  */
1686   dfa->eclosures[node].nelem = -1;
1687
1688   /* If the current node has constraints, duplicate all nodes
1689      since they must inherit the constraints.  */
1690   if (dfa->nodes[node].constraint
1691       && dfa->edests[node].nelem
1692       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1693     {
1694       err = duplicate_node_closure (dfa, node, node, node,
1695                                     dfa->nodes[node].constraint);
1696       if (BE (err != REG_NOERROR, 0))
1697         return err;
1698     }
1699
1700   /* Expand each epsilon destination nodes.  */
1701   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1702     for (i = 0; i < dfa->edests[node].nelem; ++i)
1703       {
1704         re_node_set eclosure_elem;
1705         int edest = dfa->edests[node].elems[i];
1706         /* If calculating the epsilon closure of `edest' is in progress,
1707            return intermediate result.  */
1708         if (dfa->eclosures[edest].nelem == -1)
1709           {
1710             incomplete = 1;
1711             continue;
1712           }
1713         /* If we haven't calculated the epsilon closure of `edest' yet,
1714            calculate now. Otherwise use calculated epsilon closure.  */
1715         if (dfa->eclosures[edest].nelem == 0)
1716           {
1717             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1718             if (BE (err != REG_NOERROR, 0))
1719               return err;
1720           }
1721         else
1722           eclosure_elem = dfa->eclosures[edest];
1723         /* Merge the epsilon closure of `edest'.  */
1724         err = re_node_set_merge (&eclosure, &eclosure_elem);
1725         if (BE (err != REG_NOERROR, 0))
1726           return err;
1727         /* If the epsilon closure of `edest' is incomplete,
1728            the epsilon closure of this node is also incomplete.  */
1729         if (dfa->eclosures[edest].nelem == 0)
1730           {
1731             incomplete = 1;
1732             re_node_set_free (&eclosure_elem);
1733           }
1734       }
1735
1736   /* An epsilon closure includes itself.  */
1737   ret = re_node_set_insert (&eclosure, node);
1738   if (BE (ret < 0, 0))
1739     return REG_ESPACE;
1740   if (incomplete && !root)
1741     dfa->eclosures[node].nelem = 0;
1742   else
1743     dfa->eclosures[node] = eclosure;
1744   *new_set = eclosure;
1745   return REG_NOERROR;
1746 }
1747 \f
1748 /* Functions for token which are used in the parser.  */
1749
1750 /* Fetch a token from INPUT.
1751    We must not use this function inside bracket expressions.  */
1752
1753 static void
1754 internal_function
1755 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1756 {
1757   re_string_skip_bytes (input, peek_token (result, input, syntax));
1758 }
1759
1760 /* Peek a token from INPUT, and return the length of the token.
1761    We must not use this function inside bracket expressions.  */
1762
1763 static int
1764 internal_function
1765 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1766 {
1767   unsigned char c;
1768
1769   if (re_string_eoi (input))
1770     {
1771       token->type = END_OF_RE;
1772       return 0;
1773     }
1774
1775   c = re_string_peek_byte (input, 0);
1776   token->opr.c = c;
1777
1778   token->word_char = 0;
1779 #ifdef RE_ENABLE_I18N
1780   token->mb_partial = 0;
1781   if (input->mb_cur_max > 1 &&
1782       !re_string_first_byte (input, re_string_cur_idx (input)))
1783     {
1784       token->type = CHARACTER;
1785       token->mb_partial = 1;
1786       return 1;
1787     }
1788 #endif
1789   if (c == '\\')
1790     {
1791       unsigned char c2;
1792       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1793         {
1794           token->type = BACK_SLASH;
1795           return 1;
1796         }
1797
1798       c2 = re_string_peek_byte_case (input, 1);
1799       token->opr.c = c2;
1800       token->type = CHARACTER;
1801 #ifdef RE_ENABLE_I18N
1802       if (input->mb_cur_max > 1)
1803         {
1804           wint_t wc = re_string_wchar_at (input,
1805                                           re_string_cur_idx (input) + 1);
1806           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1807         }
1808       else
1809 #endif
1810         token->word_char = IS_WORD_CHAR (c2) != 0;
1811
1812       switch (c2)
1813         {
1814         case '|':
1815           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816             token->type = OP_ALT;
1817           break;
1818         case '1': case '2': case '3': case '4': case '5':
1819         case '6': case '7': case '8': case '9':
1820           if (!(syntax & RE_NO_BK_REFS))
1821             {
1822               token->type = OP_BACK_REF;
1823               token->opr.idx = c2 - '1';
1824             }
1825           break;
1826         case '<':
1827           if (!(syntax & RE_NO_GNU_OPS))
1828             {
1829               token->type = ANCHOR;
1830               token->opr.ctx_type = WORD_FIRST;
1831             }
1832           break;
1833         case '>':
1834           if (!(syntax & RE_NO_GNU_OPS))
1835             {
1836               token->type = ANCHOR;
1837               token->opr.ctx_type = WORD_LAST;
1838             }
1839           break;
1840         case 'b':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             {
1843               token->type = ANCHOR;
1844               token->opr.ctx_type = WORD_DELIM;
1845             }
1846           break;
1847         case 'B':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = NOT_WORD_DELIM;
1852             }
1853           break;
1854         case 'w':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             token->type = OP_WORD;
1857           break;
1858         case 'W':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             token->type = OP_NOTWORD;
1861           break;
1862         case 's':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_SPACE;
1865           break;
1866         case 'S':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_NOTSPACE;
1869           break;
1870         case '`':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             {
1873               token->type = ANCHOR;
1874               token->opr.ctx_type = BUF_FIRST;
1875             }
1876           break;
1877         case '\'':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = BUF_LAST;
1882             }
1883           break;
1884         case '(':
1885           if (!(syntax & RE_NO_BK_PARENS))
1886             token->type = OP_OPEN_SUBEXP;
1887           break;
1888         case ')':
1889           if (!(syntax & RE_NO_BK_PARENS))
1890             token->type = OP_CLOSE_SUBEXP;
1891           break;
1892         case '+':
1893           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894             token->type = OP_DUP_PLUS;
1895           break;
1896         case '?':
1897           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898             token->type = OP_DUP_QUESTION;
1899           break;
1900         case '{':
1901           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902             token->type = OP_OPEN_DUP_NUM;
1903           break;
1904         case '}':
1905           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906             token->type = OP_CLOSE_DUP_NUM;
1907           break;
1908         default:
1909           break;
1910         }
1911       return 2;
1912     }
1913
1914   token->type = CHARACTER;
1915 #ifdef RE_ENABLE_I18N
1916   if (input->mb_cur_max > 1)
1917     {
1918       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1920     }
1921   else
1922 #endif
1923     token->word_char = IS_WORD_CHAR (token->opr.c);
1924
1925   switch (c)
1926     {
1927     case '\n':
1928       if (syntax & RE_NEWLINE_ALT)
1929         token->type = OP_ALT;
1930       break;
1931     case '|':
1932       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1933         token->type = OP_ALT;
1934       break;
1935     case '*':
1936       token->type = OP_DUP_ASTERISK;
1937       break;
1938     case '+':
1939       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940         token->type = OP_DUP_PLUS;
1941       break;
1942     case '?':
1943       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944         token->type = OP_DUP_QUESTION;
1945       break;
1946     case '{':
1947       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948         token->type = OP_OPEN_DUP_NUM;
1949       break;
1950     case '}':
1951       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952         token->type = OP_CLOSE_DUP_NUM;
1953       break;
1954     case '(':
1955       if (syntax & RE_NO_BK_PARENS)
1956         token->type = OP_OPEN_SUBEXP;
1957       break;
1958     case ')':
1959       if (syntax & RE_NO_BK_PARENS)
1960         token->type = OP_CLOSE_SUBEXP;
1961       break;
1962     case '[':
1963       token->type = OP_OPEN_BRACKET;
1964       break;
1965     case '.':
1966       token->type = OP_PERIOD;
1967       break;
1968     case '^':
1969       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1970           re_string_cur_idx (input) != 0)
1971         {
1972           char prev = re_string_peek_byte (input, -1);
1973           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1974             break;
1975         }
1976       token->type = ANCHOR;
1977       token->opr.ctx_type = LINE_FIRST;
1978       break;
1979     case '$':
1980       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1981           re_string_cur_idx (input) + 1 != re_string_length (input))
1982         {
1983           re_token_t next;
1984           re_string_skip_bytes (input, 1);
1985           peek_token (&next, input, syntax);
1986           re_string_skip_bytes (input, -1);
1987           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988             break;
1989         }
1990       token->type = ANCHOR;
1991       token->opr.ctx_type = LINE_LAST;
1992       break;
1993     default:
1994       break;
1995     }
1996   return 1;
1997 }
1998
1999 /* Peek a token from INPUT, and return the length of the token.
2000    We must not use this function out of bracket expressions.  */
2001
2002 static int
2003 internal_function
2004 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2005 {
2006   unsigned char c;
2007   if (re_string_eoi (input))
2008     {
2009       token->type = END_OF_RE;
2010       return 0;
2011     }
2012   c = re_string_peek_byte (input, 0);
2013   token->opr.c = c;
2014
2015 #ifdef RE_ENABLE_I18N
2016   if (input->mb_cur_max > 1 &&
2017       !re_string_first_byte (input, re_string_cur_idx (input)))
2018     {
2019       token->type = CHARACTER;
2020       return 1;
2021     }
2022 #endif /* RE_ENABLE_I18N */
2023
2024   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2025       && re_string_cur_idx (input) + 1 < re_string_length (input))
2026     {
2027       /* In this case, '\' escape a character.  */
2028       unsigned char c2;
2029       re_string_skip_bytes (input, 1);
2030       c2 = re_string_peek_byte (input, 0);
2031       token->opr.c = c2;
2032       token->type = CHARACTER;
2033       return 1;
2034     }
2035   if (c == '[') /* '[' is a special char in a bracket exps.  */
2036     {
2037       unsigned char c2;
2038       int token_len;
2039       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2040         c2 = re_string_peek_byte (input, 1);
2041       else
2042         c2 = 0;
2043       token->opr.c = c2;
2044       token_len = 2;
2045       switch (c2)
2046         {
2047         case '.':
2048           token->type = OP_OPEN_COLL_ELEM;
2049           break;
2050         case '=':
2051           token->type = OP_OPEN_EQUIV_CLASS;
2052           break;
2053         case ':':
2054           if (syntax & RE_CHAR_CLASSES)
2055             {
2056               token->type = OP_OPEN_CHAR_CLASS;
2057               break;
2058             }
2059           /* else fall through.  */
2060         default:
2061           token->type = CHARACTER;
2062           token->opr.c = c;
2063           token_len = 1;
2064           break;
2065         }
2066       return token_len;
2067     }
2068   switch (c)
2069     {
2070     case '-':
2071       token->type = OP_CHARSET_RANGE;
2072       break;
2073     case ']':
2074       token->type = OP_CLOSE_BRACKET;
2075       break;
2076     case '^':
2077       token->type = OP_NON_MATCH_LIST;
2078       break;
2079     default:
2080       token->type = CHARACTER;
2081     }
2082   return 1;
2083 }
2084 \f
2085 /* Functions for parser.  */
2086
2087 /* Entry point of the parser.
2088    Parse the regular expression REGEXP and return the structure tree.
2089    If an error is occured, ERR is set by error code, and return NULL.
2090    This function build the following tree, from regular expression <reg_exp>:
2091            CAT
2092            / \
2093           /   \
2094    <reg_exp>  EOR
2095
2096    CAT means concatenation.
2097    EOR means end of regular expression.  */
2098
2099 static bin_tree_t *
2100 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2101        reg_errcode_t *err)
2102 {
2103   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2104   bin_tree_t *tree, *eor, *root;
2105   re_token_t current_token;
2106   dfa->syntax = syntax;
2107   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2108   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2109   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2110     return NULL;
2111   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2112   if (tree != NULL)
2113     root = create_tree (dfa, tree, eor, CONCAT);
2114   else
2115     root = eor;
2116   if (BE (eor == NULL || root == NULL, 0))
2117     {
2118       *err = REG_ESPACE;
2119       return NULL;
2120     }
2121   return root;
2122 }
2123
2124 /* This function build the following tree, from regular expression
2125    <branch1>|<branch2>:
2126            ALT
2127            / \
2128           /   \
2129    <branch1> <branch2>
2130
2131    ALT means alternative, which represents the operator `|'.  */
2132
2133 static bin_tree_t *
2134 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2135                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2136 {
2137   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138   bin_tree_t *tree, *branch = NULL;
2139   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2140   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141     return NULL;
2142
2143   while (token->type == OP_ALT)
2144     {
2145       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2146       if (token->type != OP_ALT && token->type != END_OF_RE
2147           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2148         {
2149           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2150           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2151             return NULL;
2152         }
2153       else
2154         branch = NULL;
2155       tree = create_tree (dfa, tree, branch, OP_ALT);
2156       if (BE (tree == NULL, 0))
2157         {
2158           *err = REG_ESPACE;
2159           return NULL;
2160         }
2161     }
2162   return tree;
2163 }
2164
2165 /* This function build the following tree, from regular expression
2166    <exp1><exp2>:
2167         CAT
2168         / \
2169        /   \
2170    <exp1> <exp2>
2171
2172    CAT means concatenation.  */
2173
2174 static bin_tree_t *
2175 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2177 {
2178   bin_tree_t *tree, *exp;
2179   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2180   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2181   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182     return NULL;
2183
2184   while (token->type != OP_ALT && token->type != END_OF_RE
2185          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2186     {
2187       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2188       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2189         {
2190           if (tree != NULL)
2191             postorder (tree, free_tree, NULL);
2192           return NULL;
2193         }
2194       if (tree != NULL && exp != NULL)
2195         {
2196           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2197           if (newtree == NULL)
2198             {
2199               postorder (exp, free_tree, NULL);
2200               postorder (tree, free_tree, NULL);
2201               *err = REG_ESPACE;
2202               return NULL;
2203             }
2204           tree = newtree;
2205         }
2206       else if (tree == NULL)
2207         tree = exp;
2208       /* Otherwise exp == NULL, we don't need to create new tree.  */
2209     }
2210   return tree;
2211 }
2212
2213 /* This function build the following tree, from regular expression a*:
2214          *
2215          |
2216          a
2217 */
2218
2219 static bin_tree_t *
2220 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2222 {
2223   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2224   bin_tree_t *tree;
2225   switch (token->type)
2226     {
2227     case CHARACTER:
2228       tree = create_token_tree (dfa, NULL, NULL, token);
2229       if (BE (tree == NULL, 0))
2230         {
2231           *err = REG_ESPACE;
2232           return NULL;
2233         }
2234 #ifdef RE_ENABLE_I18N
2235       if (dfa->mb_cur_max > 1)
2236         {
2237           while (!re_string_eoi (regexp)
2238                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2239             {
2240               bin_tree_t *mbc_remain;
2241               fetch_token (token, regexp, syntax);
2242               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2243               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2244               if (BE (mbc_remain == NULL || tree == NULL, 0))
2245                 {
2246                   *err = REG_ESPACE;
2247                   return NULL;
2248                 }
2249             }
2250         }
2251 #endif
2252       break;
2253     case OP_OPEN_SUBEXP:
2254       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2255       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2256         return NULL;
2257       break;
2258     case OP_OPEN_BRACKET:
2259       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2260       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2261         return NULL;
2262       break;
2263     case OP_BACK_REF:
2264       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2265         {
2266           *err = REG_ESUBREG;
2267           return NULL;
2268         }
2269       dfa->used_bkref_map |= 1 << token->opr.idx;
2270       tree = create_token_tree (dfa, NULL, NULL, token);
2271       if (BE (tree == NULL, 0))
2272         {
2273           *err = REG_ESPACE;
2274           return NULL;
2275         }
2276       ++dfa->nbackref;
2277       dfa->has_mb_node = 1;
2278       break;
2279     case OP_OPEN_DUP_NUM:
2280       if (syntax & RE_CONTEXT_INVALID_DUP)
2281         {
2282           *err = REG_BADRPT;
2283           return NULL;
2284         }
2285       /* FALLTHROUGH */
2286     case OP_DUP_ASTERISK:
2287     case OP_DUP_PLUS:
2288     case OP_DUP_QUESTION:
2289       if (syntax & RE_CONTEXT_INVALID_OPS)
2290         {
2291           *err = REG_BADRPT;
2292           return NULL;
2293         }
2294       else if (syntax & RE_CONTEXT_INDEP_OPS)
2295         {
2296           fetch_token (token, regexp, syntax);
2297           return parse_expression (regexp, preg, token, syntax, nest, err);
2298         }
2299       /* else fall through  */
2300     case OP_CLOSE_SUBEXP:
2301       if ((token->type == OP_CLOSE_SUBEXP) &&
2302           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2303         {
2304           *err = REG_ERPAREN;
2305           return NULL;
2306         }
2307       /* else fall through  */
2308     case OP_CLOSE_DUP_NUM:
2309       /* We treat it as a normal character.  */
2310
2311       /* Then we can these characters as normal characters.  */
2312       token->type = CHARACTER;
2313       /* mb_partial and word_char bits should be initialized already
2314          by peek_token.  */
2315       tree = create_token_tree (dfa, NULL, NULL, token);
2316       if (BE (tree == NULL, 0))
2317         {
2318           *err = REG_ESPACE;
2319           return NULL;
2320         }
2321       break;
2322     case ANCHOR:
2323       if ((token->opr.ctx_type
2324            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2325           && dfa->word_ops_used == 0)
2326         init_word_char (dfa);
2327       if (token->opr.ctx_type == WORD_DELIM
2328           || token->opr.ctx_type == NOT_WORD_DELIM)
2329         {
2330           bin_tree_t *tree_first, *tree_last;
2331           if (token->opr.ctx_type == WORD_DELIM)
2332             {
2333               token->opr.ctx_type = WORD_FIRST;
2334               tree_first = create_token_tree (dfa, NULL, NULL, token);
2335               token->opr.ctx_type = WORD_LAST;
2336             }
2337           else
2338             {
2339               token->opr.ctx_type = INSIDE_WORD;
2340               tree_first = create_token_tree (dfa, NULL, NULL, token);
2341               token->opr.ctx_type = INSIDE_NOTWORD;
2342             }
2343           tree_last = create_token_tree (dfa, NULL, NULL, token);
2344           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2345           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2346             {
2347               *err = REG_ESPACE;
2348               return NULL;
2349             }
2350         }
2351       else
2352         {
2353           tree = create_token_tree (dfa, NULL, NULL, token);
2354           if (BE (tree == NULL, 0))
2355             {
2356               *err = REG_ESPACE;
2357               return NULL;
2358             }
2359         }
2360       /* We must return here, since ANCHORs can't be followed
2361          by repetition operators.
2362          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2363              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2364       fetch_token (token, regexp, syntax);
2365       return tree;
2366     case OP_PERIOD:
2367       tree = create_token_tree (dfa, NULL, NULL, token);
2368       if (BE (tree == NULL, 0))
2369         {
2370           *err = REG_ESPACE;
2371           return NULL;
2372         }
2373       if (dfa->mb_cur_max > 1)
2374         dfa->has_mb_node = 1;
2375       break;
2376     case OP_WORD:
2377     case OP_NOTWORD:
2378       tree = build_charclass_op (dfa, regexp->trans,
2379                                  (const unsigned char *) "alnum",
2380                                  (const unsigned char *) "_",
2381                                  token->type == OP_NOTWORD, err);
2382       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2383         return NULL;
2384       break;
2385     case OP_SPACE:
2386     case OP_NOTSPACE:
2387       tree = build_charclass_op (dfa, regexp->trans,
2388                                  (const unsigned char *) "space",
2389                                  (const unsigned char *) "",
2390                                  token->type == OP_NOTSPACE, err);
2391       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392         return NULL;
2393       break;
2394     case OP_ALT:
2395     case END_OF_RE:
2396       return NULL;
2397     case BACK_SLASH:
2398       *err = REG_EESCAPE;
2399       return NULL;
2400     default:
2401       /* Must not happen?  */
2402 #ifdef DEBUG
2403       assert (0);
2404 #endif
2405       return NULL;
2406     }
2407   fetch_token (token, regexp, syntax);
2408
2409   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2410          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2411     {
2412       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2413       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2414         return NULL;
2415       /* In BRE consecutive duplications are not allowed.  */
2416       if ((syntax & RE_CONTEXT_INVALID_DUP)
2417           && (token->type == OP_DUP_ASTERISK
2418               || token->type == OP_OPEN_DUP_NUM))
2419         {
2420           *err = REG_BADRPT;
2421           return NULL;
2422         }
2423     }
2424
2425   return tree;
2426 }
2427
2428 /* This function build the following tree, from regular expression
2429    (<reg_exp>):
2430          SUBEXP
2431             |
2432         <reg_exp>
2433 */
2434
2435 static bin_tree_t *
2436 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2437                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2438 {
2439   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2440   bin_tree_t *tree;
2441   size_t cur_nsub;
2442   cur_nsub = preg->re_nsub++;
2443
2444   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2445
2446   /* The subexpression may be a null string.  */
2447   if (token->type == OP_CLOSE_SUBEXP)
2448     tree = NULL;
2449   else
2450     {
2451       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2452       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2453         {
2454           if (tree != NULL)
2455             postorder (tree, free_tree, NULL);
2456           *err = REG_EPAREN;
2457         }
2458       if (BE (*err != REG_NOERROR, 0))
2459         return NULL;
2460     }
2461
2462   if (cur_nsub <= '9' - '1')
2463     dfa->completed_bkref_map |= 1 << cur_nsub;
2464
2465   tree = create_tree (dfa, tree, NULL, SUBEXP);
2466   if (BE (tree == NULL, 0))
2467     {
2468       *err = REG_ESPACE;
2469       return NULL;
2470     }
2471   tree->token.opr.idx = cur_nsub;
2472   return tree;
2473 }
2474
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2476
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480 {
2481   bin_tree_t *tree = NULL, *old_tree = NULL;
2482   int i, start, end, start_idx = re_string_cur_idx (regexp);
2483   re_token_t start_token = *token;
2484
2485   if (token->type == OP_OPEN_DUP_NUM)
2486     {
2487       end = 0;
2488       start = fetch_number (regexp, token, syntax);
2489       if (start == -1)
2490         {
2491           if (token->type == CHARACTER && token->opr.c == ',')
2492             start = 0; /* We treat "{,m}" as "{0,m}".  */
2493           else
2494             {
2495               *err = REG_BADBR; /* <re>{} is invalid.  */
2496               return NULL;
2497             }
2498         }
2499       if (BE (start != -2, 1))
2500         {
2501           /* We treat "{n}" as "{n,n}".  */
2502           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503                  : ((token->type == CHARACTER && token->opr.c == ',')
2504                     ? fetch_number (regexp, token, syntax) : -2));
2505         }
2506       if (BE (start == -2 || end == -2, 0))
2507         {
2508           /* Invalid sequence.  */
2509           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510             {
2511               if (token->type == END_OF_RE)
2512                 *err = REG_EBRACE;
2513               else
2514                 *err = REG_BADBR;
2515
2516               return NULL;
2517             }
2518
2519           /* If the syntax bit is set, rollback.  */
2520           re_string_set_index (regexp, start_idx);
2521           *token = start_token;
2522           token->type = CHARACTER;
2523           /* mb_partial and word_char bits should be already initialized by
2524              peek_token.  */
2525           return elem;
2526         }
2527
2528       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2529         {
2530           /* First number greater than second.  */
2531           *err = REG_BADBR;
2532           return NULL;
2533         }
2534     }
2535   else
2536     {
2537       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2539     }
2540
2541   fetch_token (token, regexp, syntax);
2542
2543   if (BE (elem == NULL, 0))
2544     return NULL;
2545   if (BE (start == 0 && end == 0, 0))
2546     {
2547       postorder (elem, free_tree, NULL);
2548       return NULL;
2549     }
2550
2551   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2552   if (BE (start > 0, 0))
2553     {
2554       tree = elem;
2555       for (i = 2; i <= start; ++i)
2556         {
2557           elem = duplicate_tree (elem, dfa);
2558           tree = create_tree (dfa, tree, elem, CONCAT);
2559           if (BE (elem == NULL || tree == NULL, 0))
2560             goto parse_dup_op_espace;
2561         }
2562
2563       if (start == end)
2564         return tree;
2565
2566       /* Duplicate ELEM before it is marked optional.  */
2567       elem = duplicate_tree (elem, dfa);
2568       old_tree = tree;
2569     }
2570   else
2571     old_tree = NULL;
2572
2573   if (elem->token.type == SUBEXP)
2574     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575
2576   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2577   if (BE (tree == NULL, 0))
2578     goto parse_dup_op_espace;
2579
2580   /* This loop is actually executed only when end != -1,
2581      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2582      already created the start+1-th copy.  */
2583   for (i = start + 2; i <= end; ++i)
2584     {
2585       elem = duplicate_tree (elem, dfa);
2586       tree = create_tree (dfa, tree, elem, CONCAT);
2587       if (BE (elem == NULL || tree == NULL, 0))
2588         goto parse_dup_op_espace;
2589
2590       tree = create_tree (dfa, tree, NULL, OP_ALT);
2591       if (BE (tree == NULL, 0))
2592         goto parse_dup_op_espace;
2593     }
2594
2595   if (old_tree)
2596     tree = create_tree (dfa, old_tree, tree, CONCAT);
2597
2598   return tree;
2599
2600  parse_dup_op_espace:
2601   *err = REG_ESPACE;
2602   return NULL;
2603 }
2604
2605 /* Size of the names for collating symbol/equivalence_class/character_class.
2606    I'm not sure, but maybe enough.  */
2607 #define BRACKET_NAME_BUF_SIZE 32
2608
2609 #ifndef _LIBC
2610   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2611      Build the range expression which starts from START_ELEM, and ends
2612      at END_ELEM.  The result are written to MBCSET and SBCSET.
2613      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2614      mbcset->range_ends, is a pointer argument sinse we may
2615      update it.  */
2616
2617 static reg_errcode_t
2618 internal_function
2619 # ifdef RE_ENABLE_I18N
2620 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2621                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2622 # else /* not RE_ENABLE_I18N */
2623 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2624                  bracket_elem_t *end_elem)
2625 # endif /* not RE_ENABLE_I18N */
2626 {
2627   unsigned int start_ch, end_ch;
2628   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2629   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2630           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2631           0))
2632     return REG_ERANGE;
2633
2634   /* We can handle no multi character collating elements without libc
2635      support.  */
2636   if (BE ((start_elem->type == COLL_SYM
2637            && strlen ((char *) start_elem->opr.name) > 1)
2638           || (end_elem->type == COLL_SYM
2639               && strlen ((char *) end_elem->opr.name) > 1), 0))
2640     return REG_ECOLLATE;
2641
2642 # ifdef RE_ENABLE_I18N
2643   {
2644     wchar_t wc;
2645     wint_t start_wc;
2646     wint_t end_wc;
2647     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2648
2649     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2650                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2651                    : 0));
2652     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2653               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2654                  : 0));
2655     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2656                 ? __btowc (start_ch) : start_elem->opr.wch);
2657     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2658               ? __btowc (end_ch) : end_elem->opr.wch);
2659     if (start_wc == WEOF || end_wc == WEOF)
2660       return REG_ECOLLATE;
2661     cmp_buf[0] = start_wc;
2662     cmp_buf[4] = end_wc;
2663     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2664       return REG_ERANGE;
2665
2666     /* Got valid collation sequence values, add them as a new entry.
2667        However, for !_LIBC we have no collation elements: if the
2668        character set is single byte, the single byte character set
2669        that we build below suffices.  parse_bracket_exp passes
2670        no MBCSET if dfa->mb_cur_max == 1.  */
2671     if (mbcset)
2672       {
2673         /* Check the space of the arrays.  */
2674         if (BE (*range_alloc == mbcset->nranges, 0))
2675           {
2676             /* There is not enough space, need realloc.  */
2677             wchar_t *new_array_start, *new_array_end;
2678             int new_nranges;
2679
2680             /* +1 in case of mbcset->nranges is 0.  */
2681             new_nranges = 2 * mbcset->nranges + 1;
2682             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2683                are NULL if *range_alloc == 0.  */
2684             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2685                                           new_nranges);
2686             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2687                                         new_nranges);
2688
2689             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2690               return REG_ESPACE;
2691
2692             mbcset->range_starts = new_array_start;
2693             mbcset->range_ends = new_array_end;
2694             *range_alloc = new_nranges;
2695           }
2696
2697         mbcset->range_starts[mbcset->nranges] = start_wc;
2698         mbcset->range_ends[mbcset->nranges++] = end_wc;
2699       }
2700
2701     /* Build the table for single byte characters.  */
2702     for (wc = 0; wc < SBC_MAX; ++wc)
2703       {
2704         cmp_buf[2] = wc;
2705         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2706             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2707           bitset_set (sbcset, wc);
2708       }
2709   }
2710 # else /* not RE_ENABLE_I18N */
2711   {
2712     unsigned int ch;
2713     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2714                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2715                    : 0));
2716     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2717               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2718                  : 0));
2719     if (start_ch > end_ch)
2720       return REG_ERANGE;
2721     /* Build the table for single byte characters.  */
2722     for (ch = 0; ch < SBC_MAX; ++ch)
2723       if (start_ch <= ch  && ch <= end_ch)
2724         bitset_set (sbcset, ch);
2725   }
2726 # endif /* not RE_ENABLE_I18N */
2727   return REG_NOERROR;
2728 }
2729 #endif /* not _LIBC */
2730
2731 #ifndef _LIBC
2732 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2733    Build the collating element which is represented by NAME.
2734    The result are written to MBCSET and SBCSET.
2735    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2736    pointer argument since we may update it.  */
2737
2738 static reg_errcode_t
2739 internal_function
2740 # ifdef RE_ENABLE_I18N
2741 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2742                         int *coll_sym_alloc, const unsigned char *name)
2743 # else /* not RE_ENABLE_I18N */
2744 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2745 # endif /* not RE_ENABLE_I18N */
2746 {
2747   size_t name_len = strlen ((const char *) name);
2748   if (BE (name_len != 1, 0))
2749     return REG_ECOLLATE;
2750   else
2751     {
2752       bitset_set (sbcset, name[0]);
2753       return REG_NOERROR;
2754     }
2755 }
2756 #endif /* not _LIBC */
2757
2758 /* This function parse bracket expression like "[abc]", "[a-c]",
2759    "[[.a-a.]]" etc.  */
2760
2761 static bin_tree_t *
2762 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2763                    reg_syntax_t syntax, reg_errcode_t *err)
2764 {
2765 #ifdef _LIBC
2766   const unsigned char *collseqmb;
2767   const char *collseqwc;
2768   uint32_t nrules;
2769   int32_t table_size;
2770   const int32_t *symb_table;
2771   const unsigned char *extra;
2772
2773   /* Local function for parse_bracket_exp used in _LIBC environement.
2774      Seek the collating symbol entry correspondings to NAME.
2775      Return the index of the symbol in the SYMB_TABLE.  */
2776
2777   auto inline int32_t
2778   __attribute ((always_inline))
2779   seek_collating_symbol_entry (name, name_len)
2780          const unsigned char *name;
2781          size_t name_len;
2782     {
2783       int32_t hash = elem_hash ((const char *) name, name_len);
2784       int32_t elem = hash % table_size;
2785       if (symb_table[2 * elem] != 0)
2786         {
2787           int32_t second = hash % (table_size - 2) + 1;
2788
2789           do
2790             {
2791               /* First compare the hashing value.  */
2792               if (symb_table[2 * elem] == hash
2793                   /* Compare the length of the name.  */
2794                   && name_len == extra[symb_table[2 * elem + 1]]
2795                   /* Compare the name.  */
2796                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2797                              name_len) == 0)
2798                 {
2799                   /* Yep, this is the entry.  */
2800                   break;
2801                 }
2802
2803               /* Next entry.  */
2804               elem += second;
2805             }
2806           while (symb_table[2 * elem] != 0);
2807         }
2808       return elem;
2809     }
2810
2811   /* Local function for parse_bracket_exp used in _LIBC environment.
2812      Look up the collation sequence value of BR_ELEM.
2813      Return the value if succeeded, UINT_MAX otherwise.  */
2814
2815   auto inline unsigned int
2816   __attribute ((always_inline))
2817   lookup_collation_sequence_value (br_elem)
2818          bracket_elem_t *br_elem;
2819     {
2820       if (br_elem->type == SB_CHAR)
2821         {
2822           /*
2823           if (MB_CUR_MAX == 1)
2824           */
2825           if (nrules == 0)
2826             return collseqmb[br_elem->opr.ch];
2827           else
2828             {
2829               wint_t wc = __btowc (br_elem->opr.ch);
2830               return __collseq_table_lookup (collseqwc, wc);
2831             }
2832         }
2833       else if (br_elem->type == MB_CHAR)
2834         {
2835           if (nrules != 0)
2836             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2837         }
2838       else if (br_elem->type == COLL_SYM)
2839         {
2840           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2841           if (nrules != 0)
2842             {
2843               int32_t elem, idx;
2844               elem = seek_collating_symbol_entry (br_elem->opr.name,
2845                                                   sym_name_len);
2846               if (symb_table[2 * elem] != 0)
2847                 {
2848                   /* We found the entry.  */
2849                   idx = symb_table[2 * elem + 1];
2850                   /* Skip the name of collating element name.  */
2851                   idx += 1 + extra[idx];
2852                   /* Skip the byte sequence of the collating element.  */
2853                   idx += 1 + extra[idx];
2854                   /* Adjust for the alignment.  */
2855                   idx = (idx + 3) & ~3;
2856                   /* Skip the multibyte collation sequence value.  */
2857                   idx += sizeof (unsigned int);
2858                   /* Skip the wide char sequence of the collating element.  */
2859                   idx += sizeof (unsigned int) *
2860                     (1 + *(unsigned int *) (extra + idx));
2861                   /* Return the collation sequence value.  */
2862                   return *(unsigned int *) (extra + idx);
2863                 }
2864               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2865                 {
2866                   /* No valid character.  Match it as a single byte
2867                      character.  */
2868                   return collseqmb[br_elem->opr.name[0]];
2869                 }
2870             }
2871           else if (sym_name_len == 1)
2872             return collseqmb[br_elem->opr.name[0]];
2873         }
2874       return UINT_MAX;
2875     }
2876
2877   /* Local function for parse_bracket_exp used in _LIBC environement.
2878      Build the range expression which starts from START_ELEM, and ends
2879      at END_ELEM.  The result are written to MBCSET and SBCSET.
2880      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2881      mbcset->range_ends, is a pointer argument sinse we may
2882      update it.  */
2883
2884   auto inline reg_errcode_t
2885   __attribute ((always_inline))
2886   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2887          re_charset_t *mbcset;
2888          int *range_alloc;
2889          bitset_t sbcset;
2890          bracket_elem_t *start_elem, *end_elem;
2891     {
2892       unsigned int ch;
2893       uint32_t start_collseq;
2894       uint32_t end_collseq;
2895
2896       /* Equivalence Classes and Character Classes can't be a range
2897          start/end.  */
2898       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2899               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2900               0))
2901         return REG_ERANGE;
2902
2903       start_collseq = lookup_collation_sequence_value (start_elem);
2904       end_collseq = lookup_collation_sequence_value (end_elem);
2905       /* Check start/end collation sequence values.  */
2906       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2907         return REG_ECOLLATE;
2908       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2909         return REG_ERANGE;
2910
2911       /* Got valid collation sequence values, add them as a new entry.
2912          However, if we have no collation elements, and the character set
2913          is single byte, the single byte character set that we
2914          build below suffices. */
2915       if (nrules > 0 || dfa->mb_cur_max > 1)
2916         {
2917           /* Check the space of the arrays.  */
2918           if (BE (*range_alloc == mbcset->nranges, 0))
2919             {
2920               /* There is not enough space, need realloc.  */
2921               uint32_t *new_array_start;
2922               uint32_t *new_array_end;
2923               int new_nranges;
2924
2925               /* +1 in case of mbcset->nranges is 0.  */
2926               new_nranges = 2 * mbcset->nranges + 1;
2927               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2928                                             new_nranges);
2929               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2930                                           new_nranges);
2931
2932               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2933                 return REG_ESPACE;
2934
2935               mbcset->range_starts = new_array_start;
2936               mbcset->range_ends = new_array_end;
2937               *range_alloc = new_nranges;
2938             }
2939
2940           mbcset->range_starts[mbcset->nranges] = start_collseq;
2941           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2942         }
2943
2944       /* Build the table for single byte characters.  */
2945       for (ch = 0; ch < SBC_MAX; ch++)
2946         {
2947           uint32_t ch_collseq;
2948           /*
2949           if (MB_CUR_MAX == 1)
2950           */
2951           if (nrules == 0)
2952             ch_collseq = collseqmb[ch];
2953           else
2954             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2955           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2956             bitset_set (sbcset, ch);
2957         }
2958       return REG_NOERROR;
2959     }
2960
2961   /* Local function for parse_bracket_exp used in _LIBC environement.
2962      Build the collating element which is represented by NAME.
2963      The result are written to MBCSET and SBCSET.
2964      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2965      pointer argument sinse we may update it.  */
2966
2967   auto inline reg_errcode_t
2968   __attribute ((always_inline))
2969   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2970          re_charset_t *mbcset;
2971          int *coll_sym_alloc;
2972          bitset_t sbcset;
2973          const unsigned char *name;
2974     {
2975       int32_t elem, idx;
2976       size_t name_len = strlen ((const char *) name);
2977       if (nrules != 0)
2978         {
2979           elem = seek_collating_symbol_entry (name, name_len);
2980           if (symb_table[2 * elem] != 0)
2981             {
2982               /* We found the entry.  */
2983               idx = symb_table[2 * elem + 1];
2984               /* Skip the name of collating element name.  */
2985               idx += 1 + extra[idx];
2986             }
2987           else if (symb_table[2 * elem] == 0 && name_len == 1)
2988             {
2989               /* No valid character, treat it as a normal
2990                  character.  */
2991               bitset_set (sbcset, name[0]);
2992               return REG_NOERROR;
2993             }
2994           else
2995             return REG_ECOLLATE;
2996
2997           /* Got valid collation sequence, add it as a new entry.  */
2998           /* Check the space of the arrays.  */
2999           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3000             {
3001               /* Not enough, realloc it.  */
3002               /* +1 in case of mbcset->ncoll_syms is 0.  */
3003               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3004               /* Use realloc since mbcset->coll_syms is NULL
3005                  if *alloc == 0.  */
3006               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3007                                                    new_coll_sym_alloc);
3008               if (BE (new_coll_syms == NULL, 0))
3009                 return REG_ESPACE;
3010               mbcset->coll_syms = new_coll_syms;
3011               *coll_sym_alloc = new_coll_sym_alloc;
3012             }
3013           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3014           return REG_NOERROR;
3015         }
3016       else
3017         {
3018           if (BE (name_len != 1, 0))
3019             return REG_ECOLLATE;
3020           else
3021             {
3022               bitset_set (sbcset, name[0]);
3023               return REG_NOERROR;
3024             }
3025         }
3026     }
3027 #endif
3028
3029   re_token_t br_token;
3030   re_bitset_ptr_t sbcset;
3031 #ifdef RE_ENABLE_I18N
3032   re_charset_t *mbcset;
3033   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3034   int equiv_class_alloc = 0, char_class_alloc = 0;
3035 #endif /* not RE_ENABLE_I18N */
3036   int non_match = 0;
3037   bin_tree_t *work_tree;
3038   int token_len;
3039   int first_round = 1;
3040 #ifdef _LIBC
3041   collseqmb = (const unsigned char *)
3042     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3043   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3044   if (nrules)
3045     {
3046       /*
3047       if (MB_CUR_MAX > 1)
3048       */
3049       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3050       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3051       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3052                                                   _NL_COLLATE_SYMB_TABLEMB);
3053       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3054                                                    _NL_COLLATE_SYMB_EXTRAMB);
3055     }
3056 #endif
3057   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3058 #ifdef RE_ENABLE_I18N
3059   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060 #endif /* RE_ENABLE_I18N */
3061 #ifdef RE_ENABLE_I18N
3062   if (BE (sbcset == NULL || mbcset == NULL, 0))
3063 #else
3064   if (BE (sbcset == NULL, 0))
3065 #endif /* RE_ENABLE_I18N */
3066     {
3067       re_free (sbcset);
3068 #ifdef RE_ENABLE_I18N
3069       re_free (mbcset);
3070 #endif
3071       *err = REG_ESPACE;
3072       return NULL;
3073     }
3074
3075   token_len = peek_token_bracket (token, regexp, syntax);
3076   if (BE (token->type == END_OF_RE, 0))
3077     {
3078       *err = REG_BADPAT;
3079       goto parse_bracket_exp_free_return;
3080     }
3081   if (token->type == OP_NON_MATCH_LIST)
3082     {
3083 #ifdef RE_ENABLE_I18N
3084       mbcset->non_match = 1;
3085 #endif /* not RE_ENABLE_I18N */
3086       non_match = 1;
3087       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3088         bitset_set (sbcset, '\n');
3089       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3090       token_len = peek_token_bracket (token, regexp, syntax);
3091       if (BE (token->type == END_OF_RE, 0))
3092         {
3093           *err = REG_BADPAT;
3094           goto parse_bracket_exp_free_return;
3095         }
3096     }
3097
3098   /* We treat the first ']' as a normal character.  */
3099   if (token->type == OP_CLOSE_BRACKET)
3100     token->type = CHARACTER;
3101
3102   while (1)
3103     {
3104       bracket_elem_t start_elem, end_elem;
3105       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3106       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3107       reg_errcode_t ret;
3108       int token_len2 = 0, is_range_exp = 0;
3109       re_token_t token2;
3110
3111       start_elem.opr.name = start_name_buf;
3112       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3113                                    syntax, first_round);
3114       if (BE (ret != REG_NOERROR, 0))
3115         {
3116           *err = ret;
3117           goto parse_bracket_exp_free_return;
3118         }
3119       first_round = 0;
3120
3121       /* Get information about the next token.  We need it in any case.  */
3122       token_len = peek_token_bracket (token, regexp, syntax);
3123
3124       /* Do not check for ranges if we know they are not allowed.  */
3125       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3126         {
3127           if (BE (token->type == END_OF_RE, 0))
3128             {
3129               *err = REG_EBRACK;
3130               goto parse_bracket_exp_free_return;
3131             }
3132           if (token->type == OP_CHARSET_RANGE)
3133             {
3134               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3135               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3136               if (BE (token2.type == END_OF_RE, 0))
3137                 {
3138                   *err = REG_EBRACK;
3139                   goto parse_bracket_exp_free_return;
3140                 }
3141               if (token2.type == OP_CLOSE_BRACKET)
3142                 {
3143                   /* We treat the last '-' as a normal character.  */
3144                   re_string_skip_bytes (regexp, -token_len);
3145                   token->type = CHARACTER;
3146                 }
3147               else
3148                 is_range_exp = 1;
3149             }
3150         }
3151
3152       if (is_range_exp == 1)
3153         {
3154           end_elem.opr.name = end_name_buf;
3155           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3156                                        dfa, syntax, 1);
3157           if (BE (ret != REG_NOERROR, 0))
3158             {
3159               *err = ret;
3160               goto parse_bracket_exp_free_return;
3161             }
3162
3163           token_len = peek_token_bracket (token, regexp, syntax);
3164
3165 #ifdef _LIBC
3166           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3167                                   &start_elem, &end_elem);
3168 #else
3169 # ifdef RE_ENABLE_I18N
3170           *err = build_range_exp (sbcset,
3171                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3172                                   &range_alloc, &start_elem, &end_elem);
3173 # else
3174           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3175 # endif
3176 #endif /* RE_ENABLE_I18N */
3177           if (BE (*err != REG_NOERROR, 0))
3178             goto parse_bracket_exp_free_return;
3179         }
3180       else
3181         {
3182           switch (start_elem.type)
3183             {
3184             case SB_CHAR:
3185               bitset_set (sbcset, start_elem.opr.ch);
3186               break;
3187 #ifdef RE_ENABLE_I18N
3188             case MB_CHAR:
3189               /* Check whether the array has enough space.  */
3190               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3191                 {
3192                   wchar_t *new_mbchars;
3193                   /* Not enough, realloc it.  */
3194                   /* +1 in case of mbcset->nmbchars is 0.  */
3195                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3196                   /* Use realloc since array is NULL if *alloc == 0.  */
3197                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3198                                             mbchar_alloc);
3199                   if (BE (new_mbchars == NULL, 0))
3200                     goto parse_bracket_exp_espace;
3201                   mbcset->mbchars = new_mbchars;
3202                 }
3203               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3204               break;
3205 #endif /* RE_ENABLE_I18N */
3206             case EQUIV_CLASS:
3207               *err = build_equiv_class (sbcset,
3208 #ifdef RE_ENABLE_I18N
3209                                         mbcset, &equiv_class_alloc,
3210 #endif /* RE_ENABLE_I18N */
3211                                         start_elem.opr.name);
3212               if (BE (*err != REG_NOERROR, 0))
3213                 goto parse_bracket_exp_free_return;
3214               break;
3215             case COLL_SYM:
3216               *err = build_collating_symbol (sbcset,
3217 #ifdef RE_ENABLE_I18N
3218                                              mbcset, &coll_sym_alloc,
3219 #endif /* RE_ENABLE_I18N */
3220                                              start_elem.opr.name);
3221               if (BE (*err != REG_NOERROR, 0))
3222                 goto parse_bracket_exp_free_return;
3223               break;
3224             case CHAR_CLASS:
3225               *err = build_charclass (regexp->trans, sbcset,
3226 #ifdef RE_ENABLE_I18N
3227                                       mbcset, &char_class_alloc,
3228 #endif /* RE_ENABLE_I18N */
3229                                       start_elem.opr.name, syntax);
3230               if (BE (*err != REG_NOERROR, 0))
3231                goto parse_bracket_exp_free_return;
3232               break;
3233             default:
3234               assert (0);
3235               break;
3236             }
3237         }
3238       if (BE (token->type == END_OF_RE, 0))
3239         {
3240           *err = REG_EBRACK;
3241           goto parse_bracket_exp_free_return;
3242         }
3243       if (token->type == OP_CLOSE_BRACKET)
3244         break;
3245     }
3246
3247   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3248
3249   /* If it is non-matching list.  */
3250   if (non_match)
3251     bitset_not (sbcset);
3252
3253 #ifdef RE_ENABLE_I18N
3254   /* Ensure only single byte characters are set.  */
3255   if (dfa->mb_cur_max > 1)
3256     bitset_mask (sbcset, dfa->sb_char);
3257
3258   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3259       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3260                                                      || mbcset->non_match)))
3261     {
3262       bin_tree_t *mbc_tree;
3263       int sbc_idx;
3264       /* Build a tree for complex bracket.  */
3265       dfa->has_mb_node = 1;
3266       br_token.type = COMPLEX_BRACKET;
3267       br_token.opr.mbcset = mbcset;
3268       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269       if (BE (mbc_tree == NULL, 0))
3270         goto parse_bracket_exp_espace;
3271       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3272         if (sbcset[sbc_idx])
3273           break;
3274       /* If there are no bits set in sbcset, there is no point
3275          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3276       if (sbc_idx < BITSET_WORDS)
3277         {
3278           /* Build a tree for simple bracket.  */
3279           br_token.type = SIMPLE_BRACKET;
3280           br_token.opr.sbcset = sbcset;
3281           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282           if (BE (work_tree == NULL, 0))
3283             goto parse_bracket_exp_espace;
3284
3285           /* Then join them by ALT node.  */
3286           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3287           if (BE (work_tree == NULL, 0))
3288             goto parse_bracket_exp_espace;
3289         }
3290       else
3291         {
3292           re_free (sbcset);
3293           work_tree = mbc_tree;
3294         }
3295     }
3296   else
3297 #endif /* not RE_ENABLE_I18N */
3298     {
3299 #ifdef RE_ENABLE_I18N
3300       free_charset (mbcset);
3301 #endif
3302       /* Build a tree for simple bracket.  */
3303       br_token.type = SIMPLE_BRACKET;
3304       br_token.opr.sbcset = sbcset;
3305       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3306       if (BE (work_tree == NULL, 0))
3307         goto parse_bracket_exp_espace;
3308     }
3309   return work_tree;
3310
3311  parse_bracket_exp_espace:
3312   *err = REG_ESPACE;
3313  parse_bracket_exp_free_return:
3314   re_free (sbcset);
3315 #ifdef RE_ENABLE_I18N
3316   free_charset (mbcset);
3317 #endif /* RE_ENABLE_I18N */
3318   return NULL;
3319 }
3320
3321 /* Parse an element in the bracket expression.  */
3322
3323 static reg_errcode_t
3324 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3325                        re_token_t *token, int token_len, re_dfa_t *dfa,
3326                        reg_syntax_t syntax, int accept_hyphen)
3327 {
3328 #ifdef RE_ENABLE_I18N
3329   int cur_char_size;
3330   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3331   if (cur_char_size > 1)
3332     {
3333       elem->type = MB_CHAR;
3334       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3335       re_string_skip_bytes (regexp, cur_char_size);
3336       return REG_NOERROR;
3337     }
3338 #endif /* RE_ENABLE_I18N */
3339   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3340   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3341       || token->type == OP_OPEN_EQUIV_CLASS)
3342     return parse_bracket_symbol (elem, regexp, token);
3343   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3344     {
3345       /* A '-' must only appear as anything but a range indicator before
3346          the closing bracket.  Everything else is an error.  */
3347       re_token_t token2;
3348       (void) peek_token_bracket (&token2, regexp, syntax);
3349       if (token2.type != OP_CLOSE_BRACKET)
3350         /* The actual error value is not standardized since this whole
3351            case is undefined.  But ERANGE makes good sense.  */
3352         return REG_ERANGE;
3353     }
3354   elem->type = SB_CHAR;
3355   elem->opr.ch = token->opr.c;
3356   return REG_NOERROR;
3357 }
3358
3359 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3360    such as [:<character_class>:], [.<collating_element>.], and
3361    [=<equivalent_class>=].  */
3362
3363 static reg_errcode_t
3364 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3365                       re_token_t *token)
3366 {
3367   unsigned char ch, delim = token->opr.c;
3368   int i = 0;
3369   if (re_string_eoi(regexp))
3370     return REG_EBRACK;
3371   for (;; ++i)
3372     {
3373       if (i >= BRACKET_NAME_BUF_SIZE)
3374         return REG_EBRACK;
3375       if (token->type == OP_OPEN_CHAR_CLASS)
3376         ch = re_string_fetch_byte_case (regexp);
3377       else
3378         ch = re_string_fetch_byte (regexp);
3379       if (re_string_eoi(regexp))
3380         return REG_EBRACK;
3381       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3382         break;
3383       elem->opr.name[i] = ch;
3384     }
3385   re_string_skip_bytes (regexp, 1);
3386   elem->opr.name[i] = '\0';
3387   switch (token->type)
3388     {
3389     case OP_OPEN_COLL_ELEM:
3390       elem->type = COLL_SYM;
3391       break;
3392     case OP_OPEN_EQUIV_CLASS:
3393       elem->type = EQUIV_CLASS;
3394       break;
3395     case OP_OPEN_CHAR_CLASS:
3396       elem->type = CHAR_CLASS;
3397       break;
3398     default:
3399       break;
3400     }
3401   return REG_NOERROR;
3402 }
3403
3404   /* Helper function for parse_bracket_exp.
3405      Build the equivalence class which is represented by NAME.
3406      The result are written to MBCSET and SBCSET.
3407      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3408      is a pointer argument sinse we may update it.  */
3409
3410 static reg_errcode_t
3411 #ifdef RE_ENABLE_I18N
3412 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3413                    int *equiv_class_alloc, const unsigned char *name)
3414 #else /* not RE_ENABLE_I18N */
3415 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3416 #endif /* not RE_ENABLE_I18N */
3417 {
3418 #ifdef _LIBC
3419   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3420   if (nrules != 0)
3421     {
3422       const int32_t *table, *indirect;
3423       const unsigned char *weights, *extra, *cp;
3424       unsigned char char_buf[2];
3425       int32_t idx1, idx2;
3426       unsigned int ch;
3427       size_t len;
3428       /* This #include defines a local function!  */
3429 # include <locale/weight.h>
3430       /* Calculate the index for equivalence class.  */
3431       cp = name;
3432       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3433       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3434                                                _NL_COLLATE_WEIGHTMB);
3435       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3436                                                    _NL_COLLATE_EXTRAMB);
3437       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3438                                                 _NL_COLLATE_INDIRECTMB);
3439       idx1 = findidx (&cp, -1);
3440       if (BE (idx1 == 0 || *cp != '\0', 0))
3441         /* This isn't a valid character.  */
3442         return REG_ECOLLATE;
3443
3444       /* Build single byte matcing table for this equivalence class.  */
3445       len = weights[idx1 & 0xffffff];
3446       for (ch = 0; ch < SBC_MAX; ++ch)
3447         {
3448           char_buf[0] = ch;
3449           cp = char_buf;
3450           idx2 = findidx (&cp, 1);
3451 /*
3452           idx2 = table[ch];
3453 */
3454           if (idx2 == 0)
3455             /* This isn't a valid character.  */
3456             continue;
3457           /* Compare only if the length matches and the collation rule
3458              index is the same.  */
3459           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3460             {
3461               int cnt = 0;
3462
3463               while (cnt <= len &&
3464                      weights[(idx1 & 0xffffff) + 1 + cnt]
3465                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3466                 ++cnt;
3467
3468               if (cnt > len)
3469                 bitset_set (sbcset, ch);
3470             }
3471         }
3472       /* Check whether the array has enough space.  */
3473       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3474         {
3475           /* Not enough, realloc it.  */
3476           /* +1 in case of mbcset->nequiv_classes is 0.  */
3477           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3478           /* Use realloc since the array is NULL if *alloc == 0.  */
3479           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3480                                                    int32_t,
3481                                                    new_equiv_class_alloc);
3482           if (BE (new_equiv_classes == NULL, 0))
3483             return REG_ESPACE;
3484           mbcset->equiv_classes = new_equiv_classes;
3485           *equiv_class_alloc = new_equiv_class_alloc;
3486         }
3487       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3488     }
3489   else
3490 #endif /* _LIBC */
3491     {
3492       if (BE (strlen ((const char *) name) != 1, 0))
3493         return REG_ECOLLATE;
3494       bitset_set (sbcset, *name);
3495     }
3496   return REG_NOERROR;
3497 }
3498
3499   /* Helper function for parse_bracket_exp.
3500      Build the character class which is represented by NAME.
3501      The result are written to MBCSET and SBCSET.
3502      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3503      is a pointer argument sinse we may update it.  */
3504
3505 static reg_errcode_t
3506 #ifdef RE_ENABLE_I18N
3507 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3508                  re_charset_t *mbcset, int *char_class_alloc,
3509                  const unsigned char *class_name, reg_syntax_t syntax)
3510 #else /* not RE_ENABLE_I18N */
3511 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512                  const unsigned char *class_name, reg_syntax_t syntax)
3513 #endif /* not RE_ENABLE_I18N */
3514 {
3515   int i;
3516   const char *name = (const char *) class_name;
3517
3518   /* In case of REG_ICASE "upper" and "lower" match the both of
3519      upper and lower cases.  */
3520   if ((syntax & RE_ICASE)
3521       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3522     name = "alpha";
3523
3524 #ifdef RE_ENABLE_I18N
3525   /* Check the space of the arrays.  */
3526   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3527     {
3528       /* Not enough, realloc it.  */
3529       /* +1 in case of mbcset->nchar_classes is 0.  */
3530       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3531       /* Use realloc since array is NULL if *alloc == 0.  */
3532       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3533                                                new_char_class_alloc);
3534       if (BE (new_char_classes == NULL, 0))
3535         return REG_ESPACE;
3536       mbcset->char_classes = new_char_classes;
3537       *char_class_alloc = new_char_class_alloc;
3538     }
3539   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3540 #endif /* RE_ENABLE_I18N */
3541
3542 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3543   do {                                          \
3544     if (BE (trans != NULL, 0))                  \
3545       {                                         \
3546         for (i = 0; i < SBC_MAX; ++i)           \
3547           if (ctype_func (i))                   \
3548             bitset_set (sbcset, trans[i]);      \
3549       }                                         \
3550     else                                        \
3551       {                                         \
3552         for (i = 0; i < SBC_MAX; ++i)           \
3553           if (ctype_func (i))                   \
3554             bitset_set (sbcset, i);             \
3555       }                                         \
3556   } while (0)
3557
3558   if (strcmp (name, "alnum") == 0)
3559     BUILD_CHARCLASS_LOOP (isalnum);
3560   else if (strcmp (name, "cntrl") == 0)
3561     BUILD_CHARCLASS_LOOP (iscntrl);
3562   else if (strcmp (name, "lower") == 0)
3563     BUILD_CHARCLASS_LOOP (islower);
3564   else if (strcmp (name, "space") == 0)
3565     BUILD_CHARCLASS_LOOP (isspace);
3566   else if (strcmp (name, "alpha") == 0)
3567     BUILD_CHARCLASS_LOOP (isalpha);
3568   else if (strcmp (name, "digit") == 0)
3569     BUILD_CHARCLASS_LOOP (isdigit);
3570   else if (strcmp (name, "print") == 0)
3571     BUILD_CHARCLASS_LOOP (isprint);
3572   else if (strcmp (name, "upper") == 0)
3573     BUILD_CHARCLASS_LOOP (isupper);
3574   else if (strcmp (name, "blank") == 0)
3575     BUILD_CHARCLASS_LOOP (isblank);
3576   else if (strcmp (name, "graph") == 0)
3577     BUILD_CHARCLASS_LOOP (isgraph);
3578   else if (strcmp (name, "punct") == 0)
3579     BUILD_CHARCLASS_LOOP (ispunct);
3580   else if (strcmp (name, "xdigit") == 0)
3581     BUILD_CHARCLASS_LOOP (isxdigit);
3582   else
3583     return REG_ECTYPE;
3584
3585   return REG_NOERROR;
3586 }
3587
3588 static bin_tree_t *
3589 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3590                     const unsigned char *class_name,
3591                     const unsigned char *extra, int non_match,
3592                     reg_errcode_t *err)
3593 {
3594   re_bitset_ptr_t sbcset;
3595 #ifdef RE_ENABLE_I18N
3596   re_charset_t *mbcset;
3597   int alloc = 0;
3598 #endif /* not RE_ENABLE_I18N */
3599   reg_errcode_t ret;
3600   re_token_t br_token;
3601   bin_tree_t *tree;
3602
3603   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3604 #ifdef RE_ENABLE_I18N
3605   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3606 #endif /* RE_ENABLE_I18N */
3607
3608 #ifdef RE_ENABLE_I18N
3609   if (BE (sbcset == NULL || mbcset == NULL, 0))
3610 #else /* not RE_ENABLE_I18N */
3611   if (BE (sbcset == NULL, 0))
3612 #endif /* not RE_ENABLE_I18N */
3613     {
3614       *err = REG_ESPACE;
3615       return NULL;
3616     }
3617
3618   if (non_match)
3619     {
3620 #ifdef RE_ENABLE_I18N
3621       mbcset->non_match = 1;
3622 #endif /* not RE_ENABLE_I18N */
3623     }
3624
3625   /* We don't care the syntax in this case.  */
3626   ret = build_charclass (trans, sbcset,
3627 #ifdef RE_ENABLE_I18N
3628                          mbcset, &alloc,
3629 #endif /* RE_ENABLE_I18N */
3630                          class_name, 0);
3631
3632   if (BE (ret != REG_NOERROR, 0))
3633     {
3634       re_free (sbcset);
3635 #ifdef RE_ENABLE_I18N
3636       free_charset (mbcset);
3637 #endif /* RE_ENABLE_I18N */
3638       *err = ret;
3639       return NULL;
3640     }
3641   /* \w match '_' also.  */
3642   for (; *extra; extra++)
3643     bitset_set (sbcset, *extra);
3644
3645   /* If it is non-matching list.  */
3646   if (non_match)
3647     bitset_not (sbcset);
3648
3649 #ifdef RE_ENABLE_I18N
3650   /* Ensure only single byte characters are set.  */
3651   if (dfa->mb_cur_max > 1)
3652     bitset_mask (sbcset, dfa->sb_char);
3653 #endif
3654
3655   /* Build a tree for simple bracket.  */
3656   br_token.type = SIMPLE_BRACKET;
3657   br_token.opr.sbcset = sbcset;
3658   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659   if (BE (tree == NULL, 0))
3660     goto build_word_op_espace;
3661
3662 #ifdef RE_ENABLE_I18N
3663   if (dfa->mb_cur_max > 1)
3664     {
3665       bin_tree_t *mbc_tree;
3666       /* Build a tree for complex bracket.  */
3667       br_token.type = COMPLEX_BRACKET;
3668       br_token.opr.mbcset = mbcset;
3669       dfa->has_mb_node = 1;
3670       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671       if (BE (mbc_tree == NULL, 0))
3672         goto build_word_op_espace;
3673       /* Then join them by ALT node.  */
3674       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3675       if (BE (mbc_tree != NULL, 1))
3676         return tree;
3677     }
3678   else
3679     {
3680       free_charset (mbcset);
3681       return tree;
3682     }
3683 #else /* not RE_ENABLE_I18N */
3684   return tree;
3685 #endif /* not RE_ENABLE_I18N */
3686
3687  build_word_op_espace:
3688   re_free (sbcset);
3689 #ifdef RE_ENABLE_I18N
3690   free_charset (mbcset);
3691 #endif /* RE_ENABLE_I18N */
3692   *err = REG_ESPACE;
3693   return NULL;
3694 }
3695
3696 /* This is intended for the expressions like "a{1,3}".
3697    Fetch a number from `input', and return the number.
3698    Return -1, if the number field is empty like "{,1}".
3699    Return -2, If an error is occured.  */
3700
3701 static int
3702 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3703 {
3704   int num = -1;
3705   unsigned char c;
3706   while (1)
3707     {
3708       fetch_token (token, input, syntax);
3709       c = token->opr.c;
3710       if (BE (token->type == END_OF_RE, 0))
3711         return -2;
3712       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3713         break;
3714       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3715              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3716       num = (num > RE_DUP_MAX) ? -2 : num;
3717     }
3718   return num;
3719 }
3720 \f
3721 #ifdef RE_ENABLE_I18N
3722 static void
3723 free_charset (re_charset_t *cset)
3724 {
3725   re_free (cset->mbchars);
3726 # ifdef _LIBC
3727   re_free (cset->coll_syms);
3728   re_free (cset->equiv_classes);
3729   re_free (cset->range_starts);
3730   re_free (cset->range_ends);
3731 # endif
3732   re_free (cset->char_classes);
3733   re_free (cset);
3734 }
3735 #endif /* RE_ENABLE_I18N */
3736 \f
3737 /* Functions for binary tree operation.  */
3738
3739 /* Create a tree node.  */
3740
3741 static bin_tree_t *
3742 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743              re_token_type_t type)
3744 {
3745   re_token_t t;
3746   t.type = type;
3747   return create_token_tree (dfa, left, right, &t);
3748 }
3749
3750 static bin_tree_t *
3751 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3752                    const re_token_t *token)
3753 {
3754   bin_tree_t *tree;
3755   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3756     {
3757       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3758
3759       if (storage == NULL)
3760         return NULL;
3761       storage->next = dfa->str_tree_storage;
3762       dfa->str_tree_storage = storage;
3763       dfa->str_tree_storage_idx = 0;
3764     }
3765   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3766
3767   tree->parent = NULL;
3768   tree->left = left;
3769   tree->right = right;
3770   tree->token = *token;
3771   tree->token.duplicated = 0;
3772   tree->token.opt_subexp = 0;
3773   tree->first = NULL;
3774   tree->next = NULL;
3775   tree->node_idx = -1;
3776
3777   if (left != NULL)
3778     left->parent = tree;
3779   if (right != NULL)
3780     right->parent = tree;
3781   return tree;
3782 }
3783
3784 /* Mark the tree SRC as an optional subexpression.
3785    To be called from preorder or postorder.  */
3786
3787 static reg_errcode_t
3788 mark_opt_subexp (void *extra, bin_tree_t *node)
3789 {
3790   int idx = (int) (long) extra;
3791   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3792     node->token.opt_subexp = 1;
3793
3794   return REG_NOERROR;
3795 }
3796
3797 /* Free the allocated memory inside NODE. */
3798
3799 static void
3800 free_token (re_token_t *node)
3801 {
3802 #ifdef RE_ENABLE_I18N
3803   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3804     free_charset (node->opr.mbcset);
3805   else
3806 #endif /* RE_ENABLE_I18N */
3807     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3808       re_free (node->opr.sbcset);
3809 }
3810
3811 /* Worker function for tree walking.  Free the allocated memory inside NODE
3812    and its children. */
3813
3814 static reg_errcode_t
3815 free_tree (void *extra, bin_tree_t *node)
3816 {
3817   free_token (&node->token);
3818   return REG_NOERROR;
3819 }
3820
3821
3822 /* Duplicate the node SRC, and return new node.  This is a preorder
3823    visit similar to the one implemented by the generic visitor, but
3824    we need more infrastructure to maintain two parallel trees --- so,
3825    it's easier to duplicate.  */
3826
3827 static bin_tree_t *
3828 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3829 {
3830   const bin_tree_t *node;
3831   bin_tree_t *dup_root;
3832   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3833
3834   for (node = root; ; )
3835     {
3836       /* Create a new tree and link it back to the current parent.  */
3837       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3838       if (*p_new == NULL)
3839         return NULL;
3840       (*p_new)->parent = dup_node;
3841       (*p_new)->token.duplicated = 1;
3842       dup_node = *p_new;
3843
3844       /* Go to the left node, or up and to the right.  */
3845       if (node->left)
3846         {
3847           node = node->left;
3848           p_new = &dup_node->left;
3849         }
3850       else
3851         {
3852           const bin_tree_t *prev = NULL;
3853           while (node->right == prev || node->right == NULL)
3854             {
3855               prev = node;
3856               node = node->parent;
3857               dup_node = dup_node->parent;
3858               if (!node)
3859                 return dup_root;
3860             }
3861           node = node->right;
3862           p_new = &dup_node->right;
3863         }
3864     }
3865 }