posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  52                                              int top_clone_node, int root_node,
  53                                              unsigned int constraint);
  54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  55                                      unsigned int constraint);
  56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  57                                    unsigned int constraint);
  58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  60                                          int node, int root);
  61 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  62 static int fetch_number (re_string_t *input, re_token_t *token,
  63                          reg_syntax_t syntax);
  64 static void fetch_token (re_token_t *result, re_string_t *input,
  65                          reg_syntax_t syntax);
  66 static int peek_token (re_token_t *token, re_string_t *input,
  67                         reg_syntax_t syntax);
  68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  69                                reg_syntax_t syntax);
  70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  71                           reg_syntax_t syntax, reg_errcode_t *err);
  72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  76                                  re_token_t *token, reg_syntax_t syntax,
  77                                  int nest, reg_errcode_t *err);
  78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  79                                      re_token_t *token, reg_syntax_t syntax,
  80                                      int nest, reg_errcode_t *err);
  81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  82                                   re_token_t *token, reg_syntax_t syntax,
  83                                   int nest, reg_errcode_t *err);
  84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  85                                  re_dfa_t *dfa, re_token_t *token,
  86                                  reg_syntax_t syntax, reg_errcode_t *err);
  87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  88                                       re_token_t *token, reg_syntax_t syntax,
  89                                       reg_errcode_t *err);
  90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  91                                             re_string_t *regexp,
  92                                             re_token_t *token, int token_len,
  93                                             re_dfa_t *dfa,
  94                                             reg_syntax_t syntax,
  95                                             int accept_hyphen);
  96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  97                                           re_string_t *regexp,
  98                                           re_token_t *token);
  99 #ifndef _LIBC
 100 # ifdef RE_ENABLE_I18N
 101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 102                                       re_charset_t *mbcset, int *range_alloc,
 103                                       bracket_elem_t *start_elem,
 104                                       bracket_elem_t *end_elem);
 105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 106                                              re_charset_t *mbcset,
 107                                              int *coll_sym_alloc,
 108                                              const unsigned char *name);
 109 # else /* not RE_ENABLE_I18N */
 110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 111                                       bracket_elem_t *start_elem,
 112                                       bracket_elem_t *end_elem);
 113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 114                                              const unsigned char *name);
 115 # endif /* not RE_ENABLE_I18N */
 116 #endif /* not _LIBC */
 117 #ifdef RE_ENABLE_I18N
 118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 119                                         re_charset_t *mbcset,
 120                                         int *equiv_class_alloc,
 121                                         const unsigned char *name);
 122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 123                                       re_bitset_ptr_t sbcset,
 124                                       re_charset_t *mbcset,
 125                                       int *char_class_alloc,
 126                                       const unsigned char *class_name,
 127                                       reg_syntax_t syntax);
 128 #else  /* not RE_ENABLE_I18N */
 129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 130                                         const unsigned char *name);
 131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 132                                       re_bitset_ptr_t sbcset,
 133                                       const unsigned char *class_name,
 134                                       reg_syntax_t syntax);
 135 #endif /* not RE_ENABLE_I18N */
 136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 137                                        unsigned RE_TRANSLATE_TYPE trans,
 138                                        const unsigned char *class_name,
 139                                        const unsigned char *extra,
 140                                        int non_match, reg_errcode_t *err);
 141 static bin_tree_t *create_tree (re_dfa_t *dfa,
 142                                 bin_tree_t *left, bin_tree_t *right,
 143                                 re_token_type_t type);
 144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 145                                       bin_tree_t *left, bin_tree_t *right,
 146                                       const re_token_t *token);
 147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 148 static void free_token (re_token_t *node);
 149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 151 \f
 152 /* This table gives an error message for each of the error codes listed
 153    in regex.h.  Obviously the order here has to be same as there.
 154    POSIX doesn't require that we do anything for REG_NOERROR,
 155    but why not be nice?  */
 156
 157 const char __re_error_msgid[] attribute_hidden =
 158   {
 159 #define REG_NOERROR_IDX 0
 160     gettext_noop ("Success")    /* REG_NOERROR */
 161     "\0"
 162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 163     gettext_noop ("No match")   /* REG_NOMATCH */
 164     "\0"
 165 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 166     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 167     "\0"
 168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 169     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 170     "\0"
 171 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 172     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 173     "\0"
 174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 175     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 176     "\0"
 177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 178     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 179     "\0"
 180 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 181     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 182     "\0"
 183 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 184     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 185     "\0"
 186 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 187     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 188     "\0"
 189 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 190     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 191     "\0"
 192 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 193     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 194     "\0"
 195 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 196     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 197     "\0"
 198 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 199     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 200     "\0"
 201 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 202     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 203     "\0"
 204 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 205     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 206     "\0"
 207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 208     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 209   };
 210
 211 const size_t __re_error_msgid_idx[] attribute_hidden =
 212   {
 213     REG_NOERROR_IDX,
 214     REG_NOMATCH_IDX,
 215     REG_BADPAT_IDX,
 216     REG_ECOLLATE_IDX,
 217     REG_ECTYPE_IDX,
 218     REG_EESCAPE_IDX,
 219     REG_ESUBREG_IDX,
 220     REG_EBRACK_IDX,
 221     REG_EPAREN_IDX,
 222     REG_EBRACE_IDX,
 223     REG_BADBR_IDX,
 224     REG_ERANGE_IDX,
 225     REG_ESPACE_IDX,
 226     REG_BADRPT_IDX,
 227     REG_EEND_IDX,
 228     REG_ESIZE_IDX,
 229     REG_ERPAREN_IDX
 230   };
 231 \f
 232 /* Entry points for GNU code.  */
 233
 234 /* re_compile_pattern is the GNU regular expression compiler: it
 235    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 236    Returns 0 if the pattern was valid, otherwise an error string.
 237
 238    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 239    are set in BUFP on entry.  */
 240
 241 const char *
 242 re_compile_pattern (pattern, length, bufp)
 243     const char *pattern;
 244     size_t length;
 245     struct re_pattern_buffer *bufp;
 246 {
 247   reg_errcode_t ret;
 248
 249   /* And GNU code determines whether or not to get register information
 250      by passing null for the REGS argument to re_match, etc., not by
 251      setting no_sub, unless RE_NO_SUB is set.  */
 252   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 253
 254   /* Match anchors at newline.  */
 255   bufp->newline_anchor = 1;
 256
 257   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 258
 259   if (!ret)
 260     return NULL;
 261   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_compile_pattern, re_compile_pattern)
 265 #endif
 266
 267 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 268    also be assigned to arbitrarily: each pattern buffer stores its own
 269    syntax, so it can be changed between regex compilations.  */
 270 /* This has no initializer because initialized variables in Emacs
 271    become read-only after dumping.  */
 272 reg_syntax_t re_syntax_options;
 273
 274
 275 /* Specify the precise syntax of regexps for compilation.  This provides
 276    for compatibility for various utilities which historically have
 277    different, incompatible syntaxes.
 278
 279    The argument SYNTAX is a bit mask comprised of the various bits
 280    defined in regex.h.  We return the old syntax.  */
 281
 282 reg_syntax_t
 283 re_set_syntax (syntax)
 284     reg_syntax_t syntax;
 285 {
 286   reg_syntax_t ret = re_syntax_options;
 287
 288   re_syntax_options = syntax;
 289   return ret;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_set_syntax, re_set_syntax)
 293 #endif
 294
 295 int
 296 re_compile_fastmap (bufp)
 297     struct re_pattern_buffer *bufp;
 298 {
 299   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 300   char *fastmap = bufp->fastmap;
 301
 302   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 303   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 304   if (dfa->init_state != dfa->init_state_word)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 306   if (dfa->init_state != dfa->init_state_nl)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 308   if (dfa->init_state != dfa->init_state_begbuf)
 309     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 310   bufp->fastmap_accurate = 1;
 311   return 0;
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 315 #endif
 316
 317 static inline void
 318 __attribute ((always_inline))
 319 re_set_fastmap (char *fastmap, int icase, int ch)
 320 {
 321   fastmap[ch] = 1;
 322   if (icase)
 323     fastmap[tolower (ch)] = 1;
 324 }
 325
 326 /* Helper function for re_compile_fastmap.
 327    Compile fastmap for the initial_state INIT_STATE.  */
 328
 329 static void
 330 re_compile_fastmap_iter (bufp, init_state, fastmap)
 331      regex_t *bufp;
 332      const re_dfastate_t *init_state;
 333      char *fastmap;
 334 {
 335   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 336   int node_cnt;
 337   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 338   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 339     {
 340       int node = init_state->nodes.elems[node_cnt];
 341       re_token_type_t type = dfa->nodes[node].type;
 342
 343       if (type == CHARACTER)
 344         {
 345           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 346 #ifdef RE_ENABLE_I18N
 347           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 348             {
 349               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 350               wchar_t wc;
 351               mbstate_t state;
 352
 353               p = buf;
 354               *p++ = dfa->nodes[node].opr.c;
 355               while (++node < dfa->nodes_len
 356                      && dfa->nodes[node].type == CHARACTER
 357                      && dfa->nodes[node].mb_partial)
 358                 *p++ = dfa->nodes[node].opr.c;
 359               memset (&state, 0, sizeof (state));
 360               if (mbrtowc (&wc, (const char *) buf, p - buf,
 361                            &state) == p - buf
 362                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 363                 re_set_fastmap (fastmap, 0, buf[0]);
 364             }
 365 #endif
 366         }
 367       else if (type == SIMPLE_BRACKET)
 368         {
 369           int i, j, ch;
 370           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 371             for (j = 0; j < UINT_BITS; ++j, ++ch)
 372               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 373                 re_set_fastmap (fastmap, icase, ch);
 374         }
 375 #ifdef RE_ENABLE_I18N
 376       else if (type == COMPLEX_BRACKET)
 377         {
 378           int i;
 379           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 381               || cset->nranges || cset->nchar_classes)
 382             {
 383 # ifdef _LIBC
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 385                 {
 386                   /* In this case we want to catch the bytes which are
 387                      the first byte of any collation elements.
 388                      e.g. In da_DK, we want to catch 'a' since "aa"
 389                           is a valid collation element, and don't catch
 390                           'b' since 'b' is the only collation element
 391                           which starts from 'b'.  */
 392                   int j, ch;
 393                   const int32_t *table = (const int32_t *)
 394                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 395                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 396                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 397                       if (table[ch] < 0)
 398                         re_set_fastmap (fastmap, icase, ch);
 399                 }
 400 # else
 401               if (dfa->mb_cur_max > 1)
 402                 for (i = 0; i < SBC_MAX; ++i)
 403                   if (__btowc (i) == WEOF)
 404                     re_set_fastmap (fastmap, icase, i);
 405 # endif /* not _LIBC */
 406             }
 407           for (i = 0; i < cset->nmbchars; ++i)
 408             {
 409               char buf[256];
 410               mbstate_t state;
 411               memset (&state, '\0', sizeof (state));
 412               __wcrtomb (buf, cset->mbchars[i], &state);
 413               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                 {
 416                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 417                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      `buffer' to the compiled pattern;
 443      `used' to the length of the compiled pattern;
 444      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      `fastmap' to an allocated space for the fastmap;
 449      `fastmap_accurate' to zero;
 450      `re_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *__restrict preg;
 475     const char *__restrict pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *preg;
 538     char *errbuf;
 539     size_t errbuf_size;
 540 {
 541   const char *msg;
 542   size_t msg_size;
 543
 544   if (BE (errcode < 0
 545           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 546                                / sizeof (__re_error_msgid_idx[0])), 0))
 547     /* Only error codes returned by the rest of the code should be passed
 548        to this routine.  If we are given anything else, or if other regex
 549        code generates an invalid error code, then the program has a bug.
 550        Dump core so we can fix it.  */
 551     abort ();
 552
 553   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 554
 555   msg_size = strlen (msg) + 1; /* Includes the null.  */
 556
 557   if (BE (errbuf_size != 0, 1))
 558     {
 559       if (BE (msg_size > errbuf_size, 0))
 560         {
 561 #if defined HAVE_MEMPCPY || defined _LIBC
 562           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 563 #else
 564           memcpy (errbuf, msg, errbuf_size - 1);
 565           errbuf[errbuf_size - 1] = 0;
 566 #endif
 567         }
 568       else
 569         memcpy (errbuf, msg, msg_size);
 570     }
 571
 572   return msg_size;
 573 }
 574 #ifdef _LIBC
 575 weak_alias (__regerror, regerror)
 576 #endif
 577
 578
 579 #ifdef RE_ENABLE_I18N
 580 /* This static array is used for the map to single-byte characters when
 581    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 582    it the same all the time.  UTF-8 is the preferred encoding so this is
 583    a worthwhile optimization.  */
 584 static const bitset utf8_sb_map =
 585 {
 586   /* Set the first 128 bits.  */
 587 # if UINT_MAX == 0xffffffff
 588   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 589 # else
 590 #  error "Add case for new unsigned int size"
 591 # endif
 592 };
 593 #endif
 594
 595
 596 static void
 597 free_dfa_content (re_dfa_t *dfa)
 598 {
 599   int i, j;
 600
 601   if (dfa->nodes)
 602     for (i = 0; i < dfa->nodes_len; ++i)
 603       free_token (dfa->nodes + i);
 604   re_free (dfa->nexts);
 605   for (i = 0; i < dfa->nodes_len; ++i)
 606     {
 607       if (dfa->eclosures != NULL)
 608         re_node_set_free (dfa->eclosures + i);
 609       if (dfa->inveclosures != NULL)
 610         re_node_set_free (dfa->inveclosures + i);
 611       if (dfa->edests != NULL)
 612         re_node_set_free (dfa->edests + i);
 613     }
 614   re_free (dfa->edests);
 615   re_free (dfa->eclosures);
 616   re_free (dfa->inveclosures);
 617   re_free (dfa->nodes);
 618
 619   if (dfa->state_table)
 620     for (i = 0; i <= dfa->state_hash_mask; ++i)
 621       {
 622         struct re_state_table_entry *entry = dfa->state_table + i;
 623         for (j = 0; j < entry->num; ++j)
 624           {
 625             re_dfastate_t *state = entry->array[j];
 626             free_state (state);
 627           }
 628         re_free (entry->array);
 629       }
 630   re_free (dfa->state_table);
 631 #ifdef RE_ENABLE_I18N
 632   if (dfa->sb_char != utf8_sb_map)
 633     re_free (dfa->sb_char);
 634 #endif
 635   re_free (dfa->subexp_map);
 636 #ifdef DEBUG
 637   re_free (dfa->re_str);
 638 #endif
 639
 640   re_free (dfa);
 641 }
 642
 643
 644 /* Free dynamically allocated space used by PREG.  */
 645
 646 void
 647 regfree (preg)
 648     regex_t *preg;
 649 {
 650   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651   if (BE (dfa != NULL, 1))
 652     free_dfa_content (dfa);
 653   preg->buffer = NULL;
 654   preg->allocated = 0;
 655
 656   re_free (preg->fastmap);
 657   preg->fastmap = NULL;
 658
 659   re_free (preg->translate);
 660   preg->translate = NULL;
 661 }
 662 #ifdef _LIBC
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (s)
 682      const char *s;
 683 {
 684   reg_errcode_t ret;
 685   char *fastmap;
 686
 687   if (!s)
 688     {
 689       if (!re_comp_buf.buffer)
 690         return gettext ("No previous regular expression");
 691       return 0;
 692     }
 693
 694   if (re_comp_buf.buffer)
 695     {
 696       fastmap = re_comp_buf.fastmap;
 697       re_comp_buf.fastmap = NULL;
 698       __regfree (&re_comp_buf);
 699       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700       re_comp_buf.fastmap = fastmap;
 701     }
 702
 703   if (re_comp_buf.fastmap == NULL)
 704     {
 705       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706       if (re_comp_buf.fastmap == NULL)
 707         return (char *) gettext (__re_error_msgid
 708                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 709     }
 710
 711   /* Since `re_exec' always passes NULL for the `regs' argument, we
 712      don't need to initialize the pattern buffer fields which affect it.  */
 713
 714   /* Match anchors at newlines.  */
 715   re_comp_buf.newline_anchor = 1;
 716
 717   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719   if (!ret)
 720     return NULL;
 721
 722   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724 }
 725
 726 #ifdef _LIBC
 727 libc_freeres_fn (free_mem)
 728 {
 729   __regfree (&re_comp_buf);
 730 }
 731 #endif
 732
 733 #endif /* _REGEX_RE_COMP */
 734 \f
 735 /* Internal entry point.
 736    Compile the regular expression PATTERN, whose length is LENGTH.
 737    SYNTAX indicate regular expression's syntax.  */
 738
 739 static reg_errcode_t
 740 re_compile_internal (preg, pattern, length, syntax)
 741      regex_t *preg;
 742      const char * pattern;
 743      int length;
 744      reg_syntax_t syntax;
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   dfa->re_str = re_malloc (char, length + 1);
 785   strncpy (dfa->re_str, pattern, length + 1);
 786 #endif
 787
 788   err = re_string_construct (&regexp, pattern, length, preg->translate,
 789                              syntax & RE_ICASE, dfa);
 790   if (BE (err != REG_NOERROR, 0))
 791     {
 792     re_compile_internal_free_return:
 793       free_workarea_compile (preg);
 794       re_string_destruct (&regexp);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807   /* Analyze the tree and create the nfa.  */
 808   err = analyze (preg);
 809   if (BE (err != REG_NOERROR, 0))
 810     goto re_compile_internal_free_return;
 811
 812 #ifdef RE_ENABLE_I18N
 813   /* If possible, do searching in single byte encoding to speed things up.  */
 814   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 815     optimize_utf8 (dfa);
 816 #endif
 817
 818   /* Then create the initial state of the dfa.  */
 819   err = create_initial_state (dfa);
 820
 821   /* Release work areas.  */
 822   free_workarea_compile (preg);
 823   re_string_destruct (&regexp);
 824
 825   if (BE (err != REG_NOERROR, 0))
 826     {
 827       free_dfa_content (dfa);
 828       preg->buffer = NULL;
 829       preg->allocated = 0;
 830     }
 831
 832   return err;
 833 }
 834
 835 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 836    as the initial length of some arrays.  */
 837
 838 static reg_errcode_t
 839 init_dfa (dfa, pat_len)
 840      re_dfa_t *dfa;
 841      int pat_len;
 842 {
 843   int table_size;
 844 #ifndef _LIBC
 845   char *codeset_name;
 846 #endif
 847
 848   memset (dfa, '\0', sizeof (re_dfa_t));
 849
 850   /* Force allocation of str_tree_storage the first time.  */
 851   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 852
 853   dfa->nodes_alloc = pat_len + 1;
 854   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 855
 856   dfa->states_alloc = pat_len + 1;
 857
 858   /*  table_size = 2 ^ ceil(log pat_len) */
 859   for (table_size = 1; table_size > 0; table_size <<= 1)
 860     if (table_size > pat_len)
 861       break;
 862
 863   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 864   dfa->state_hash_mask = table_size - 1;
 865
 866   dfa->mb_cur_max = MB_CUR_MAX;
 867 #ifdef _LIBC
 868   if (dfa->mb_cur_max == 6
 869       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 870     dfa->is_utf8 = 1;
 871   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 872                        != 0);
 873 #else
 874 # ifdef HAVE_LANGINFO_CODESET
 875   codeset_name = nl_langinfo (CODESET);
 876 # else
 877   codeset_name = getenv ("LC_ALL");
 878   if (codeset_name == NULL || codeset_name[0] == '\0')
 879     codeset_name = getenv ("LC_CTYPE");
 880   if (codeset_name == NULL || codeset_name[0] == '\0')
 881     codeset_name = getenv ("LANG");
 882   if (codeset_name == NULL)
 883     codeset_name = "";
 884   else if (strchr (codeset_name, '.') !=  NULL)
 885     codeset_name = strchr (codeset_name, '.') + 1;
 886 # endif
 887
 888   if (strcasecmp (codeset_name, "UTF-8") == 0
 889       || strcasecmp (codeset_name, "UTF8") == 0)
 890     dfa->is_utf8 = 1;
 891
 892   /* We check exhaustively in the loop below if this charset is a
 893      superset of ASCII.  */
 894   dfa->map_notascii = 0;
 895 #endif
 896
 897 #ifdef RE_ENABLE_I18N
 898   if (dfa->mb_cur_max > 1)
 899     {
 900       if (dfa->is_utf8)
 901         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 902       else
 903         {
 904           int i, j, ch;
 905
 906           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 907           if (BE (dfa->sb_char == NULL, 0))
 908             return REG_ESPACE;
 909
 910           /* Clear all bits by, then set those corresponding to single
 911              byte chars.  */
 912           bitset_empty (dfa->sb_char);
 913
 914           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 915             for (j = 0; j < UINT_BITS; ++j, ++ch)
 916               {
 917                 wchar_t wch = __btowc (ch);
 918                 if (wch != WEOF)
 919                   dfa->sb_char[i] |= 1 << j;
 920 # ifndef _LIBC
 921                 if (isascii (ch) && wch != (wchar_t) ch)
 922                   dfa->map_notascii = 1;
 923 # endif
 924               }
 925         }
 926     }
 927 #endif
 928
 929   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 930     return REG_ESPACE;
 931   return REG_NOERROR;
 932 }
 933
 934 /* Initialize WORD_CHAR table, which indicate which character is
 935    "word".  In this case "word" means that it is the word construction
 936    character used by some operators like "\<", "\>", etc.  */
 937
 938 static void
 939 init_word_char (dfa)
 940      re_dfa_t *dfa;
 941 {
 942   int i, j, ch;
 943   dfa->word_ops_used = 1;
 944   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 945     for (j = 0; j < UINT_BITS; ++j, ++ch)
 946       if (isalnum (ch) || ch == '_')
 947         dfa->word_char[i] |= 1 << j;
 948 }
 949
 950 /* Free the work area which are only used while compiling.  */
 951
 952 static void
 953 free_workarea_compile (preg)
 954      regex_t *preg;
 955 {
 956   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 957   bin_tree_storage_t *storage, *next;
 958   for (storage = dfa->str_tree_storage; storage; storage = next)
 959     {
 960       next = storage->next;
 961       re_free (storage);
 962     }
 963   dfa->str_tree_storage = NULL;
 964   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 965   dfa->str_tree = NULL;
 966   re_free (dfa->org_indices);
 967   dfa->org_indices = NULL;
 968 }
 969
 970 /* Create initial states for all contexts.  */
 971
 972 static reg_errcode_t
 973 create_initial_state (dfa)
 974      re_dfa_t *dfa;
 975 {
 976   int first, i;
 977   reg_errcode_t err;
 978   re_node_set init_nodes;
 979
 980   /* Initial states have the epsilon closure of the node which is
 981      the first node of the regular expression.  */
 982   first = dfa->str_tree->first->node_idx;
 983   dfa->init_node = first;
 984   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 985   if (BE (err != REG_NOERROR, 0))
 986     return err;
 987
 988   /* The back-references which are in initial states can epsilon transit,
 989      since in this case all of the subexpressions can be null.
 990      Then we add epsilon closures of the nodes which are the next nodes of
 991      the back-references.  */
 992   if (dfa->nbackref > 0)
 993     for (i = 0; i < init_nodes.nelem; ++i)
 994       {
 995         int node_idx = init_nodes.elems[i];
 996         re_token_type_t type = dfa->nodes[node_idx].type;
 997
 998         int clexp_idx;
 999         if (type != OP_BACK_REF)
1000           continue;
1001         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1002           {
1003             re_token_t *clexp_node;
1004             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005             if (clexp_node->type == OP_CLOSE_SUBEXP
1006                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007               break;
1008           }
1009         if (clexp_idx == init_nodes.nelem)
1010           continue;
1011
1012         if (type == OP_BACK_REF)
1013           {
1014             int dest_idx = dfa->edests[node_idx].elems[0];
1015             if (!re_node_set_contains (&init_nodes, dest_idx))
1016               {
1017                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018                 i = 0;
1019               }
1020           }
1021       }
1022
1023   /* It must be the first time to invoke acquire_state.  */
1024   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025   /* We don't check ERR here, since the initial state must not be NULL.  */
1026   if (BE (dfa->init_state == NULL, 0))
1027     return err;
1028   if (dfa->init_state->has_constraint)
1029     {
1030       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031                                                        CONTEXT_WORD);
1032       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033                                                      CONTEXT_NEWLINE);
1034       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035                                                          &init_nodes,
1036                                                          CONTEXT_NEWLINE
1037                                                          | CONTEXT_BEGBUF);
1038       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039               || dfa->init_state_begbuf == NULL, 0))
1040         return err;
1041     }
1042   else
1043     dfa->init_state_word = dfa->init_state_nl
1044       = dfa->init_state_begbuf = dfa->init_state;
1045
1046   re_node_set_free (&init_nodes);
1047   return REG_NOERROR;
1048 }
1049 \f
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053    DFA nodes where needed.  */
1054
1055 static void
1056 optimize_utf8 (dfa)
1057      re_dfa_t *dfa;
1058 {
1059   int node, i, mb_chars = 0, has_period = 0;
1060
1061   for (node = 0; node < dfa->nodes_len; ++node)
1062     switch (dfa->nodes[node].type)
1063       {
1064       case CHARACTER:
1065         if (dfa->nodes[node].opr.c >= 0x80)
1066           mb_chars = 1;
1067         break;
1068       case ANCHOR:
1069         switch (dfa->nodes[node].opr.idx)
1070           {
1071           case LINE_FIRST:
1072           case LINE_LAST:
1073           case BUF_FIRST:
1074           case BUF_LAST:
1075             break;
1076           default:
1077             /* Word anchors etc. cannot be handled.  */
1078             return;
1079           }
1080         break;
1081       case OP_PERIOD:
1082         has_period = 1;
1083         break;
1084       case OP_BACK_REF:
1085       case OP_ALT:
1086       case END_OF_RE:
1087       case OP_DUP_ASTERISK:
1088       case OP_OPEN_SUBEXP:
1089       case OP_CLOSE_SUBEXP:
1090         break;
1091       case COMPLEX_BRACKET:
1092         return;
1093       case SIMPLE_BRACKET:
1094         /* Just double check.  */
1095         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1096           if (dfa->nodes[node].opr.sbcset[i])
1097             return;
1098         break;
1099       default:
1100         abort ();
1101       }
1102
1103   if (mb_chars || has_period)
1104     for (node = 0; node < dfa->nodes_len; ++node)
1105       {
1106         if (dfa->nodes[node].type == CHARACTER
1107             && dfa->nodes[node].opr.c >= 0x80)
1108           dfa->nodes[node].mb_partial = 0;
1109         else if (dfa->nodes[node].type == OP_PERIOD)
1110           dfa->nodes[node].type = OP_UTF8_PERIOD;
1111       }
1112
1113   /* The search can be in single byte locale.  */
1114   dfa->mb_cur_max = 1;
1115   dfa->is_utf8 = 0;
1116   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1117 }
1118 #endif
1119 \f
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121    "eclosure", and "inveclosure".  */
1122
1123 static reg_errcode_t
1124 analyze (preg)
1125      regex_t *preg;
1126 {
1127   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1128   reg_errcode_t ret;
1129
1130   /* Allocate arrays.  */
1131   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1132   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1136           || dfa->eclosures == NULL, 0))
1137     return REG_ESPACE;
1138
1139   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1140   if (dfa->subexp_map != NULL)
1141     {
1142       int i;
1143       for (i = 0; i < preg->re_nsub; i++)
1144         dfa->subexp_map[i] = i;
1145       preorder (dfa->str_tree, optimize_subexps, dfa);
1146       for (i = 0; i < preg->re_nsub; i++)
1147         if (dfa->subexp_map[i] != i)
1148           break;
1149       if (i == preg->re_nsub)
1150         {
1151           free (dfa->subexp_map);
1152           dfa->subexp_map = NULL;
1153         }
1154     }
1155
1156   ret = postorder (dfa->str_tree, lower_subexps, preg);
1157   if (BE (ret != REG_NOERROR, 0))
1158     return ret;
1159   ret = postorder (dfa->str_tree, calc_first, dfa);
1160   if (BE (ret != REG_NOERROR, 0))
1161     return ret;
1162   preorder (dfa->str_tree, calc_next, dfa);
1163   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164   if (BE (ret != REG_NOERROR, 0))
1165     return ret;
1166   ret = calc_eclosure (dfa);
1167   if (BE (ret != REG_NOERROR, 0))
1168     return ret;
1169
1170   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173       || dfa->nbackref)
1174     {
1175       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176       if (BE (dfa->inveclosures == NULL, 0))
1177         return REG_ESPACE;
1178       ret = calc_inveclosure (dfa);
1179     }
1180
1181   return ret;
1182 }
1183
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185    implement parse tree visits.  Instead, we use parent pointers and
1186    some hairy code in these two functions.  */
1187 static reg_errcode_t
1188 postorder (root, fn, extra)
1189      bin_tree_t *root;
1190      reg_errcode_t (fn (void *, bin_tree_t *));
1191      void *extra;
1192 {
1193   bin_tree_t *node, *prev;
1194
1195   for (node = root; ; )
1196     {
1197       /* Descend down the tree, preferably to the left (or to the right
1198          if that's the only child).  */
1199       while (node->left || node->right)
1200         if (node->left)
1201           node = node->left;
1202         else
1203           node = node->right;
1204
1205       do
1206         {
1207           reg_errcode_t err = fn (extra, node);
1208           if (BE (err != REG_NOERROR, 0))
1209             return err;
1210           if (node->parent == NULL)
1211             return REG_NOERROR;
1212           prev = node;
1213           node = node->parent;
1214         }
1215       /* Go up while we have a node that is reached from the right.  */
1216       while (node->right == prev || node->right == NULL);
1217       node = node->right;
1218     }
1219 }
1220
1221 static reg_errcode_t
1222 preorder (root, fn, extra)
1223      bin_tree_t *root;
1224      reg_errcode_t (fn (void *, bin_tree_t *));
1225      void *extra;
1226 {
1227   bin_tree_t *node;
1228
1229   for (node = root; ; )
1230     {
1231       reg_errcode_t err = fn (extra, node);
1232       if (BE (err != REG_NOERROR, 0))
1233         return err;
1234
1235       /* Go to the left node, or up and to the right.  */
1236       if (node->left)
1237         node = node->left;
1238       else
1239         {
1240           bin_tree_t *prev = NULL;
1241           while (node->right == prev || node->right == NULL)
1242             {
1243               prev = node;
1244               node = node->parent;
1245               if (!node)
1246                 return REG_NOERROR;
1247             }
1248           node = node->right;
1249         }
1250     }
1251 }
1252
1253 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1254    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1255    backreferences as well.  Requires a preorder visit.  */
1256 static reg_errcode_t
1257 optimize_subexps (extra, node)
1258      void *extra;
1259      bin_tree_t *node;
1260 {
1261   re_dfa_t *dfa = (re_dfa_t *) extra;
1262
1263   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1264     {
1265       int idx = node->token.opr.idx;
1266       node->token.opr.idx = dfa->subexp_map[idx];
1267       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1268     }
1269
1270   else if (node->token.type == SUBEXP
1271            && node->left && node->left->token.type == SUBEXP)
1272     {
1273       int other_idx = node->left->token.opr.idx;
1274
1275       node->left = node->left->left;
1276       if (node->left)
1277         node->left->parent = node;
1278
1279       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1280       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1281         dfa->used_bkref_map &= ~(1 << other_idx);
1282     }
1283
1284   return REG_NOERROR;
1285 }
1286
1287 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1288    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1289 static reg_errcode_t
1290 lower_subexps (extra, node)
1291      void *extra;
1292      bin_tree_t *node;
1293 {
1294   regex_t *preg = (regex_t *) extra;
1295   reg_errcode_t err = REG_NOERROR;
1296
1297   if (node->left && node->left->token.type == SUBEXP)
1298     {
1299       node->left = lower_subexp (&err, preg, node->left);
1300       if (node->left)
1301         node->left->parent = node;
1302     }
1303   if (node->right && node->right->token.type == SUBEXP)
1304     {
1305       node->right = lower_subexp (&err, preg, node->right);
1306       if (node->right)
1307         node->right->parent = node;
1308     }
1309
1310   return err;
1311 }
1312
1313 static bin_tree_t *
1314 lower_subexp (err, preg, node)
1315      reg_errcode_t *err;
1316      regex_t *preg;
1317      bin_tree_t *node;
1318 {
1319   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1320   bin_tree_t *body = node->left;
1321   bin_tree_t *op, *cls, *tree1, *tree;
1322
1323   if (preg->no_sub
1324       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1325           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1326     return node->left;
1327
1328   /* Convert the SUBEXP node to the concatenation of an
1329      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1330   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1331   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1332   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1333   tree = create_tree (dfa, op, tree1, CONCAT);
1334   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1335     {
1336       *err = REG_ESPACE;
1337       return NULL;
1338     }
1339
1340   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1341   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1342   return tree;
1343 }
1344
1345 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1346    nodes.  Requires a postorder visit.  */
1347 static reg_errcode_t
1348 calc_first (extra, node)
1349      void *extra;
1350      bin_tree_t *node;
1351 {
1352   re_dfa_t *dfa = (re_dfa_t *) extra;
1353   if (node->token.type == CONCAT)
1354     {
1355       node->first = node->left->first;
1356       node->node_idx = node->left->node_idx;
1357     }
1358   else
1359     {
1360       node->first = node;
1361       node->node_idx = re_dfa_add_node (dfa, node->token);
1362       if (BE (node->node_idx == -1, 0))
1363         return REG_ESPACE;
1364     }
1365   return REG_NOERROR;
1366 }
1367
1368 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1369 static reg_errcode_t
1370 calc_next (extra, node)
1371      void *extra;
1372      bin_tree_t *node;
1373 {
1374   switch (node->token.type)
1375     {
1376     case OP_DUP_ASTERISK:
1377       node->left->next = node;
1378       break;
1379     case CONCAT:
1380       node->left->next = node->right->first;
1381       node->right->next = node->next;
1382       break;
1383     default:
1384       if (node->left)
1385         node->left->next = node->next;
1386       if (node->right)
1387         node->right->next = node->next;
1388       break;
1389     }
1390   return REG_NOERROR;
1391 }
1392
1393 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1394 static reg_errcode_t
1395 link_nfa_nodes (extra, node)
1396      void *extra;
1397      bin_tree_t *node;
1398 {
1399   re_dfa_t *dfa = (re_dfa_t *) extra;
1400   int idx = node->node_idx;
1401   reg_errcode_t err = REG_NOERROR;
1402
1403   switch (node->token.type)
1404     {
1405     case CONCAT:
1406       break;
1407
1408     case END_OF_RE:
1409       assert (node->next == NULL);
1410       break;
1411
1412     case OP_DUP_ASTERISK:
1413     case OP_ALT:
1414       {
1415         int left, right;
1416         dfa->has_plural_match = 1;
1417         if (node->left != NULL)
1418           left = node->left->first->node_idx;
1419         else
1420           left = node->next->node_idx;
1421         if (node->right != NULL)
1422           right = node->right->first->node_idx;
1423         else
1424           right = node->next->node_idx;
1425         assert (left > -1);
1426         assert (right > -1);
1427         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1428       }
1429       break;
1430
1431     case ANCHOR:
1432     case OP_OPEN_SUBEXP:
1433     case OP_CLOSE_SUBEXP:
1434       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1435       break;
1436
1437     case OP_BACK_REF:
1438       dfa->nexts[idx] = node->next->node_idx;
1439       if (node->token.type == OP_BACK_REF)
1440         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1441       break;
1442
1443     default:
1444       assert (!IS_EPSILON_NODE (node->token.type));
1445       dfa->nexts[idx] = node->next->node_idx;
1446       break;
1447     }
1448
1449   return err;
1450 }
1451
1452 /* Duplicate the epsilon closure of the node ROOT_NODE.
1453    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1454    to their own constraint.  */
1455
1456 static reg_errcode_t
1457 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1458                         init_constraint)
1459      re_dfa_t *dfa;
1460      int top_org_node, top_clone_node, root_node;
1461      unsigned int init_constraint;
1462 {
1463   reg_errcode_t err;
1464   int org_node, clone_node, ret;
1465   unsigned int constraint = init_constraint;
1466   for (org_node = top_org_node, clone_node = top_clone_node;;)
1467     {
1468       int org_dest, clone_dest;
1469       if (dfa->nodes[org_node].type == OP_BACK_REF)
1470         {
1471           /* If the back reference epsilon-transit, its destination must
1472              also have the constraint.  Then duplicate the epsilon closure
1473              of the destination of the back reference, and store it in
1474              edests of the back reference.  */
1475           org_dest = dfa->nexts[org_node];
1476           re_node_set_empty (dfa->edests + clone_node);
1477           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1478           if (BE (err != REG_NOERROR, 0))
1479             return err;
1480           dfa->nexts[clone_node] = dfa->nexts[org_node];
1481           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1482           if (BE (ret < 0, 0))
1483             return REG_ESPACE;
1484         }
1485       else if (dfa->edests[org_node].nelem == 0)
1486         {
1487           /* In case of the node can't epsilon-transit, don't duplicate the
1488              destination and store the original destination as the
1489              destination of the node.  */
1490           dfa->nexts[clone_node] = dfa->nexts[org_node];
1491           break;
1492         }
1493       else if (dfa->edests[org_node].nelem == 1)
1494         {
1495           /* In case of the node can epsilon-transit, and it has only one
1496              destination.  */
1497           org_dest = dfa->edests[org_node].elems[0];
1498           re_node_set_empty (dfa->edests + clone_node);
1499           if (dfa->nodes[org_node].type == ANCHOR)
1500             {
1501               /* In case of the node has another constraint, append it.  */
1502               if (org_node == root_node && clone_node != org_node)
1503                 {
1504                   /* ...but if the node is root_node itself, it means the
1505                      epsilon closure have a loop, then tie it to the
1506                      destination of the root_node.  */
1507                   ret = re_node_set_insert (dfa->edests + clone_node,
1508                                             org_dest);
1509                   if (BE (ret < 0, 0))
1510                     return REG_ESPACE;
1511                   break;
1512                 }
1513               constraint |= dfa->nodes[org_node].opr.ctx_type;
1514             }
1515           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1516           if (BE (err != REG_NOERROR, 0))
1517             return err;
1518           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519           if (BE (ret < 0, 0))
1520             return REG_ESPACE;
1521         }
1522       else /* dfa->edests[org_node].nelem == 2 */
1523         {
1524           /* In case of the node can epsilon-transit, and it has two
1525              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1526           org_dest = dfa->edests[org_node].elems[0];
1527           re_node_set_empty (dfa->edests + clone_node);
1528           /* Search for a duplicated node which satisfies the constraint.  */
1529           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530           if (clone_dest == -1)
1531             {
1532               /* There are no such a duplicated node, create a new one.  */
1533               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1534               if (BE (err != REG_NOERROR, 0))
1535                 return err;
1536               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1537               if (BE (ret < 0, 0))
1538                 return REG_ESPACE;
1539               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1540                                             root_node, constraint);
1541               if (BE (err != REG_NOERROR, 0))
1542                 return err;
1543             }
1544           else
1545             {
1546               /* There are a duplicated node which satisfy the constraint,
1547                  use it to avoid infinite loop.  */
1548               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549               if (BE (ret < 0, 0))
1550                 return REG_ESPACE;
1551             }
1552
1553           org_dest = dfa->edests[org_node].elems[1];
1554           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1555           if (BE (err != REG_NOERROR, 0))
1556             return err;
1557           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558           if (BE (ret < 0, 0))
1559             return REG_ESPACE;
1560         }
1561       org_node = org_dest;
1562       clone_node = clone_dest;
1563     }
1564   return REG_NOERROR;
1565 }
1566
1567 /* Search for a node which is duplicated from the node ORG_NODE, and
1568    satisfies the constraint CONSTRAINT.  */
1569
1570 static int
1571 search_duplicated_node (dfa, org_node, constraint)
1572      re_dfa_t *dfa;
1573      int org_node;
1574      unsigned int constraint;
1575 {
1576   int idx;
1577   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1578     {
1579       if (org_node == dfa->org_indices[idx]
1580           && constraint == dfa->nodes[idx].constraint)
1581         return idx; /* Found.  */
1582     }
1583   return -1; /* Not found.  */
1584 }
1585
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1588    otherwise return the error code.  */
1589
1590 static reg_errcode_t
1591 duplicate_node (new_idx, dfa, org_idx, constraint)
1592      re_dfa_t *dfa;
1593      int *new_idx, org_idx;
1594      unsigned int constraint;
1595 {
1596   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597   if (BE (dup_idx == -1, 0))
1598     return REG_ESPACE;
1599   dfa->nodes[dup_idx].constraint = constraint;
1600   if (dfa->nodes[org_idx].type == ANCHOR)
1601     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1602   dfa->nodes[dup_idx].duplicated = 1;
1603
1604   /* Store the index of the original node.  */
1605   dfa->org_indices[dup_idx] = org_idx;
1606   *new_idx = dup_idx;
1607   return REG_NOERROR;
1608 }
1609
1610 static reg_errcode_t
1611 calc_inveclosure (dfa)
1612      re_dfa_t *dfa;
1613 {
1614   int src, idx, ret;
1615   for (idx = 0; idx < dfa->nodes_len; ++idx)
1616     re_node_set_init_empty (dfa->inveclosures + idx);
1617
1618   for (src = 0; src < dfa->nodes_len; ++src)
1619     {
1620       int *elems = dfa->eclosures[src].elems;
1621       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1622         {
1623           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624           if (BE (ret == -1, 0))
1625             return REG_ESPACE;
1626         }
1627     }
1628
1629   return REG_NOERROR;
1630 }
1631
1632 /* Calculate "eclosure" for all the node in DFA.  */
1633
1634 static reg_errcode_t
1635 calc_eclosure (dfa)
1636      re_dfa_t *dfa;
1637 {
1638   int node_idx, incomplete;
1639 #ifdef DEBUG
1640   assert (dfa->nodes_len > 0);
1641 #endif
1642   incomplete = 0;
1643   /* For each nodes, calculate epsilon closure.  */
1644   for (node_idx = 0; ; ++node_idx)
1645     {
1646       reg_errcode_t err;
1647       re_node_set eclosure_elem;
1648       if (node_idx == dfa->nodes_len)
1649         {
1650           if (!incomplete)
1651             break;
1652           incomplete = 0;
1653           node_idx = 0;
1654         }
1655
1656 #ifdef DEBUG
1657       assert (dfa->eclosures[node_idx].nelem != -1);
1658 #endif
1659
1660       /* If we have already calculated, skip it.  */
1661       if (dfa->eclosures[node_idx].nelem != 0)
1662         continue;
1663       /* Calculate epsilon closure of `node_idx'.  */
1664       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1665       if (BE (err != REG_NOERROR, 0))
1666         return err;
1667
1668       if (dfa->eclosures[node_idx].nelem == 0)
1669         {
1670           incomplete = 1;
1671           re_node_set_free (&eclosure_elem);
1672         }
1673     }
1674   return REG_NOERROR;
1675 }
1676
1677 /* Calculate epsilon closure of NODE.  */
1678
1679 static reg_errcode_t
1680 calc_eclosure_iter (new_set, dfa, node, root)
1681      re_node_set *new_set;
1682      re_dfa_t *dfa;
1683      int node, root;
1684 {
1685   reg_errcode_t err;
1686   unsigned int constraint;
1687   int i, incomplete;
1688   re_node_set eclosure;
1689   incomplete = 0;
1690   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691   if (BE (err != REG_NOERROR, 0))
1692     return err;
1693
1694   /* This indicates that we are calculating this node now.
1695      We reference this value to avoid infinite loop.  */
1696   dfa->eclosures[node].nelem = -1;
1697
1698   constraint = ((dfa->nodes[node].type == ANCHOR)
1699                 ? dfa->nodes[node].opr.ctx_type : 0);
1700   /* If the current node has constraints, duplicate all nodes.
1701      Since they must inherit the constraints.  */
1702   if (constraint
1703       && dfa->edests[node].nelem
1704       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1705     {
1706       int org_node, cur_node;
1707       org_node = cur_node = node;
1708       err = duplicate_node_closure (dfa, node, node, node, constraint);
1709       if (BE (err != REG_NOERROR, 0))
1710         return err;
1711     }
1712
1713   /* Expand each epsilon destination nodes.  */
1714   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1715     for (i = 0; i < dfa->edests[node].nelem; ++i)
1716       {
1717         re_node_set eclosure_elem;
1718         int edest = dfa->edests[node].elems[i];
1719         /* If calculating the epsilon closure of `edest' is in progress,
1720            return intermediate result.  */
1721         if (dfa->eclosures[edest].nelem == -1)
1722           {
1723             incomplete = 1;
1724             continue;
1725           }
1726         /* If we haven't calculated the epsilon closure of `edest' yet,
1727            calculate now. Otherwise use calculated epsilon closure.  */
1728         if (dfa->eclosures[edest].nelem == 0)
1729           {
1730             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1731             if (BE (err != REG_NOERROR, 0))
1732               return err;
1733           }
1734         else
1735           eclosure_elem = dfa->eclosures[edest];
1736         /* Merge the epsilon closure of `edest'.  */
1737         re_node_set_merge (&eclosure, &eclosure_elem);
1738         /* If the epsilon closure of `edest' is incomplete,
1739            the epsilon closure of this node is also incomplete.  */
1740         if (dfa->eclosures[edest].nelem == 0)
1741           {
1742             incomplete = 1;
1743             re_node_set_free (&eclosure_elem);
1744           }
1745       }
1746
1747   /* Epsilon closures include itself.  */
1748   re_node_set_insert (&eclosure, node);
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 \f
1757 /* Functions for token which are used in the parser.  */
1758
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static void
1763 fetch_token (result, input, syntax)
1764      re_token_t *result;
1765      re_string_t *input;
1766      reg_syntax_t syntax;
1767 {
1768   re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 }
1770
1771 /* Peek a token from INPUT, and return the length of the token.
1772    We must not use this function inside bracket expressions.  */
1773
1774 static int
1775 peek_token (token, input, syntax)
1776      re_token_t *token;
1777      re_string_t *input;
1778      reg_syntax_t syntax;
1779 {
1780   unsigned char c;
1781
1782   if (re_string_eoi (input))
1783     {
1784       token->type = END_OF_RE;
1785       return 0;
1786     }
1787
1788   c = re_string_peek_byte (input, 0);
1789   token->opr.c = c;
1790
1791   token->word_char = 0;
1792 #ifdef RE_ENABLE_I18N
1793   token->mb_partial = 0;
1794   if (input->mb_cur_max > 1 &&
1795       !re_string_first_byte (input, re_string_cur_idx (input)))
1796     {
1797       token->type = CHARACTER;
1798       token->mb_partial = 1;
1799       return 1;
1800     }
1801 #endif
1802   if (c == '\\')
1803     {
1804       unsigned char c2;
1805       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1806         {
1807           token->type = BACK_SLASH;
1808           return 1;
1809         }
1810
1811       c2 = re_string_peek_byte_case (input, 1);
1812       token->opr.c = c2;
1813       token->type = CHARACTER;
1814 #ifdef RE_ENABLE_I18N
1815       if (input->mb_cur_max > 1)
1816         {
1817           wint_t wc = re_string_wchar_at (input,
1818                                           re_string_cur_idx (input) + 1);
1819           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1820         }
1821       else
1822 #endif
1823         token->word_char = IS_WORD_CHAR (c2) != 0;
1824
1825       switch (c2)
1826         {
1827         case '|':
1828           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1829             token->type = OP_ALT;
1830           break;
1831         case '1': case '2': case '3': case '4': case '5':
1832         case '6': case '7': case '8': case '9':
1833           if (!(syntax & RE_NO_BK_REFS))
1834             {
1835               token->type = OP_BACK_REF;
1836               token->opr.idx = c2 - '1';
1837             }
1838           break;
1839         case '<':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             {
1842               token->type = ANCHOR;
1843               token->opr.ctx_type = WORD_FIRST;
1844             }
1845           break;
1846         case '>':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = WORD_LAST;
1851             }
1852           break;
1853         case 'b':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             {
1856               token->type = ANCHOR;
1857               token->opr.ctx_type = WORD_DELIM;
1858             }
1859           break;
1860         case 'B':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             {
1863               token->type = ANCHOR;
1864               token->opr.ctx_type = NOT_WORD_DELIM;
1865             }
1866           break;
1867         case 'w':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_WORD;
1870           break;
1871         case 'W':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_NOTWORD;
1874           break;
1875         case 's':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_SPACE;
1878           break;
1879         case 'S':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_NOTSPACE;
1882           break;
1883         case '`':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             {
1886               token->type = ANCHOR;
1887               token->opr.ctx_type = BUF_FIRST;
1888             }
1889           break;
1890         case '\'':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             {
1893               token->type = ANCHOR;
1894               token->opr.ctx_type = BUF_LAST;
1895             }
1896           break;
1897         case '(':
1898           if (!(syntax & RE_NO_BK_PARENS))
1899             token->type = OP_OPEN_SUBEXP;
1900           break;
1901         case ')':
1902           if (!(syntax & RE_NO_BK_PARENS))
1903             token->type = OP_CLOSE_SUBEXP;
1904           break;
1905         case '+':
1906           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907             token->type = OP_DUP_PLUS;
1908           break;
1909         case '?':
1910           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911             token->type = OP_DUP_QUESTION;
1912           break;
1913         case '{':
1914           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915             token->type = OP_OPEN_DUP_NUM;
1916           break;
1917         case '}':
1918           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919             token->type = OP_CLOSE_DUP_NUM;
1920           break;
1921         default:
1922           break;
1923         }
1924       return 2;
1925     }
1926
1927   token->type = CHARACTER;
1928 #ifdef RE_ENABLE_I18N
1929   if (input->mb_cur_max > 1)
1930     {
1931       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1932       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1933     }
1934   else
1935 #endif
1936     token->word_char = IS_WORD_CHAR (token->opr.c);
1937
1938   switch (c)
1939     {
1940     case '\n':
1941       if (syntax & RE_NEWLINE_ALT)
1942         token->type = OP_ALT;
1943       break;
1944     case '|':
1945       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1946         token->type = OP_ALT;
1947       break;
1948     case '*':
1949       token->type = OP_DUP_ASTERISK;
1950       break;
1951     case '+':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953         token->type = OP_DUP_PLUS;
1954       break;
1955     case '?':
1956       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957         token->type = OP_DUP_QUESTION;
1958       break;
1959     case '{':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961         token->type = OP_OPEN_DUP_NUM;
1962       break;
1963     case '}':
1964       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965         token->type = OP_CLOSE_DUP_NUM;
1966       break;
1967     case '(':
1968       if (syntax & RE_NO_BK_PARENS)
1969         token->type = OP_OPEN_SUBEXP;
1970       break;
1971     case ')':
1972       if (syntax & RE_NO_BK_PARENS)
1973         token->type = OP_CLOSE_SUBEXP;
1974       break;
1975     case '[':
1976       token->type = OP_OPEN_BRACKET;
1977       break;
1978     case '.':
1979       token->type = OP_PERIOD;
1980       break;
1981     case '^':
1982       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1983           re_string_cur_idx (input) != 0)
1984         {
1985           char prev = re_string_peek_byte (input, -1);
1986           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1987             break;
1988         }
1989       token->type = ANCHOR;
1990       token->opr.ctx_type = LINE_FIRST;
1991       break;
1992     case '$':
1993       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1994           re_string_cur_idx (input) + 1 != re_string_length (input))
1995         {
1996           re_token_t next;
1997           re_string_skip_bytes (input, 1);
1998           peek_token (&next, input, syntax);
1999           re_string_skip_bytes (input, -1);
2000           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2001             break;
2002         }
2003       token->type = ANCHOR;
2004       token->opr.ctx_type = LINE_LAST;
2005       break;
2006     default:
2007       break;
2008     }
2009   return 1;
2010 }
2011
2012 /* Peek a token from INPUT, and return the length of the token.
2013    We must not use this function out of bracket expressions.  */
2014
2015 static int
2016 peek_token_bracket (token, input, syntax)
2017      re_token_t *token;
2018      re_string_t *input;
2019      reg_syntax_t syntax;
2020 {
2021   unsigned char c;
2022   if (re_string_eoi (input))
2023     {
2024       token->type = END_OF_RE;
2025       return 0;
2026     }
2027   c = re_string_peek_byte (input, 0);
2028   token->opr.c = c;
2029
2030 #ifdef RE_ENABLE_I18N
2031   if (input->mb_cur_max > 1 &&
2032       !re_string_first_byte (input, re_string_cur_idx (input)))
2033     {
2034       token->type = CHARACTER;
2035       return 1;
2036     }
2037 #endif /* RE_ENABLE_I18N */
2038
2039   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040       && re_string_cur_idx (input) + 1 < re_string_length (input))
2041     {
2042       /* In this case, '\' escape a character.  */
2043       unsigned char c2;
2044       re_string_skip_bytes (input, 1);
2045       c2 = re_string_peek_byte (input, 0);
2046       token->opr.c = c2;
2047       token->type = CHARACTER;
2048       return 1;
2049     }
2050   if (c == '[') /* '[' is a special char in a bracket exps.  */
2051     {
2052       unsigned char c2;
2053       int token_len;
2054       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055         c2 = re_string_peek_byte (input, 1);
2056       else
2057         c2 = 0;
2058       token->opr.c = c2;
2059       token_len = 2;
2060       switch (c2)
2061         {
2062         case '.':
2063           token->type = OP_OPEN_COLL_ELEM;
2064           break;
2065         case '=':
2066           token->type = OP_OPEN_EQUIV_CLASS;
2067           break;
2068         case ':':
2069           if (syntax & RE_CHAR_CLASSES)
2070             {
2071               token->type = OP_OPEN_CHAR_CLASS;
2072               break;
2073             }
2074           /* else fall through.  */
2075         default:
2076           token->type = CHARACTER;
2077           token->opr.c = c;
2078           token_len = 1;
2079           break;
2080         }
2081       return token_len;
2082     }
2083   switch (c)
2084     {
2085     case '-':
2086       token->type = OP_CHARSET_RANGE;
2087       break;
2088     case ']':
2089       token->type = OP_CLOSE_BRACKET;
2090       break;
2091     case '^':
2092       token->type = OP_NON_MATCH_LIST;
2093       break;
2094     default:
2095       token->type = CHARACTER;
2096     }
2097   return 1;
2098 }
2099 \f
2100 /* Functions for parser.  */
2101
2102 /* Entry point of the parser.
2103    Parse the regular expression REGEXP and return the structure tree.
2104    If an error is occured, ERR is set by error code, and return NULL.
2105    This function build the following tree, from regular expression <reg_exp>:
2106            CAT
2107            / \
2108           /   \
2109    <reg_exp>  EOR
2110
2111    CAT means concatenation.
2112    EOR means end of regular expression.  */
2113
2114 static bin_tree_t *
2115 parse (regexp, preg, syntax, err)
2116      re_string_t *regexp;
2117      regex_t *preg;
2118      reg_syntax_t syntax;
2119      reg_errcode_t *err;
2120 {
2121   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2122   bin_tree_t *tree, *eor, *root;
2123   re_token_t current_token;
2124   dfa->syntax = syntax;
2125   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2128     return NULL;
2129   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130   if (tree != NULL)
2131     root = create_tree (dfa, tree, eor, CONCAT);
2132   else
2133     root = eor;
2134   if (BE (eor == NULL || root == NULL, 0))
2135     {
2136       *err = REG_ESPACE;
2137       return NULL;
2138     }
2139   return root;
2140 }
2141
2142 /* This function build the following tree, from regular expression
2143    <branch1>|<branch2>:
2144            ALT
2145            / \
2146           /   \
2147    <branch1> <branch2>
2148
2149    ALT means alternative, which represents the operator `|'.  */
2150
2151 static bin_tree_t *
2152 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2153      re_string_t *regexp;
2154      regex_t *preg;
2155      re_token_t *token;
2156      reg_syntax_t syntax;
2157      int nest;
2158      reg_errcode_t *err;
2159 {
2160   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2161   bin_tree_t *tree, *branch = NULL;
2162   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2163   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2164     return NULL;
2165
2166   while (token->type == OP_ALT)
2167     {
2168       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2169       if (token->type != OP_ALT && token->type != END_OF_RE
2170           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2171         {
2172           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2173           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2174             return NULL;
2175         }
2176       else
2177         branch = NULL;
2178       tree = create_tree (dfa, tree, branch, OP_ALT);
2179       if (BE (tree == NULL, 0))
2180         {
2181           *err = REG_ESPACE;
2182           return NULL;
2183         }
2184     }
2185   return tree;
2186 }
2187
2188 /* This function build the following tree, from regular expression
2189    <exp1><exp2>:
2190         CAT
2191         / \
2192        /   \
2193    <exp1> <exp2>
2194
2195    CAT means concatenation.  */
2196
2197 static bin_tree_t *
2198 parse_branch (regexp, preg, token, syntax, nest, err)
2199      re_string_t *regexp;
2200      regex_t *preg;
2201      re_token_t *token;
2202      reg_syntax_t syntax;
2203      int nest;
2204      reg_errcode_t *err;
2205 {
2206   bin_tree_t *tree, *exp;
2207   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2208   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2209   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210     return NULL;
2211
2212   while (token->type != OP_ALT && token->type != END_OF_RE
2213          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2214     {
2215       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2216       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2217         {
2218           return NULL;
2219         }
2220       if (tree != NULL && exp != NULL)
2221         {
2222           tree = create_tree (dfa, tree, exp, CONCAT);
2223           if (tree == NULL)
2224             {
2225               *err = REG_ESPACE;
2226               return NULL;
2227             }
2228         }
2229       else if (tree == NULL)
2230         tree = exp;
2231       /* Otherwise exp == NULL, we don't need to create new tree.  */
2232     }
2233   return tree;
2234 }
2235
2236 /* This function build the following tree, from regular expression a*:
2237          *
2238          |
2239          a
2240 */
2241
2242 static bin_tree_t *
2243 parse_expression (regexp, preg, token, syntax, nest, err)
2244      re_string_t *regexp;
2245      regex_t *preg;
2246      re_token_t *token;
2247      reg_syntax_t syntax;
2248      int nest;
2249      reg_errcode_t *err;
2250 {
2251   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2252   bin_tree_t *tree;
2253   switch (token->type)
2254     {
2255     case CHARACTER:
2256       tree = create_token_tree (dfa, NULL, NULL, token);
2257       if (BE (tree == NULL, 0))
2258         {
2259           *err = REG_ESPACE;
2260           return NULL;
2261         }
2262 #ifdef RE_ENABLE_I18N
2263       if (dfa->mb_cur_max > 1)
2264         {
2265           while (!re_string_eoi (regexp)
2266                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2267             {
2268               bin_tree_t *mbc_remain;
2269               fetch_token (token, regexp, syntax);
2270               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2271               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2272               if (BE (mbc_remain == NULL || tree == NULL, 0))
2273                 {
2274                   *err = REG_ESPACE;
2275                   return NULL;
2276                 }
2277             }
2278         }
2279 #endif
2280       break;
2281     case OP_OPEN_SUBEXP:
2282       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2284         return NULL;
2285       break;
2286     case OP_OPEN_BRACKET:
2287       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2289         return NULL;
2290       break;
2291     case OP_BACK_REF:
2292       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2293         {
2294           *err = REG_ESUBREG;
2295           return NULL;
2296         }
2297       dfa->used_bkref_map |= 1 << token->opr.idx;
2298       tree = create_token_tree (dfa, NULL, NULL, token);
2299       if (BE (tree == NULL, 0))
2300         {
2301           *err = REG_ESPACE;
2302           return NULL;
2303         }
2304       ++dfa->nbackref;
2305       dfa->has_mb_node = 1;
2306       break;
2307     case OP_OPEN_DUP_NUM:
2308       if (syntax & RE_CONTEXT_INVALID_DUP)
2309         {
2310           *err = REG_BADRPT;
2311           return NULL;
2312         }
2313       /* FALLTHROUGH */
2314     case OP_DUP_ASTERISK:
2315     case OP_DUP_PLUS:
2316     case OP_DUP_QUESTION:
2317       if (syntax & RE_CONTEXT_INVALID_OPS)
2318         {
2319           *err = REG_BADRPT;
2320           return NULL;
2321         }
2322       else if (syntax & RE_CONTEXT_INDEP_OPS)
2323         {
2324           fetch_token (token, regexp, syntax);
2325           return parse_expression (regexp, preg, token, syntax, nest, err);
2326         }
2327       /* else fall through  */
2328     case OP_CLOSE_SUBEXP:
2329       if ((token->type == OP_CLOSE_SUBEXP) &&
2330           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2331         {
2332           *err = REG_ERPAREN;
2333           return NULL;
2334         }
2335       /* else fall through  */
2336     case OP_CLOSE_DUP_NUM:
2337       /* We treat it as a normal character.  */
2338
2339       /* Then we can these characters as normal characters.  */
2340       token->type = CHARACTER;
2341       /* mb_partial and word_char bits should be initialized already
2342          by peek_token.  */
2343       tree = create_token_tree (dfa, NULL, NULL, token);
2344       if (BE (tree == NULL, 0))
2345         {
2346           *err = REG_ESPACE;
2347           return NULL;
2348         }
2349       break;
2350     case ANCHOR:
2351       if ((token->opr.ctx_type
2352            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2353           && dfa->word_ops_used == 0)
2354         init_word_char (dfa);
2355       if (token->opr.ctx_type == WORD_DELIM
2356           || token->opr.ctx_type == NOT_WORD_DELIM)
2357         {
2358           bin_tree_t *tree_first, *tree_last;
2359           if (token->opr.ctx_type == WORD_DELIM)
2360             {
2361               token->opr.ctx_type = WORD_FIRST;
2362               tree_first = create_token_tree (dfa, NULL, NULL, token);
2363               token->opr.ctx_type = WORD_LAST;
2364             }
2365           else
2366             {
2367               token->opr.ctx_type = INSIDE_WORD;
2368               tree_first = create_token_tree (dfa, NULL, NULL, token);
2369               token->opr.ctx_type = INSIDE_NOTWORD;
2370             }
2371           tree_last = create_token_tree (dfa, NULL, NULL, token);
2372           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2373           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2374             {
2375               *err = REG_ESPACE;
2376               return NULL;
2377             }
2378         }
2379       else
2380         {
2381           tree = create_token_tree (dfa, NULL, NULL, token);
2382           if (BE (tree == NULL, 0))
2383             {
2384               *err = REG_ESPACE;
2385               return NULL;
2386             }
2387         }
2388       /* We must return here, since ANCHORs can't be followed
2389          by repetition operators.
2390          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2391              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2392       fetch_token (token, regexp, syntax);
2393       return tree;
2394     case OP_PERIOD:
2395       tree = create_token_tree (dfa, NULL, NULL, token);
2396       if (BE (tree == NULL, 0))
2397         {
2398           *err = REG_ESPACE;
2399           return NULL;
2400         }
2401       if (dfa->mb_cur_max > 1)
2402         dfa->has_mb_node = 1;
2403       break;
2404     case OP_WORD:
2405     case OP_NOTWORD:
2406       tree = build_charclass_op (dfa, regexp->trans,
2407                                  (const unsigned char *) "alnum",
2408                                  (const unsigned char *) "_",
2409                                  token->type == OP_NOTWORD, err);
2410       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2411         return NULL;
2412       break;
2413     case OP_SPACE:
2414     case OP_NOTSPACE:
2415       tree = build_charclass_op (dfa, regexp->trans,
2416                                  (const unsigned char *) "space",
2417                                  (const unsigned char *) "",
2418                                  token->type == OP_NOTSPACE, err);
2419       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420         return NULL;
2421       break;
2422     case OP_ALT:
2423     case END_OF_RE:
2424       return NULL;
2425     case BACK_SLASH:
2426       *err = REG_EESCAPE;
2427       return NULL;
2428     default:
2429       /* Must not happen?  */
2430 #ifdef DEBUG
2431       assert (0);
2432 #endif
2433       return NULL;
2434     }
2435   fetch_token (token, regexp, syntax);
2436
2437   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2438          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2439     {
2440       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2441       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2442         return NULL;
2443       /* In BRE consecutive duplications are not allowed.  */
2444       if ((syntax & RE_CONTEXT_INVALID_DUP)
2445           && (token->type == OP_DUP_ASTERISK
2446               || token->type == OP_OPEN_DUP_NUM))
2447         {
2448           *err = REG_BADRPT;
2449           return NULL;
2450         }
2451     }
2452
2453   return tree;
2454 }
2455
2456 /* This function build the following tree, from regular expression
2457    (<reg_exp>):
2458          SUBEXP
2459             |
2460         <reg_exp>
2461 */
2462
2463 static bin_tree_t *
2464 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2465      re_string_t *regexp;
2466      regex_t *preg;
2467      re_token_t *token;
2468      reg_syntax_t syntax;
2469      int nest;
2470      reg_errcode_t *err;
2471 {
2472   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2473   bin_tree_t *tree;
2474   size_t cur_nsub;
2475   cur_nsub = preg->re_nsub++;
2476
2477   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2478
2479   /* The subexpression may be a null string.  */
2480   if (token->type == OP_CLOSE_SUBEXP)
2481     tree = NULL;
2482   else
2483     {
2484       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2485       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2486         *err = REG_EPAREN;
2487       if (BE (*err != REG_NOERROR, 0))
2488         return NULL;
2489     }
2490   dfa->completed_bkref_map |= 1 << cur_nsub;
2491
2492   tree = create_tree (dfa, tree, NULL, SUBEXP);
2493   if (BE (tree == NULL, 0))
2494     {
2495       *err = REG_ESPACE;
2496       return NULL;
2497     }
2498   tree->token.opr.idx = cur_nsub;
2499   return tree;
2500 }
2501
2502 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2503
2504 static bin_tree_t *
2505 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2506      bin_tree_t *elem;
2507      re_string_t *regexp;
2508      re_dfa_t *dfa;
2509      re_token_t *token;
2510      reg_syntax_t syntax;
2511      reg_errcode_t *err;
2512 {
2513   bin_tree_t *tree = NULL, *old_tree = NULL;
2514   int i, start, end, start_idx = re_string_cur_idx (regexp);
2515   re_token_t start_token = *token;
2516
2517   if (token->type == OP_OPEN_DUP_NUM)
2518     {
2519       end = 0;
2520       start = fetch_number (regexp, token, syntax);
2521       if (start == -1)
2522         {
2523           if (token->type == CHARACTER && token->opr.c == ',')
2524             start = 0; /* We treat "{,m}" as "{0,m}".  */
2525           else
2526             {
2527               *err = REG_BADBR; /* <re>{} is invalid.  */
2528               return NULL;
2529             }
2530         }
2531       if (BE (start != -2, 1))
2532         {
2533           /* We treat "{n}" as "{n,n}".  */
2534           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2535                  : ((token->type == CHARACTER && token->opr.c == ',')
2536                     ? fetch_number (regexp, token, syntax) : -2));
2537         }
2538       if (BE (start == -2 || end == -2, 0))
2539         {
2540           /* Invalid sequence.  */
2541           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2542             {
2543               if (token->type == END_OF_RE)
2544                 *err = REG_EBRACE;
2545               else
2546                 *err = REG_BADBR;
2547
2548               return NULL;
2549             }
2550
2551           /* If the syntax bit is set, rollback.  */
2552           re_string_set_index (regexp, start_idx);
2553           *token = start_token;
2554           token->type = CHARACTER;
2555           /* mb_partial and word_char bits should be already initialized by
2556              peek_token.  */
2557           return elem;
2558         }
2559
2560       if (BE (end != -1 && start > end, 0))
2561         {
2562           /* First number greater than second.  */
2563           *err = REG_BADBR;
2564           return NULL;
2565         }
2566     }
2567   else
2568     {
2569       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2570       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2571     }
2572
2573   fetch_token (token, regexp, syntax);
2574
2575   if (BE (elem == NULL, 0))
2576     return NULL;
2577   if (BE (start == 0 && end == 0, 0))
2578     {
2579       postorder (elem, free_tree, NULL);
2580       return NULL;
2581     }
2582
2583   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2584   if (BE (start > 0, 0))
2585     {
2586       tree = elem;
2587       for (i = 2; i <= start; ++i)
2588         {
2589           elem = duplicate_tree (elem, dfa);
2590           tree = create_tree (dfa, tree, elem, CONCAT);
2591           if (BE (elem == NULL || tree == NULL, 0))
2592             goto parse_dup_op_espace;
2593         }
2594
2595       if (start == end)
2596         return tree;
2597
2598       /* Duplicate ELEM before it is marked optional.  */
2599       elem = duplicate_tree (elem, dfa);
2600       old_tree = tree;
2601     }
2602   else
2603     old_tree = NULL;
2604
2605   if (elem->token.type == SUBEXP)
2606     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2607
2608   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2609   if (BE (tree == NULL, 0))
2610     goto parse_dup_op_espace;
2611
2612   /* This loop is actually executed only when end != -1,
2613      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2614      already created the start+1-th copy.  */
2615   for (i = start + 2; i <= end; ++i)
2616     {
2617       elem = duplicate_tree (elem, dfa);
2618       tree = create_tree (dfa, tree, elem, CONCAT);
2619       if (BE (elem == NULL || tree == NULL, 0))
2620         goto parse_dup_op_espace;
2621
2622       tree = create_tree (dfa, tree, NULL, OP_ALT);
2623       if (BE (tree == NULL, 0))
2624         goto parse_dup_op_espace;
2625     }
2626
2627   if (old_tree)
2628     tree = create_tree (dfa, old_tree, tree, CONCAT);
2629
2630   return tree;
2631
2632  parse_dup_op_espace:
2633   *err = REG_ESPACE;
2634   return NULL;
2635 }
2636
2637 /* Size of the names for collating symbol/equivalence_class/character_class.
2638    I'm not sure, but maybe enough.  */
2639 #define BRACKET_NAME_BUF_SIZE 32
2640
2641 #ifndef _LIBC
2642   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2643      Build the range expression which starts from START_ELEM, and ends
2644      at END_ELEM.  The result are written to MBCSET and SBCSET.
2645      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2646      mbcset->range_ends, is a pointer argument sinse we may
2647      update it.  */
2648
2649 static reg_errcode_t
2650 # ifdef RE_ENABLE_I18N
2651 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2652      re_charset_t *mbcset;
2653      int *range_alloc;
2654 # else /* not RE_ENABLE_I18N */
2655 build_range_exp (sbcset, start_elem, end_elem)
2656 # endif /* not RE_ENABLE_I18N */
2657      re_bitset_ptr_t sbcset;
2658      bracket_elem_t *start_elem, *end_elem;
2659 {
2660   unsigned int start_ch, end_ch;
2661   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2662   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2663           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2664           0))
2665     return REG_ERANGE;
2666
2667   /* We can handle no multi character collating elements without libc
2668      support.  */
2669   if (BE ((start_elem->type == COLL_SYM
2670            && strlen ((char *) start_elem->opr.name) > 1)
2671           || (end_elem->type == COLL_SYM
2672               && strlen ((char *) end_elem->opr.name) > 1), 0))
2673     return REG_ECOLLATE;
2674
2675 # ifdef RE_ENABLE_I18N
2676   {
2677     wchar_t wc, start_wc, end_wc;
2678     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2679
2680     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2681                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2682                    : 0));
2683     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2684               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2685                  : 0));
2686     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2687                 ? __btowc (start_ch) : start_elem->opr.wch);
2688     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2689               ? __btowc (end_ch) : end_elem->opr.wch);
2690     if (start_wc == WEOF || end_wc == WEOF)
2691       return REG_ECOLLATE;
2692     cmp_buf[0] = start_wc;
2693     cmp_buf[4] = end_wc;
2694     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2695       return REG_ERANGE;
2696
2697     /* Got valid collation sequence values, add them as a new entry.
2698        However, for !_LIBC we have no collation elements: if the
2699        character set is single byte, the single byte character set
2700        that we build below suffices.  parse_bracket_exp passes
2701        no MBCSET if dfa->mb_cur_max == 1.  */
2702     if (mbcset)
2703       {
2704         /* Check the space of the arrays.  */
2705         if (BE (*range_alloc == mbcset->nranges, 0))
2706           {
2707             /* There is not enough space, need realloc.  */
2708             wchar_t *new_array_start, *new_array_end;
2709             int new_nranges;
2710
2711             /* +1 in case of mbcset->nranges is 0.  */
2712             new_nranges = 2 * mbcset->nranges + 1;
2713             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2714                are NULL if *range_alloc == 0.  */
2715             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2716                                           new_nranges);
2717             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2718                                         new_nranges);
2719
2720             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2721               return REG_ESPACE;
2722
2723             mbcset->range_starts = new_array_start;
2724             mbcset->range_ends = new_array_end;
2725             *range_alloc = new_nranges;
2726           }
2727
2728         mbcset->range_starts[mbcset->nranges] = start_wc;
2729         mbcset->range_ends[mbcset->nranges++] = end_wc;
2730       }
2731
2732     /* Build the table for single byte characters.  */
2733     for (wc = 0; wc < SBC_MAX; ++wc)
2734       {
2735         cmp_buf[2] = wc;
2736         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2737             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2738           bitset_set (sbcset, wc);
2739       }
2740   }
2741 # else /* not RE_ENABLE_I18N */
2742   {
2743     unsigned int ch;
2744     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2745                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2746                    : 0));
2747     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2748               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2749                  : 0));
2750     if (start_ch > end_ch)
2751       return REG_ERANGE;
2752     /* Build the table for single byte characters.  */
2753     for (ch = 0; ch < SBC_MAX; ++ch)
2754       if (start_ch <= ch  && ch <= end_ch)
2755         bitset_set (sbcset, ch);
2756   }
2757 # endif /* not RE_ENABLE_I18N */
2758   return REG_NOERROR;
2759 }
2760 #endif /* not _LIBC */
2761
2762 #ifndef _LIBC
2763 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2764    Build the collating element which is represented by NAME.
2765    The result are written to MBCSET and SBCSET.
2766    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2767    pointer argument since we may update it.  */
2768
2769 static reg_errcode_t
2770 # ifdef RE_ENABLE_I18N
2771 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2772      re_charset_t *mbcset;
2773      int *coll_sym_alloc;
2774 # else /* not RE_ENABLE_I18N */
2775 build_collating_symbol (sbcset, name)
2776 # endif /* not RE_ENABLE_I18N */
2777      re_bitset_ptr_t sbcset;
2778      const unsigned char *name;
2779 {
2780   size_t name_len = strlen ((const char *) name);
2781   if (BE (name_len != 1, 0))
2782     return REG_ECOLLATE;
2783   else
2784     {
2785       bitset_set (sbcset, name[0]);
2786       return REG_NOERROR;
2787     }
2788 }
2789 #endif /* not _LIBC */
2790
2791 /* This function parse bracket expression like "[abc]", "[a-c]",
2792    "[[.a-a.]]" etc.  */
2793
2794 static bin_tree_t *
2795 parse_bracket_exp (regexp, dfa, token, syntax, err)
2796      re_string_t *regexp;
2797      re_dfa_t *dfa;
2798      re_token_t *token;
2799      reg_syntax_t syntax;
2800      reg_errcode_t *err;
2801 {
2802 #ifdef _LIBC
2803   const unsigned char *collseqmb;
2804   const char *collseqwc;
2805   uint32_t nrules;
2806   int32_t table_size;
2807   const int32_t *symb_table;
2808   const unsigned char *extra;
2809
2810   /* Local function for parse_bracket_exp used in _LIBC environement.
2811      Seek the collating symbol entry correspondings to NAME.
2812      Return the index of the symbol in the SYMB_TABLE.  */
2813
2814   auto inline int32_t
2815   __attribute ((always_inline))
2816   seek_collating_symbol_entry (name, name_len)
2817          const unsigned char *name;
2818          size_t name_len;
2819     {
2820       int32_t hash = elem_hash ((const char *) name, name_len);
2821       int32_t elem = hash % table_size;
2822       int32_t second = hash % (table_size - 2);
2823       while (symb_table[2 * elem] != 0)
2824         {
2825           /* First compare the hashing value.  */
2826           if (symb_table[2 * elem] == hash
2827               /* Compare the length of the name.  */
2828               && name_len == extra[symb_table[2 * elem + 1]]
2829               /* Compare the name.  */
2830               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2831                          name_len) == 0)
2832             {
2833               /* Yep, this is the entry.  */
2834               break;
2835             }
2836
2837           /* Next entry.  */
2838           elem += second;
2839         }
2840       return elem;
2841     }
2842
2843   /* Local function for parse_bracket_exp used in _LIBC environement.
2844      Look up the collation sequence value of BR_ELEM.
2845      Return the value if succeeded, UINT_MAX otherwise.  */
2846
2847   auto inline unsigned int
2848   __attribute ((always_inline))
2849   lookup_collation_sequence_value (br_elem)
2850          bracket_elem_t *br_elem;
2851     {
2852       if (br_elem->type == SB_CHAR)
2853         {
2854           /*
2855           if (MB_CUR_MAX == 1)
2856           */
2857           if (nrules == 0)
2858             return collseqmb[br_elem->opr.ch];
2859           else
2860             {
2861               wint_t wc = __btowc (br_elem->opr.ch);
2862               return __collseq_table_lookup (collseqwc, wc);
2863             }
2864         }
2865       else if (br_elem->type == MB_CHAR)
2866         {
2867           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2868         }
2869       else if (br_elem->type == COLL_SYM)
2870         {
2871           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2872           if (nrules != 0)
2873             {
2874               int32_t elem, idx;
2875               elem = seek_collating_symbol_entry (br_elem->opr.name,
2876                                                   sym_name_len);
2877               if (symb_table[2 * elem] != 0)
2878                 {
2879                   /* We found the entry.  */
2880                   idx = symb_table[2 * elem + 1];
2881                   /* Skip the name of collating element name.  */
2882                   idx += 1 + extra[idx];
2883                   /* Skip the byte sequence of the collating element.  */
2884                   idx += 1 + extra[idx];
2885                   /* Adjust for the alignment.  */
2886                   idx = (idx + 3) & ~3;
2887                   /* Skip the multibyte collation sequence value.  */
2888                   idx += sizeof (unsigned int);
2889                   /* Skip the wide char sequence of the collating element.  */
2890                   idx += sizeof (unsigned int) *
2891                     (1 + *(unsigned int *) (extra + idx));
2892                   /* Return the collation sequence value.  */
2893                   return *(unsigned int *) (extra + idx);
2894                 }
2895               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2896                 {
2897                   /* No valid character.  Match it as a single byte
2898                      character.  */
2899                   return collseqmb[br_elem->opr.name[0]];
2900                 }
2901             }
2902           else if (sym_name_len == 1)
2903             return collseqmb[br_elem->opr.name[0]];
2904         }
2905       return UINT_MAX;
2906     }
2907
2908   /* Local function for parse_bracket_exp used in _LIBC environement.
2909      Build the range expression which starts from START_ELEM, and ends
2910      at END_ELEM.  The result are written to MBCSET and SBCSET.
2911      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2912      mbcset->range_ends, is a pointer argument sinse we may
2913      update it.  */
2914
2915   auto inline reg_errcode_t
2916   __attribute ((always_inline))
2917   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2918          re_charset_t *mbcset;
2919          int *range_alloc;
2920          re_bitset_ptr_t sbcset;
2921          bracket_elem_t *start_elem, *end_elem;
2922     {
2923       unsigned int ch;
2924       uint32_t start_collseq;
2925       uint32_t end_collseq;
2926
2927       /* Equivalence Classes and Character Classes can't be a range
2928          start/end.  */
2929       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2930               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2931               0))
2932         return REG_ERANGE;
2933
2934       start_collseq = lookup_collation_sequence_value (start_elem);
2935       end_collseq = lookup_collation_sequence_value (end_elem);
2936       /* Check start/end collation sequence values.  */
2937       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2938         return REG_ECOLLATE;
2939       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2940         return REG_ERANGE;
2941
2942       /* Got valid collation sequence values, add them as a new entry.
2943          However, if we have no collation elements, and the character set
2944          is single byte, the single byte character set that we
2945          build below suffices. */
2946       if (nrules > 0 || dfa->mb_cur_max > 1)
2947         {
2948           /* Check the space of the arrays.  */
2949           if (BE (*range_alloc == mbcset->nranges, 0))
2950             {
2951               /* There is not enough space, need realloc.  */
2952               uint32_t *new_array_start;
2953               uint32_t *new_array_end;
2954               int new_nranges;
2955
2956               /* +1 in case of mbcset->nranges is 0.  */
2957               new_nranges = 2 * mbcset->nranges + 1;
2958               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2959                                             new_nranges);
2960               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2961                                           new_nranges);
2962
2963               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2964                 return REG_ESPACE;
2965
2966               mbcset->range_starts = new_array_start;
2967               mbcset->range_ends = new_array_end;
2968               *range_alloc = new_nranges;
2969             }
2970
2971           mbcset->range_starts[mbcset->nranges] = start_collseq;
2972           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2973         }
2974
2975       /* Build the table for single byte characters.  */
2976       for (ch = 0; ch < SBC_MAX; ch++)
2977         {
2978           uint32_t ch_collseq;
2979           /*
2980           if (MB_CUR_MAX == 1)
2981           */
2982           if (nrules == 0)
2983             ch_collseq = collseqmb[ch];
2984           else
2985             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2986           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2987             bitset_set (sbcset, ch);
2988         }
2989       return REG_NOERROR;
2990     }
2991
2992   /* Local function for parse_bracket_exp used in _LIBC environement.
2993      Build the collating element which is represented by NAME.
2994      The result are written to MBCSET and SBCSET.
2995      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2996      pointer argument sinse we may update it.  */
2997
2998   auto inline reg_errcode_t
2999   __attribute ((always_inline))
3000   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3001          re_charset_t *mbcset;
3002          int *coll_sym_alloc;
3003          re_bitset_ptr_t sbcset;
3004          const unsigned char *name;
3005     {
3006       int32_t elem, idx;
3007       size_t name_len = strlen ((const char *) name);
3008       if (nrules != 0)
3009         {
3010           elem = seek_collating_symbol_entry (name, name_len);
3011           if (symb_table[2 * elem] != 0)
3012             {
3013               /* We found the entry.  */
3014               idx = symb_table[2 * elem + 1];
3015               /* Skip the name of collating element name.  */
3016               idx += 1 + extra[idx];
3017             }
3018           else if (symb_table[2 * elem] == 0 && name_len == 1)
3019             {
3020               /* No valid character, treat it as a normal
3021                  character.  */
3022               bitset_set (sbcset, name[0]);
3023               return REG_NOERROR;
3024             }
3025           else
3026             return REG_ECOLLATE;
3027
3028           /* Got valid collation sequence, add it as a new entry.  */
3029           /* Check the space of the arrays.  */
3030           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3031             {
3032               /* Not enough, realloc it.  */
3033               /* +1 in case of mbcset->ncoll_syms is 0.  */
3034               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3035               /* Use realloc since mbcset->coll_syms is NULL
3036                  if *alloc == 0.  */
3037               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3038                                                    new_coll_sym_alloc);
3039               if (BE (new_coll_syms == NULL, 0))
3040                 return REG_ESPACE;
3041               mbcset->coll_syms = new_coll_syms;
3042               *coll_sym_alloc = new_coll_sym_alloc;
3043             }
3044           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3045           return REG_NOERROR;
3046         }
3047       else
3048         {
3049           if (BE (name_len != 1, 0))
3050             return REG_ECOLLATE;
3051           else
3052             {
3053               bitset_set (sbcset, name[0]);
3054               return REG_NOERROR;
3055             }
3056         }
3057     }
3058 #endif
3059
3060   re_token_t br_token;
3061   re_bitset_ptr_t sbcset;
3062 #ifdef RE_ENABLE_I18N
3063   re_charset_t *mbcset;
3064   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3065   int equiv_class_alloc = 0, char_class_alloc = 0;
3066 #endif /* not RE_ENABLE_I18N */
3067   int non_match = 0;
3068   bin_tree_t *work_tree;
3069   int token_len;
3070   int first_round = 1;
3071 #ifdef _LIBC
3072   collseqmb = (const unsigned char *)
3073     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3074   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3075   if (nrules)
3076     {
3077       /*
3078       if (MB_CUR_MAX > 1)
3079       */
3080         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3081       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3082       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3083                                                   _NL_COLLATE_SYMB_TABLEMB);
3084       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3085                                                    _NL_COLLATE_SYMB_EXTRAMB);
3086     }
3087 #endif
3088   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3089 #ifdef RE_ENABLE_I18N
3090   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3091 #endif /* RE_ENABLE_I18N */
3092 #ifdef RE_ENABLE_I18N
3093   if (BE (sbcset == NULL || mbcset == NULL, 0))
3094 #else
3095   if (BE (sbcset == NULL, 0))
3096 #endif /* RE_ENABLE_I18N */
3097     {
3098       *err = REG_ESPACE;
3099       return NULL;
3100     }
3101
3102   token_len = peek_token_bracket (token, regexp, syntax);
3103   if (BE (token->type == END_OF_RE, 0))
3104     {
3105       *err = REG_BADPAT;
3106       goto parse_bracket_exp_free_return;
3107     }
3108   if (token->type == OP_NON_MATCH_LIST)
3109     {
3110 #ifdef RE_ENABLE_I18N
3111       mbcset->non_match = 1;
3112 #endif /* not RE_ENABLE_I18N */
3113       non_match = 1;
3114       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3115         bitset_set (sbcset, '\0');
3116       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3117       token_len = peek_token_bracket (token, regexp, syntax);
3118       if (BE (token->type == END_OF_RE, 0))
3119         {
3120           *err = REG_BADPAT;
3121           goto parse_bracket_exp_free_return;
3122         }
3123     }
3124
3125   /* We treat the first ']' as a normal character.  */
3126   if (token->type == OP_CLOSE_BRACKET)
3127     token->type = CHARACTER;
3128
3129   while (1)
3130     {
3131       bracket_elem_t start_elem, end_elem;
3132       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3133       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3134       reg_errcode_t ret;
3135       int token_len2 = 0, is_range_exp = 0;
3136       re_token_t token2;
3137
3138       start_elem.opr.name = start_name_buf;
3139       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3140                                    syntax, first_round);
3141       if (BE (ret != REG_NOERROR, 0))
3142         {
3143           *err = ret;
3144           goto parse_bracket_exp_free_return;
3145         }
3146       first_round = 0;
3147
3148       /* Get information about the next token.  We need it in any case.  */
3149       token_len = peek_token_bracket (token, regexp, syntax);
3150
3151       /* Do not check for ranges if we know they are not allowed.  */
3152       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3153         {
3154           if (BE (token->type == END_OF_RE, 0))
3155             {
3156               *err = REG_EBRACK;
3157               goto parse_bracket_exp_free_return;
3158             }
3159           if (token->type == OP_CHARSET_RANGE)
3160             {
3161               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3162               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3163               if (BE (token2.type == END_OF_RE, 0))
3164                 {
3165                   *err = REG_EBRACK;
3166                   goto parse_bracket_exp_free_return;
3167                 }
3168               if (token2.type == OP_CLOSE_BRACKET)
3169                 {
3170                   /* We treat the last '-' as a normal character.  */
3171                   re_string_skip_bytes (regexp, -token_len);
3172                   token->type = CHARACTER;
3173                 }
3174               else
3175                 is_range_exp = 1;
3176             }
3177         }
3178
3179       if (is_range_exp == 1)
3180         {
3181           end_elem.opr.name = end_name_buf;
3182           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3183                                        dfa, syntax, 1);
3184           if (BE (ret != REG_NOERROR, 0))
3185             {
3186               *err = ret;
3187               goto parse_bracket_exp_free_return;
3188             }
3189
3190           token_len = peek_token_bracket (token, regexp, syntax);
3191
3192 #ifdef _LIBC
3193           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3194                                   &start_elem, &end_elem);
3195 #else
3196 # ifdef RE_ENABLE_I18N
3197           *err = build_range_exp (sbcset,
3198                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3199                                   &range_alloc, &start_elem, &end_elem);
3200 # else
3201           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3202 # endif
3203 #endif /* RE_ENABLE_I18N */
3204           if (BE (*err != REG_NOERROR, 0))
3205             goto parse_bracket_exp_free_return;
3206         }
3207       else
3208         {
3209           switch (start_elem.type)
3210             {
3211             case SB_CHAR:
3212               bitset_set (sbcset, start_elem.opr.ch);
3213               break;
3214 #ifdef RE_ENABLE_I18N
3215             case MB_CHAR:
3216               /* Check whether the array has enough space.  */
3217               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3218                 {
3219                   wchar_t *new_mbchars;
3220                   /* Not enough, realloc it.  */
3221                   /* +1 in case of mbcset->nmbchars is 0.  */
3222                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3223                   /* Use realloc since array is NULL if *alloc == 0.  */
3224                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3225                                             mbchar_alloc);
3226                   if (BE (new_mbchars == NULL, 0))
3227                     goto parse_bracket_exp_espace;
3228                   mbcset->mbchars = new_mbchars;
3229                 }
3230               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3231               break;
3232 #endif /* RE_ENABLE_I18N */
3233             case EQUIV_CLASS:
3234               *err = build_equiv_class (sbcset,
3235 #ifdef RE_ENABLE_I18N
3236                                         mbcset, &equiv_class_alloc,
3237 #endif /* RE_ENABLE_I18N */
3238                                         start_elem.opr.name);
3239               if (BE (*err != REG_NOERROR, 0))
3240                 goto parse_bracket_exp_free_return;
3241               break;
3242             case COLL_SYM:
3243               *err = build_collating_symbol (sbcset,
3244 #ifdef RE_ENABLE_I18N
3245                                              mbcset, &coll_sym_alloc,
3246 #endif /* RE_ENABLE_I18N */
3247                                              start_elem.opr.name);
3248               if (BE (*err != REG_NOERROR, 0))
3249                 goto parse_bracket_exp_free_return;
3250               break;
3251             case CHAR_CLASS:
3252               *err = build_charclass (regexp->trans, sbcset,
3253 #ifdef RE_ENABLE_I18N
3254                                       mbcset, &char_class_alloc,
3255 #endif /* RE_ENABLE_I18N */
3256                                       start_elem.opr.name, syntax);
3257               if (BE (*err != REG_NOERROR, 0))
3258                goto parse_bracket_exp_free_return;
3259               break;
3260             default:
3261               assert (0);
3262               break;
3263             }
3264         }
3265       if (BE (token->type == END_OF_RE, 0))
3266         {
3267           *err = REG_EBRACK;
3268           goto parse_bracket_exp_free_return;
3269         }
3270       if (token->type == OP_CLOSE_BRACKET)
3271         break;
3272     }
3273
3274   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3275
3276   /* If it is non-matching list.  */
3277   if (non_match)
3278     bitset_not (sbcset);
3279
3280 #ifdef RE_ENABLE_I18N
3281   /* Ensure only single byte characters are set.  */
3282   if (dfa->mb_cur_max > 1)
3283     bitset_mask (sbcset, dfa->sb_char);
3284
3285   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3286       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3287                                                      || mbcset->non_match)))
3288     {
3289       bin_tree_t *mbc_tree;
3290       int sbc_idx;
3291       /* Build a tree for complex bracket.  */
3292       dfa->has_mb_node = 1;
3293       br_token.type = COMPLEX_BRACKET;
3294       br_token.opr.mbcset = mbcset;
3295       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296       if (BE (mbc_tree == NULL, 0))
3297         goto parse_bracket_exp_espace;
3298       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3299         if (sbcset[sbc_idx])
3300           break;
3301       /* If there are no bits set in sbcset, there is no point
3302          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3303       if (sbc_idx < BITSET_UINTS)
3304         {
3305           /* Build a tree for simple bracket.  */
3306           br_token.type = SIMPLE_BRACKET;
3307           br_token.opr.sbcset = sbcset;
3308           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3309           if (BE (work_tree == NULL, 0))
3310             goto parse_bracket_exp_espace;
3311
3312           /* Then join them by ALT node.  */
3313           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3314           if (BE (work_tree == NULL, 0))
3315             goto parse_bracket_exp_espace;
3316         }
3317       else
3318         {
3319           re_free (sbcset);
3320           work_tree = mbc_tree;
3321         }
3322     }
3323   else
3324 #endif /* not RE_ENABLE_I18N */
3325     {
3326 #ifdef RE_ENABLE_I18N
3327       free_charset (mbcset);
3328 #endif
3329       /* Build a tree for simple bracket.  */
3330       br_token.type = SIMPLE_BRACKET;
3331       br_token.opr.sbcset = sbcset;
3332       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3333       if (BE (work_tree == NULL, 0))
3334         goto parse_bracket_exp_espace;
3335     }
3336   return work_tree;
3337
3338  parse_bracket_exp_espace:
3339   *err = REG_ESPACE;
3340  parse_bracket_exp_free_return:
3341   re_free (sbcset);
3342 #ifdef RE_ENABLE_I18N
3343   free_charset (mbcset);
3344 #endif /* RE_ENABLE_I18N */
3345   return NULL;
3346 }
3347
3348 /* Parse an element in the bracket expression.  */
3349
3350 static reg_errcode_t
3351 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3352                        accept_hyphen)
3353      bracket_elem_t *elem;
3354      re_string_t *regexp;
3355      re_token_t *token;
3356      int token_len;
3357      re_dfa_t *dfa;
3358      reg_syntax_t syntax;
3359      int accept_hyphen;
3360 {
3361 #ifdef RE_ENABLE_I18N
3362   int cur_char_size;
3363   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3364   if (cur_char_size > 1)
3365     {
3366       elem->type = MB_CHAR;
3367       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3368       re_string_skip_bytes (regexp, cur_char_size);
3369       return REG_NOERROR;
3370     }
3371 #endif /* RE_ENABLE_I18N */
3372   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3373   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3374       || token->type == OP_OPEN_EQUIV_CLASS)
3375     return parse_bracket_symbol (elem, regexp, token);
3376   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3377     {
3378       /* A '-' must only appear as anything but a range indicator before
3379          the closing bracket.  Everything else is an error.  */
3380       re_token_t token2;
3381       (void) peek_token_bracket (&token2, regexp, syntax);
3382       if (token2.type != OP_CLOSE_BRACKET)
3383         /* The actual error value is not standardized since this whole
3384            case is undefined.  But ERANGE makes good sense.  */
3385         return REG_ERANGE;
3386     }
3387   elem->type = SB_CHAR;
3388   elem->opr.ch = token->opr.c;
3389   return REG_NOERROR;
3390 }
3391
3392 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3393    such as [:<character_class>:], [.<collating_element>.], and
3394    [=<equivalent_class>=].  */
3395
3396 static reg_errcode_t
3397 parse_bracket_symbol (elem, regexp, token)
3398      bracket_elem_t *elem;
3399      re_string_t *regexp;
3400      re_token_t *token;
3401 {
3402   unsigned char ch, delim = token->opr.c;
3403   int i = 0;
3404   if (re_string_eoi(regexp))
3405     return REG_EBRACK;
3406   for (;; ++i)
3407     {
3408       if (i >= BRACKET_NAME_BUF_SIZE)
3409         return REG_EBRACK;
3410       if (token->type == OP_OPEN_CHAR_CLASS)
3411         ch = re_string_fetch_byte_case (regexp);
3412       else
3413         ch = re_string_fetch_byte (regexp);
3414       if (re_string_eoi(regexp))
3415         return REG_EBRACK;
3416       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3417         break;
3418       elem->opr.name[i] = ch;
3419     }
3420   re_string_skip_bytes (regexp, 1);
3421   elem->opr.name[i] = '\0';
3422   switch (token->type)
3423     {
3424     case OP_OPEN_COLL_ELEM:
3425       elem->type = COLL_SYM;
3426       break;
3427     case OP_OPEN_EQUIV_CLASS:
3428       elem->type = EQUIV_CLASS;
3429       break;
3430     case OP_OPEN_CHAR_CLASS:
3431       elem->type = CHAR_CLASS;
3432       break;
3433     default:
3434       break;
3435     }
3436   return REG_NOERROR;
3437 }
3438
3439   /* Helper function for parse_bracket_exp.
3440      Build the equivalence class which is represented by NAME.
3441      The result are written to MBCSET and SBCSET.
3442      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3443      is a pointer argument sinse we may update it.  */
3444
3445 static reg_errcode_t
3446 #ifdef RE_ENABLE_I18N
3447 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3448      re_charset_t *mbcset;
3449      int *equiv_class_alloc;
3450 #else /* not RE_ENABLE_I18N */
3451 build_equiv_class (sbcset, name)
3452 #endif /* not RE_ENABLE_I18N */
3453      re_bitset_ptr_t sbcset;
3454      const unsigned char *name;
3455 {
3456 #if defined _LIBC
3457   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3458   if (nrules != 0)
3459     {
3460       const int32_t *table, *indirect;
3461       const unsigned char *weights, *extra, *cp;
3462       unsigned char char_buf[2];
3463       int32_t idx1, idx2;
3464       unsigned int ch;
3465       size_t len;
3466       /* This #include defines a local function!  */
3467 # include <locale/weight.h>
3468       /* Calculate the index for equivalence class.  */
3469       cp = name;
3470       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3471       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3472                                                _NL_COLLATE_WEIGHTMB);
3473       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3474                                                    _NL_COLLATE_EXTRAMB);
3475       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3476                                                 _NL_COLLATE_INDIRECTMB);
3477       idx1 = findidx (&cp);
3478       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3479         /* This isn't a valid character.  */
3480         return REG_ECOLLATE;
3481
3482       /* Build single byte matcing table for this equivalence class.  */
3483       char_buf[1] = (unsigned char) '\0';
3484       len = weights[idx1];
3485       for (ch = 0; ch < SBC_MAX; ++ch)
3486         {
3487           char_buf[0] = ch;
3488           cp = char_buf;
3489           idx2 = findidx (&cp);
3490 /*
3491           idx2 = table[ch];
3492 */
3493           if (idx2 == 0)
3494             /* This isn't a valid character.  */
3495             continue;
3496           if (len == weights[idx2])
3497             {
3498               int cnt = 0;
3499               while (cnt <= len &&
3500                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3501                 ++cnt;
3502
3503               if (cnt > len)
3504                 bitset_set (sbcset, ch);
3505             }
3506         }
3507       /* Check whether the array has enough space.  */
3508       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3509         {
3510           /* Not enough, realloc it.  */
3511           /* +1 in case of mbcset->nequiv_classes is 0.  */
3512           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3513           /* Use realloc since the array is NULL if *alloc == 0.  */
3514           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3515                                                    int32_t,
3516                                                    new_equiv_class_alloc);
3517           if (BE (new_equiv_classes == NULL, 0))
3518             return REG_ESPACE;
3519           mbcset->equiv_classes = new_equiv_classes;
3520           *equiv_class_alloc = new_equiv_class_alloc;
3521         }
3522       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3523     }
3524   else
3525 #endif /* _LIBC */
3526     {
3527       if (BE (strlen ((const char *) name) != 1, 0))
3528         return REG_ECOLLATE;
3529       bitset_set (sbcset, *name);
3530     }
3531   return REG_NOERROR;
3532 }
3533
3534   /* Helper function for parse_bracket_exp.
3535      Build the character class which is represented by NAME.
3536      The result are written to MBCSET and SBCSET.
3537      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3538      is a pointer argument sinse we may update it.  */
3539
3540 static reg_errcode_t
3541 #ifdef RE_ENABLE_I18N
3542 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3543      re_charset_t *mbcset;
3544      int *char_class_alloc;
3545 #else /* not RE_ENABLE_I18N */
3546 build_charclass (trans, sbcset, class_name, syntax)
3547 #endif /* not RE_ENABLE_I18N */
3548      unsigned RE_TRANSLATE_TYPE trans;
3549      re_bitset_ptr_t sbcset;
3550      const unsigned char *class_name;
3551      reg_syntax_t syntax;
3552 {
3553   int i;
3554   const char *name = (const char *) class_name;
3555
3556   /* In case of REG_ICASE "upper" and "lower" match the both of
3557      upper and lower cases.  */
3558   if ((syntax & RE_ICASE)
3559       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3560     name = "alpha";
3561
3562 #ifdef RE_ENABLE_I18N
3563   /* Check the space of the arrays.  */
3564   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3565     {
3566       /* Not enough, realloc it.  */
3567       /* +1 in case of mbcset->nchar_classes is 0.  */
3568       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3569       /* Use realloc since array is NULL if *alloc == 0.  */
3570       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3571                                                new_char_class_alloc);
3572       if (BE (new_char_classes == NULL, 0))
3573         return REG_ESPACE;
3574       mbcset->char_classes = new_char_classes;
3575       *char_class_alloc = new_char_class_alloc;
3576     }
3577   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3578 #endif /* RE_ENABLE_I18N */
3579
3580 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3581     for (i = 0; i < SBC_MAX; ++i)               \
3582       {                                         \
3583         if (ctype_func (i))                     \
3584           {                                     \
3585             int ch = trans ? trans[i] : i;      \
3586             bitset_set (sbcset, ch);            \
3587           }                                     \
3588       }
3589
3590   if (strcmp (name, "alnum") == 0)
3591     BUILD_CHARCLASS_LOOP (isalnum)
3592   else if (strcmp (name, "cntrl") == 0)
3593     BUILD_CHARCLASS_LOOP (iscntrl)
3594   else if (strcmp (name, "lower") == 0)
3595     BUILD_CHARCLASS_LOOP (islower)
3596   else if (strcmp (name, "space") == 0)
3597     BUILD_CHARCLASS_LOOP (isspace)
3598   else if (strcmp (name, "alpha") == 0)
3599     BUILD_CHARCLASS_LOOP (isalpha)
3600   else if (strcmp (name, "digit") == 0)
3601     BUILD_CHARCLASS_LOOP (isdigit)
3602   else if (strcmp (name, "print") == 0)
3603     BUILD_CHARCLASS_LOOP (isprint)
3604   else if (strcmp (name, "upper") == 0)
3605     BUILD_CHARCLASS_LOOP (isupper)
3606   else if (strcmp (name, "blank") == 0)
3607     BUILD_CHARCLASS_LOOP (isblank)
3608   else if (strcmp (name, "graph") == 0)
3609     BUILD_CHARCLASS_LOOP (isgraph)
3610   else if (strcmp (name, "punct") == 0)
3611     BUILD_CHARCLASS_LOOP (ispunct)
3612   else if (strcmp (name, "xdigit") == 0)
3613     BUILD_CHARCLASS_LOOP (isxdigit)
3614   else
3615     return REG_ECTYPE;
3616
3617   return REG_NOERROR;
3618 }
3619
3620 static bin_tree_t *
3621 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3622      re_dfa_t *dfa;
3623      unsigned RE_TRANSLATE_TYPE trans;
3624      const unsigned char *class_name;
3625      const unsigned char *extra;
3626      int non_match;
3627      reg_errcode_t *err;
3628 {
3629   re_bitset_ptr_t sbcset;
3630 #ifdef RE_ENABLE_I18N
3631   re_charset_t *mbcset;
3632   int alloc = 0;
3633 #endif /* not RE_ENABLE_I18N */
3634   reg_errcode_t ret;
3635   re_token_t br_token;
3636   bin_tree_t *tree;
3637
3638   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3639 #ifdef RE_ENABLE_I18N
3640   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3641 #endif /* RE_ENABLE_I18N */
3642
3643 #ifdef RE_ENABLE_I18N
3644   if (BE (sbcset == NULL || mbcset == NULL, 0))
3645 #else /* not RE_ENABLE_I18N */
3646   if (BE (sbcset == NULL, 0))
3647 #endif /* not RE_ENABLE_I18N */
3648     {
3649       *err = REG_ESPACE;
3650       return NULL;
3651     }
3652
3653   if (non_match)
3654     {
3655 #ifdef RE_ENABLE_I18N
3656       /*
3657       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3658         bitset_set(cset->sbcset, '\0');
3659       */
3660       mbcset->non_match = 1;
3661 #endif /* not RE_ENABLE_I18N */
3662     }
3663
3664   /* We don't care the syntax in this case.  */
3665   ret = build_charclass (trans, sbcset,
3666 #ifdef RE_ENABLE_I18N
3667                          mbcset, &alloc,
3668 #endif /* RE_ENABLE_I18N */
3669                          class_name, 0);
3670
3671   if (BE (ret != REG_NOERROR, 0))
3672     {
3673       re_free (sbcset);
3674 #ifdef RE_ENABLE_I18N
3675       free_charset (mbcset);
3676 #endif /* RE_ENABLE_I18N */
3677       *err = ret;
3678       return NULL;
3679     }
3680   /* \w match '_' also.  */
3681   for (; *extra; extra++)
3682     bitset_set (sbcset, *extra);
3683
3684   /* If it is non-matching list.  */
3685   if (non_match)
3686     bitset_not (sbcset);
3687
3688 #ifdef RE_ENABLE_I18N
3689   /* Ensure only single byte characters are set.  */
3690   if (dfa->mb_cur_max > 1)
3691     bitset_mask (sbcset, dfa->sb_char);
3692 #endif
3693
3694   /* Build a tree for simple bracket.  */
3695   br_token.type = SIMPLE_BRACKET;
3696   br_token.opr.sbcset = sbcset;
3697   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3698   if (BE (tree == NULL, 0))
3699     goto build_word_op_espace;
3700
3701 #ifdef RE_ENABLE_I18N
3702   if (dfa->mb_cur_max > 1)
3703     {
3704       bin_tree_t *mbc_tree;
3705       /* Build a tree for complex bracket.  */
3706       br_token.type = COMPLEX_BRACKET;
3707       br_token.opr.mbcset = mbcset;
3708       dfa->has_mb_node = 1;
3709       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3710       if (BE (mbc_tree == NULL, 0))
3711         goto build_word_op_espace;
3712       /* Then join them by ALT node.  */
3713       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3714       if (BE (mbc_tree != NULL, 1))
3715         return tree;
3716     }
3717   else
3718     {
3719       free_charset (mbcset);
3720       return tree;
3721     }
3722 #else /* not RE_ENABLE_I18N */
3723   return tree;
3724 #endif /* not RE_ENABLE_I18N */
3725
3726  build_word_op_espace:
3727   re_free (sbcset);
3728 #ifdef RE_ENABLE_I18N
3729   free_charset (mbcset);
3730 #endif /* RE_ENABLE_I18N */
3731   *err = REG_ESPACE;
3732   return NULL;
3733 }
3734
3735 /* This is intended for the expressions like "a{1,3}".
3736    Fetch a number from `input', and return the number.
3737    Return -1, if the number field is empty like "{,1}".
3738    Return -2, If an error is occured.  */
3739
3740 static int
3741 fetch_number (input, token, syntax)
3742      re_string_t *input;
3743      re_token_t *token;
3744      reg_syntax_t syntax;
3745 {
3746   int num = -1;
3747   unsigned char c;
3748   while (1)
3749     {
3750       fetch_token (token, input, syntax);
3751       c = token->opr.c;
3752       if (BE (token->type == END_OF_RE, 0))
3753         return -2;
3754       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3755         break;
3756       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3757              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3758       num = (num > RE_DUP_MAX) ? -2 : num;
3759     }
3760   return num;
3761 }
3762 \f
3763 #ifdef RE_ENABLE_I18N
3764 static void
3765 free_charset (re_charset_t *cset)
3766 {
3767   re_free (cset->mbchars);
3768 # ifdef _LIBC
3769   re_free (cset->coll_syms);
3770   re_free (cset->equiv_classes);
3771   re_free (cset->range_starts);
3772   re_free (cset->range_ends);
3773 # endif
3774   re_free (cset->char_classes);
3775   re_free (cset);
3776 }
3777 #endif /* RE_ENABLE_I18N */
3778 \f
3779 /* Functions for binary tree operation.  */
3780
3781 /* Create a tree node.  */
3782
3783 static bin_tree_t *
3784 create_tree (dfa, left, right, type)
3785      re_dfa_t *dfa;
3786      bin_tree_t *left;
3787      bin_tree_t *right;
3788      re_token_type_t type;
3789 {
3790   re_token_t t;
3791   t.type = type;
3792   return create_token_tree (dfa, left, right, &t);
3793 }
3794
3795 static bin_tree_t *
3796 create_token_tree (dfa, left, right, token)
3797      re_dfa_t *dfa;
3798      bin_tree_t *left;
3799      bin_tree_t *right;
3800      const re_token_t *token;
3801 {
3802   bin_tree_t *tree;
3803   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3804     {
3805       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3806
3807       if (storage == NULL)
3808         return NULL;
3809       storage->next = dfa->str_tree_storage;
3810       dfa->str_tree_storage = storage;
3811       dfa->str_tree_storage_idx = 0;
3812     }
3813   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3814
3815   tree->parent = NULL;
3816   tree->left = left;
3817   tree->right = right;
3818   tree->token = *token;
3819   tree->token.duplicated = 0;
3820   tree->token.opt_subexp = 0;
3821   tree->first = NULL;
3822   tree->next = NULL;
3823   tree->node_idx = -1;
3824
3825   if (left != NULL)
3826     left->parent = tree;
3827   if (right != NULL)
3828     right->parent = tree;
3829   return tree;
3830 }
3831
3832 /* Mark the tree SRC as an optional subexpression.
3833    To be called from preorder or postorder.  */
3834
3835 static reg_errcode_t
3836 mark_opt_subexp (extra, node)
3837      void *extra;
3838      bin_tree_t *node;
3839 {
3840   int idx = (int) (long) extra;
3841   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3842     node->token.opt_subexp = 1;
3843
3844   return REG_NOERROR;
3845 }
3846
3847 /* Free the allocated memory inside NODE. */
3848
3849 static void
3850 free_token (re_token_t *node)
3851 {
3852 #ifdef RE_ENABLE_I18N
3853   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3854     free_charset (node->opr.mbcset);
3855   else
3856 #endif /* RE_ENABLE_I18N */
3857     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3858       re_free (node->opr.sbcset);
3859 }
3860
3861 /* Worker function for tree walking.  Free the allocated memory inside NODE
3862    and its children. */
3863
3864 static reg_errcode_t
3865 free_tree (void *extra, bin_tree_t *node)
3866 {
3867   free_token (&node->token);
3868   return REG_NOERROR;
3869 }
3870
3871
3872 /* Duplicate the node SRC, and return new node.  This is a preorder
3873    visit similar to the one implemented by the generic visitor, but
3874    we need more infrastructure to maintain two parallel trees --- so,
3875    it's easier to duplicate.  */
3876
3877 static bin_tree_t *
3878 duplicate_tree (root, dfa)
3879      const bin_tree_t *root;
3880      re_dfa_t *dfa;
3881 {
3882   const bin_tree_t *node;
3883   bin_tree_t *dup_root;
3884   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3885
3886   for (node = root; ; )
3887     {
3888       /* Create a new tree and link it back to the current parent.  */
3889       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3890       if (*p_new == NULL)
3891         return NULL;
3892       (*p_new)->parent = dup_node;
3893       (*p_new)->token.duplicated = 1;
3894       dup_node = *p_new;
3895
3896       /* Go to the left node, or up and to the right.  */
3897       if (node->left)
3898         {
3899           node = node->left;
3900           p_new = &dup_node->left;
3901         }
3902       else
3903         {
3904           const bin_tree_t *prev = NULL;
3905           while (node->right == prev || node->right == NULL)
3906             {
3907               prev = node;
3908               node = node->parent;
3909               dup_node = dup_node->parent;
3910               if (!node)
3911                 return dup_root;
3912             }
3913           node = node->right;
3914           p_new = &dup_node->right;
3915         }
3916     }
3917 }