posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  51                                              int top_clone_node, int root_node,
  52                                              unsigned int constraint);
  53 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  54 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  55                                    unsigned int constraint);
  56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  58                                          int node, int root);
  59 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  60 static int fetch_number (re_string_t *input, re_token_t *token,
  61                          reg_syntax_t syntax);
  62 static void fetch_token (re_token_t *result, re_string_t *input,
  63                          reg_syntax_t syntax);
  64 static int peek_token (re_token_t *token, re_string_t *input,
  65                         reg_syntax_t syntax);
  66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  67                                reg_syntax_t syntax);
  68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  69                           reg_syntax_t syntax, reg_errcode_t *err);
  70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  74                                  re_token_t *token, reg_syntax_t syntax,
  75                                  int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  77                                      re_token_t *token, reg_syntax_t syntax,
  78                                      int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  80                                   re_token_t *token, reg_syntax_t syntax,
  81                                   int nest, reg_errcode_t *err);
  82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  83                                  re_dfa_t *dfa, re_token_t *token,
  84                                  reg_syntax_t syntax, reg_errcode_t *err);
  85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  86                                       re_token_t *token, reg_syntax_t syntax,
  87                                       reg_errcode_t *err);
  88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  89                                             re_string_t *regexp,
  90                                             re_token_t *token, int token_len,
  91                                             re_dfa_t *dfa,
  92                                             reg_syntax_t syntax,
  93                                             int accept_hyphen);
  94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  95                                           re_string_t *regexp,
  96                                           re_token_t *token);
  97 #ifndef _LIBC
  98 # ifdef RE_ENABLE_I18N
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       re_charset_t *mbcset, int *range_alloc,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              re_charset_t *mbcset,
 105                                              int *coll_sym_alloc,
 106                                              const unsigned char *name);
 107 # else /* not RE_ENABLE_I18N */
 108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 109                                       bracket_elem_t *start_elem,
 110                                       bracket_elem_t *end_elem);
 111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 112                                              const unsigned char *name);
 113 # endif /* not RE_ENABLE_I18N */
 114 #endif /* not _LIBC */
 115 #ifdef RE_ENABLE_I18N
 116 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 117                                         re_charset_t *mbcset,
 118                                         int *equiv_class_alloc,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 121                                       bitset_t sbcset,
 122                                       re_charset_t *mbcset,
 123                                       int *char_class_alloc,
 124                                       const unsigned char *class_name,
 125                                       reg_syntax_t syntax);
 126 #else  /* not RE_ENABLE_I18N */
 127 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 128                                         const unsigned char *name);
 129 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 130                                       bitset_t sbcset,
 131                                       const unsigned char *class_name,
 132                                       reg_syntax_t syntax);
 133 #endif /* not RE_ENABLE_I18N */
 134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 135                                        RE_TRANSLATE_TYPE trans,
 136                                        const unsigned char *class_name,
 137                                        const unsigned char *extra,
 138                                        int non_match, reg_errcode_t *err);
 139 static bin_tree_t *create_tree (re_dfa_t *dfa,
 140                                 bin_tree_t *left, bin_tree_t *right,
 141                                 re_token_type_t type);
 142 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 143                                       bin_tree_t *left, bin_tree_t *right,
 144                                       const re_token_t *token);
 145 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 146 static void free_token (re_token_t *node);
 147 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 148 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 149 \f
 150 /* This table gives an error message for each of the error codes listed
 151    in regex.h.  Obviously the order here has to be same as there.
 152    POSIX doesn't require that we do anything for REG_NOERROR,
 153    but why not be nice?  */
 154
 155 const char __re_error_msgid[] attribute_hidden =
 156   {
 157 #define REG_NOERROR_IDX 0
 158     gettext_noop ("Success")    /* REG_NOERROR */
 159     "\0"
 160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 161     gettext_noop ("No match")   /* REG_NOMATCH */
 162     "\0"
 163 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 164     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 165     "\0"
 166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 167     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 168     "\0"
 169 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 170     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 171     "\0"
 172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 173     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 174     "\0"
 175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 176     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 177     "\0"
 178 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 179     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 180     "\0"
 181 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 182     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 183     "\0"
 184 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 185     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 186     "\0"
 187 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 188     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 189     "\0"
 190 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 191     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 192     "\0"
 193 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 194     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 195     "\0"
 196 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 197     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 198     "\0"
 199 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 200     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 201     "\0"
 202 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 203     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 204     "\0"
 205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 206     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 207   };
 208
 209 const size_t __re_error_msgid_idx[] attribute_hidden =
 210   {
 211     REG_NOERROR_IDX,
 212     REG_NOMATCH_IDX,
 213     REG_BADPAT_IDX,
 214     REG_ECOLLATE_IDX,
 215     REG_ECTYPE_IDX,
 216     REG_EESCAPE_IDX,
 217     REG_ESUBREG_IDX,
 218     REG_EBRACK_IDX,
 219     REG_EPAREN_IDX,
 220     REG_EBRACE_IDX,
 221     REG_BADBR_IDX,
 222     REG_ERANGE_IDX,
 223     REG_ESPACE_IDX,
 224     REG_BADRPT_IDX,
 225     REG_EEND_IDX,
 226     REG_ESIZE_IDX,
 227     REG_ERPAREN_IDX
 228   };
 229 \f
 230 /* Entry points for GNU code.  */
 231
 232 /* re_compile_pattern is the GNU regular expression compiler: it
 233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234    Returns 0 if the pattern was valid, otherwise an error string.
 235
 236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237    are set in BUFP on entry.  */
 238
 239 const char *
 240 re_compile_pattern (pattern, length, bufp)
 241     const char *pattern;
 242     size_t length;
 243     struct re_pattern_buffer *bufp;
 244 {
 245   reg_errcode_t ret;
 246
 247   /* And GNU code determines whether or not to get register information
 248      by passing null for the REGS argument to re_match, etc., not by
 249      setting no_sub, unless RE_NO_SUB is set.  */
 250   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 251
 252   /* Match anchors at newline.  */
 253   bufp->newline_anchor = 1;
 254
 255   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 256
 257   if (!ret)
 258     return NULL;
 259   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 260 }
 261 #ifdef _LIBC
 262 weak_alias (__re_compile_pattern, re_compile_pattern)
 263 #endif
 264
 265 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 266    also be assigned to arbitrarily: each pattern buffer stores its own
 267    syntax, so it can be changed between regex compilations.  */
 268 /* This has no initializer because initialized variables in Emacs
 269    become read-only after dumping.  */
 270 reg_syntax_t re_syntax_options;
 271
 272
 273 /* Specify the precise syntax of regexps for compilation.  This provides
 274    for compatibility for various utilities which historically have
 275    different, incompatible syntaxes.
 276
 277    The argument SYNTAX is a bit mask comprised of the various bits
 278    defined in regex.h.  We return the old syntax.  */
 279
 280 reg_syntax_t
 281 re_set_syntax (syntax)
 282     reg_syntax_t syntax;
 283 {
 284   reg_syntax_t ret = re_syntax_options;
 285
 286   re_syntax_options = syntax;
 287   return ret;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_set_syntax, re_set_syntax)
 291 #endif
 292
 293 int
 294 re_compile_fastmap (bufp)
 295     struct re_pattern_buffer *bufp;
 296 {
 297   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 298   char *fastmap = bufp->fastmap;
 299
 300   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 301   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 302   if (dfa->init_state != dfa->init_state_word)
 303     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 304   if (dfa->init_state != dfa->init_state_nl)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 306   if (dfa->init_state != dfa->init_state_begbuf)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 308   bufp->fastmap_accurate = 1;
 309   return 0;
 310 }
 311 #ifdef _LIBC
 312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 313 #endif
 314
 315 static inline void
 316 __attribute ((always_inline))
 317 re_set_fastmap (char *fastmap, int icase, int ch)
 318 {
 319   fastmap[ch] = 1;
 320   if (icase)
 321     fastmap[tolower (ch)] = 1;
 322 }
 323
 324 /* Helper function for re_compile_fastmap.
 325    Compile fastmap for the initial_state INIT_STATE.  */
 326
 327 static void
 328 re_compile_fastmap_iter (bufp, init_state, fastmap)
 329      regex_t *bufp;
 330      const re_dfastate_t *init_state;
 331      char *fastmap;
 332 {
 333   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 334   int node_cnt;
 335   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 336   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 337     {
 338       int node = init_state->nodes.elems[node_cnt];
 339       re_token_type_t type = dfa->nodes[node].type;
 340
 341       if (type == CHARACTER)
 342         {
 343           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 344 #ifdef RE_ENABLE_I18N
 345           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 346             {
 347               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 348               wchar_t wc;
 349               mbstate_t state;
 350
 351               p = buf;
 352               *p++ = dfa->nodes[node].opr.c;
 353               while (++node < dfa->nodes_len
 354                      && dfa->nodes[node].type == CHARACTER
 355                      && dfa->nodes[node].mb_partial)
 356                 *p++ = dfa->nodes[node].opr.c;
 357               memset (&state, '\0', sizeof (state));
 358               if (mbrtowc (&wc, (const char *) buf, p - buf,
 359                            &state) == p - buf
 360                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 361                       != (size_t) -1))
 362                 re_set_fastmap (fastmap, 0, buf[0]);
 363             }
 364 #endif
 365         }
 366       else if (type == SIMPLE_BRACKET)
 367         {
 368           int i, ch;
 369           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 370             {
 371               int j;
 372               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 373               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 374                 if (w & ((bitset_word_t) 1 << j))
 375                   re_set_fastmap (fastmap, icase, ch);
 376             }
 377         }
 378 #ifdef RE_ENABLE_I18N
 379       else if (type == COMPLEX_BRACKET)
 380         {
 381           int i;
 382           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 383           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 384               || cset->nranges || cset->nchar_classes)
 385             {
 386 # ifdef _LIBC
 387               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 388                 {
 389                   /* In this case we want to catch the bytes which are
 390                      the first byte of any collation elements.
 391                      e.g. In da_DK, we want to catch 'a' since "aa"
 392                           is a valid collation element, and don't catch
 393                           'b' since 'b' is the only collation element
 394                           which starts from 'b'.  */
 395                   const int32_t *table = (const int32_t *)
 396                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 397                   for (i = 0; i < SBC_MAX; ++i)
 398                     if (table[i] < 0)
 399                       re_set_fastmap (fastmap, icase, i);
 400                 }
 401 # else
 402               if (dfa->mb_cur_max > 1)
 403                 for (i = 0; i < SBC_MAX; ++i)
 404                   if (__btowc (i) == WEOF)
 405                     re_set_fastmap (fastmap, icase, i);
 406 # endif /* not _LIBC */
 407             }
 408           for (i = 0; i < cset->nmbchars; ++i)
 409             {
 410               char buf[256];
 411               mbstate_t state;
 412               memset (&state, '\0', sizeof (state));
 413               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 414                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 415               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 416                 {
 417                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 418                       != (size_t) -1)
 419                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 420                 }
 421             }
 422         }
 423 #endif /* RE_ENABLE_I18N */
 424       else if (type == OP_PERIOD
 425 #ifdef RE_ENABLE_I18N
 426                || type == OP_UTF8_PERIOD
 427 #endif /* RE_ENABLE_I18N */
 428                || type == END_OF_RE)
 429         {
 430           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 431           if (type == END_OF_RE)
 432             bufp->can_be_null = 1;
 433           return;
 434         }
 435     }
 436 }
 437 \f
 438 /* Entry point for POSIX code.  */
 439 /* regcomp takes a regular expression as a string and compiles it.
 440
 441    PREG is a regex_t *.  We do not expect any fields to be initialized,
 442    since POSIX says we shouldn't.  Thus, we set
 443
 444      `buffer' to the compiled pattern;
 445      `used' to the length of the compiled pattern;
 446      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 447        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 448        RE_SYNTAX_POSIX_BASIC;
 449      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 450      `fastmap' to an allocated space for the fastmap;
 451      `fastmap_accurate' to zero;
 452      `re_nsub' to the number of subexpressions in PATTERN.
 453
 454    PATTERN is the address of the pattern string.
 455
 456    CFLAGS is a series of bits which affect compilation.
 457
 458      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 459      use POSIX basic syntax.
 460
 461      If REG_NEWLINE is set, then . and [^...] don't match newline.
 462      Also, regexec will try a match beginning after every newline.
 463
 464      If REG_ICASE is set, then we considers upper- and lowercase
 465      versions of letters to be equivalent when matching.
 466
 467      If REG_NOSUB is set, then when PREG is passed to regexec, that
 468      routine will report only success or failure, and nothing about the
 469      registers.
 470
 471    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 472    the return codes and their meanings.)  */
 473
 474 int
 475 regcomp (preg, pattern, cflags)
 476     regex_t *__restrict preg;
 477     const char *__restrict pattern;
 478     int cflags;
 479 {
 480   reg_errcode_t ret;
 481   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 482                          : RE_SYNTAX_POSIX_BASIC);
 483
 484   preg->buffer = NULL;
 485   preg->allocated = 0;
 486   preg->used = 0;
 487
 488   /* Try to allocate space for the fastmap.  */
 489   preg->fastmap = re_malloc (char, SBC_MAX);
 490   if (BE (preg->fastmap == NULL, 0))
 491     return REG_ESPACE;
 492
 493   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 494
 495   /* If REG_NEWLINE is set, newlines are treated differently.  */
 496   if (cflags & REG_NEWLINE)
 497     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 498       syntax &= ~RE_DOT_NEWLINE;
 499       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 500       /* It also changes the matching behavior.  */
 501       preg->newline_anchor = 1;
 502     }
 503   else
 504     preg->newline_anchor = 0;
 505   preg->no_sub = !!(cflags & REG_NOSUB);
 506   preg->translate = NULL;
 507
 508   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 509
 510   /* POSIX doesn't distinguish between an unmatched open-group and an
 511      unmatched close-group: both are REG_EPAREN.  */
 512   if (ret == REG_ERPAREN)
 513     ret = REG_EPAREN;
 514
 515   /* We have already checked preg->fastmap != NULL.  */
 516   if (BE (ret == REG_NOERROR, 1))
 517     /* Compute the fastmap now, since regexec cannot modify the pattern
 518        buffer.  This function never fails in this implementation.  */
 519     (void) re_compile_fastmap (preg);
 520   else
 521     {
 522       /* Some error occurred while compiling the expression.  */
 523       re_free (preg->fastmap);
 524       preg->fastmap = NULL;
 525     }
 526
 527   return (int) ret;
 528 }
 529 #ifdef _LIBC
 530 weak_alias (__regcomp, regcomp)
 531 #endif
 532
 533 /* Returns a message corresponding to an error code, ERRCODE, returned
 534    from either regcomp or regexec.   We don't use PREG here.  */
 535
 536 size_t
 537 regerror (errcode, preg, errbuf, errbuf_size)
 538     int errcode;
 539     const regex_t *__restrict preg;
 540     char *__restrict errbuf;
 541     size_t errbuf_size;
 542 {
 543   const char *msg;
 544   size_t msg_size;
 545
 546   if (BE (errcode < 0
 547           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 548                                / sizeof (__re_error_msgid_idx[0])), 0))
 549     /* Only error codes returned by the rest of the code should be passed
 550        to this routine.  If we are given anything else, or if other regex
 551        code generates an invalid error code, then the program has a bug.
 552        Dump core so we can fix it.  */
 553     abort ();
 554
 555   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 556
 557   msg_size = strlen (msg) + 1; /* Includes the null.  */
 558
 559   if (BE (errbuf_size != 0, 1))
 560     {
 561       if (BE (msg_size > errbuf_size, 0))
 562         {
 563 #if defined HAVE_MEMPCPY || defined _LIBC
 564           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 565 #else
 566           memcpy (errbuf, msg, errbuf_size - 1);
 567           errbuf[errbuf_size - 1] = 0;
 568 #endif
 569         }
 570       else
 571         memcpy (errbuf, msg, msg_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset_t utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 590 };
 591 #endif
 592
 593
 594 static void
 595 free_dfa_content (re_dfa_t *dfa)
 596 {
 597   int i, j;
 598
 599   if (dfa->nodes)
 600     for (i = 0; i < dfa->nodes_len; ++i)
 601       free_token (dfa->nodes + i);
 602   re_free (dfa->nexts);
 603   for (i = 0; i < dfa->nodes_len; ++i)
 604     {
 605       if (dfa->eclosures != NULL)
 606         re_node_set_free (dfa->eclosures + i);
 607       if (dfa->inveclosures != NULL)
 608         re_node_set_free (dfa->inveclosures + i);
 609       if (dfa->edests != NULL)
 610         re_node_set_free (dfa->edests + i);
 611     }
 612   re_free (dfa->edests);
 613   re_free (dfa->eclosures);
 614   re_free (dfa->inveclosures);
 615   re_free (dfa->nodes);
 616
 617   if (dfa->state_table)
 618     for (i = 0; i <= dfa->state_hash_mask; ++i)
 619       {
 620         struct re_state_table_entry *entry = dfa->state_table + i;
 621         for (j = 0; j < entry->num; ++j)
 622           {
 623             re_dfastate_t *state = entry->array[j];
 624             free_state (state);
 625           }
 626         re_free (entry->array);
 627       }
 628   re_free (dfa->state_table);
 629 #ifdef RE_ENABLE_I18N
 630   if (dfa->sb_char != utf8_sb_map)
 631     re_free (dfa->sb_char);
 632 #endif
 633   re_free (dfa->subexp_map);
 634 #ifdef DEBUG
 635   re_free (dfa->re_str);
 636 #endif
 637
 638   re_free (dfa);
 639 }
 640
 641
 642 /* Free dynamically allocated space used by PREG.  */
 643
 644 void
 645 regfree (preg)
 646     regex_t *preg;
 647 {
 648   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 649   if (BE (dfa != NULL, 1))
 650     free_dfa_content (dfa);
 651   preg->buffer = NULL;
 652   preg->allocated = 0;
 653
 654   re_free (preg->fastmap);
 655   preg->fastmap = NULL;
 656
 657   re_free (preg->translate);
 658   preg->translate = NULL;
 659 }
 660 #ifdef _LIBC
 661 weak_alias (__regfree, regfree)
 662 #endif
 663 \f
 664 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 665    them unless specifically requested.  */
 666
 667 #if defined _REGEX_RE_COMP || defined _LIBC
 668
 669 /* BSD has one and only one pattern buffer.  */
 670 static struct re_pattern_buffer re_comp_buf;
 671
 672 char *
 673 # ifdef _LIBC
 674 /* Make these definitions weak in libc, so POSIX programs can redefine
 675    these names if they don't use our functions, and still use
 676    regcomp/regexec above without link errors.  */
 677 weak_function
 678 # endif
 679 re_comp (s)
 680      const char *s;
 681 {
 682   reg_errcode_t ret;
 683   char *fastmap;
 684
 685   if (!s)
 686     {
 687       if (!re_comp_buf.buffer)
 688         return gettext ("No previous regular expression");
 689       return 0;
 690     }
 691
 692   if (re_comp_buf.buffer)
 693     {
 694       fastmap = re_comp_buf.fastmap;
 695       re_comp_buf.fastmap = NULL;
 696       __regfree (&re_comp_buf);
 697       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 698       re_comp_buf.fastmap = fastmap;
 699     }
 700
 701   if (re_comp_buf.fastmap == NULL)
 702     {
 703       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 704       if (re_comp_buf.fastmap == NULL)
 705         return (char *) gettext (__re_error_msgid
 706                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 707     }
 708
 709   /* Since `re_exec' always passes NULL for the `regs' argument, we
 710      don't need to initialize the pattern buffer fields which affect it.  */
 711
 712   /* Match anchors at newlines.  */
 713   re_comp_buf.newline_anchor = 1;
 714
 715   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 716
 717   if (!ret)
 718     return NULL;
 719
 720   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 721   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 722 }
 723
 724 #ifdef _LIBC
 725 libc_freeres_fn (free_mem)
 726 {
 727   __regfree (&re_comp_buf);
 728 }
 729 #endif
 730
 731 #endif /* _REGEX_RE_COMP */
 732 \f
 733 /* Internal entry point.
 734    Compile the regular expression PATTERN, whose length is LENGTH.
 735    SYNTAX indicate regular expression's syntax.  */
 736
 737 static reg_errcode_t
 738 re_compile_internal (preg, pattern, length, syntax)
 739      regex_t *preg;
 740      const char * pattern;
 741      size_t length;
 742      reg_syntax_t syntax;
 743 {
 744   reg_errcode_t err = REG_NOERROR;
 745   re_dfa_t *dfa;
 746   re_string_t regexp;
 747
 748   /* Initialize the pattern buffer.  */
 749   preg->fastmap_accurate = 0;
 750   preg->syntax = syntax;
 751   preg->not_bol = preg->not_eol = 0;
 752   preg->used = 0;
 753   preg->re_nsub = 0;
 754   preg->can_be_null = 0;
 755   preg->regs_allocated = REGS_UNALLOCATED;
 756
 757   /* Initialize the dfa.  */
 758   dfa = (re_dfa_t *) preg->buffer;
 759   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 760     {
 761       /* If zero allocated, but buffer is non-null, try to realloc
 762          enough space.  This loses if buffer's address is bogus, but
 763          that is the user's responsibility.  If ->buffer is NULL this
 764          is a simple allocation.  */
 765       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 766       if (dfa == NULL)
 767         return REG_ESPACE;
 768       preg->allocated = sizeof (re_dfa_t);
 769       preg->buffer = (unsigned char *) dfa;
 770     }
 771   preg->used = sizeof (re_dfa_t);
 772
 773   err = init_dfa (dfa, length);
 774   if (BE (err != REG_NOERROR, 0))
 775     {
 776       free_dfa_content (dfa);
 777       preg->buffer = NULL;
 778       preg->allocated = 0;
 779       return err;
 780     }
 781 #ifdef DEBUG
 782   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 783   dfa->re_str = re_malloc (char, length + 1);
 784   strncpy (dfa->re_str, pattern, length + 1);
 785 #endif
 786
 787   __libc_lock_init (dfa->lock);
 788
 789   err = re_string_construct (&regexp, pattern, length, preg->translate,
 790                              syntax & RE_ICASE, dfa);
 791   if (BE (err != REG_NOERROR, 0))
 792     {
 793     re_compile_internal_free_return:
 794       free_workarea_compile (preg);
 795       re_string_destruct (&regexp);
 796       free_dfa_content (dfa);
 797       preg->buffer = NULL;
 798       preg->allocated = 0;
 799       return err;
 800     }
 801
 802   /* Parse the regular expression, and build a structure tree.  */
 803   preg->re_nsub = 0;
 804   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 805   if (BE (dfa->str_tree == NULL, 0))
 806     goto re_compile_internal_free_return;
 807
 808   /* Analyze the tree and create the nfa.  */
 809   err = analyze (preg);
 810   if (BE (err != REG_NOERROR, 0))
 811     goto re_compile_internal_free_return;
 812
 813 #ifdef RE_ENABLE_I18N
 814   /* If possible, do searching in single byte encoding to speed things up.  */
 815   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 816     optimize_utf8 (dfa);
 817 #endif
 818
 819   /* Then create the initial state of the dfa.  */
 820   err = create_initial_state (dfa);
 821
 822   /* Release work areas.  */
 823   free_workarea_compile (preg);
 824   re_string_destruct (&regexp);
 825
 826   if (BE (err != REG_NOERROR, 0))
 827     {
 828       free_dfa_content (dfa);
 829       preg->buffer = NULL;
 830       preg->allocated = 0;
 831     }
 832
 833   return err;
 834 }
 835
 836 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 837    as the initial length of some arrays.  */
 838
 839 static reg_errcode_t
 840 init_dfa (dfa, pat_len)
 841      re_dfa_t *dfa;
 842      size_t pat_len;
 843 {
 844   unsigned int table_size;
 845 #ifndef _LIBC
 846   char *codeset_name;
 847 #endif
 848
 849   memset (dfa, '\0', sizeof (re_dfa_t));
 850
 851   /* Force allocation of str_tree_storage the first time.  */
 852   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 853
 854   /* Avoid overflows.  */
 855   if (pat_len == SIZE_MAX)
 856     return REG_ESPACE;
 857
 858   dfa->nodes_alloc = pat_len + 1;
 859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; ; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877 # ifdef HAVE_LANGINFO_CODESET
 878   codeset_name = nl_langinfo (CODESET);
 879 # else
 880   codeset_name = getenv ("LC_ALL");
 881   if (codeset_name == NULL || codeset_name[0] == '\0')
 882     codeset_name = getenv ("LC_CTYPE");
 883   if (codeset_name == NULL || codeset_name[0] == '\0')
 884     codeset_name = getenv ("LANG");
 885   if (codeset_name == NULL)
 886     codeset_name = "";
 887   else if (strchr (codeset_name, '.') !=  NULL)
 888     codeset_name = strchr (codeset_name, '.') + 1;
 889 # endif
 890
 891   if (strcasecmp (codeset_name, "UTF-8") == 0
 892       || strcasecmp (codeset_name, "UTF8") == 0)
 893     dfa->is_utf8 = 1;
 894
 895   /* We check exhaustively in the loop below if this charset is a
 896      superset of ASCII.  */
 897   dfa->map_notascii = 0;
 898 #endif
 899
 900 #ifdef RE_ENABLE_I18N
 901   if (dfa->mb_cur_max > 1)
 902     {
 903       if (dfa->is_utf8)
 904         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 905       else
 906         {
 907           int i, j, ch;
 908
 909           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 910           if (BE (dfa->sb_char == NULL, 0))
 911             return REG_ESPACE;
 912
 913           /* Set the bits corresponding to single byte chars.  */
 914           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 915             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 916               {
 917                 wint_t wch = __btowc (ch);
 918                 if (wch != WEOF)
 919                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 920 # ifndef _LIBC
 921                 if (isascii (ch) && wch != ch)
 922                   dfa->map_notascii = 1;
 923 # endif
 924               }
 925         }
 926     }
 927 #endif
 928
 929   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 930     return REG_ESPACE;
 931   return REG_NOERROR;
 932 }
 933
 934 /* Initialize WORD_CHAR table, which indicate which character is
 935    "word".  In this case "word" means that it is the word construction
 936    character used by some operators like "\<", "\>", etc.  */
 937
 938 static void
 939 init_word_char (dfa)
 940      re_dfa_t *dfa;
 941 {
 942   int i, j, ch;
 943   dfa->word_ops_used = 1;
 944   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 945     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 946       if (isalnum (ch) || ch == '_')
 947         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 948 }
 949
 950 /* Free the work area which are only used while compiling.  */
 951
 952 static void
 953 free_workarea_compile (preg)
 954      regex_t *preg;
 955 {
 956   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 957   bin_tree_storage_t *storage, *next;
 958   for (storage = dfa->str_tree_storage; storage; storage = next)
 959     {
 960       next = storage->next;
 961       re_free (storage);
 962     }
 963   dfa->str_tree_storage = NULL;
 964   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 965   dfa->str_tree = NULL;
 966   re_free (dfa->org_indices);
 967   dfa->org_indices = NULL;
 968 }
 969
 970 /* Create initial states for all contexts.  */
 971
 972 static reg_errcode_t
 973 create_initial_state (dfa)
 974      re_dfa_t *dfa;
 975 {
 976   int first, i;
 977   reg_errcode_t err;
 978   re_node_set init_nodes;
 979
 980   /* Initial states have the epsilon closure of the node which is
 981      the first node of the regular expression.  */
 982   first = dfa->str_tree->first->node_idx;
 983   dfa->init_node = first;
 984   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 985   if (BE (err != REG_NOERROR, 0))
 986     return err;
 987
 988   /* The back-references which are in initial states can epsilon transit,
 989      since in this case all of the subexpressions can be null.
 990      Then we add epsilon closures of the nodes which are the next nodes of
 991      the back-references.  */
 992   if (dfa->nbackref > 0)
 993     for (i = 0; i < init_nodes.nelem; ++i)
 994       {
 995         int node_idx = init_nodes.elems[i];
 996         re_token_type_t type = dfa->nodes[node_idx].type;
 997
 998         int clexp_idx;
 999         if (type != OP_BACK_REF)
1000           continue;
1001         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1002           {
1003             re_token_t *clexp_node;
1004             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005             if (clexp_node->type == OP_CLOSE_SUBEXP
1006                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007               break;
1008           }
1009         if (clexp_idx == init_nodes.nelem)
1010           continue;
1011
1012         if (type == OP_BACK_REF)
1013           {
1014             int dest_idx = dfa->edests[node_idx].elems[0];
1015             if (!re_node_set_contains (&init_nodes, dest_idx))
1016               {
1017                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018                 i = 0;
1019               }
1020           }
1021       }
1022
1023   /* It must be the first time to invoke acquire_state.  */
1024   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025   /* We don't check ERR here, since the initial state must not be NULL.  */
1026   if (BE (dfa->init_state == NULL, 0))
1027     return err;
1028   if (dfa->init_state->has_constraint)
1029     {
1030       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031                                                        CONTEXT_WORD);
1032       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033                                                      CONTEXT_NEWLINE);
1034       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035                                                          &init_nodes,
1036                                                          CONTEXT_NEWLINE
1037                                                          | CONTEXT_BEGBUF);
1038       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039               || dfa->init_state_begbuf == NULL, 0))
1040         return err;
1041     }
1042   else
1043     dfa->init_state_word = dfa->init_state_nl
1044       = dfa->init_state_begbuf = dfa->init_state;
1045
1046   re_node_set_free (&init_nodes);
1047   return REG_NOERROR;
1048 }
1049 \f
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053    DFA nodes where needed.  */
1054
1055 static void
1056 optimize_utf8 (dfa)
1057      re_dfa_t *dfa;
1058 {
1059   int node, i, mb_chars = 0, has_period = 0;
1060
1061   for (node = 0; node < dfa->nodes_len; ++node)
1062     switch (dfa->nodes[node].type)
1063       {
1064       case CHARACTER:
1065         if (dfa->nodes[node].opr.c >= 0x80)
1066           mb_chars = 1;
1067         break;
1068       case ANCHOR:
1069         switch (dfa->nodes[node].opr.idx)
1070           {
1071           case LINE_FIRST:
1072           case LINE_LAST:
1073           case BUF_FIRST:
1074           case BUF_LAST:
1075             break;
1076           default:
1077             /* Word anchors etc. cannot be handled.  */
1078             return;
1079           }
1080         break;
1081       case OP_PERIOD:
1082         has_period = 1;
1083         break;
1084       case OP_BACK_REF:
1085       case OP_ALT:
1086       case END_OF_RE:
1087       case OP_DUP_ASTERISK:
1088       case OP_OPEN_SUBEXP:
1089       case OP_CLOSE_SUBEXP:
1090         break;
1091       case COMPLEX_BRACKET:
1092         return;
1093       case SIMPLE_BRACKET:
1094         /* Just double check.  The non-ASCII range starts at 0x80.  */
1095         assert (0x80 % BITSET_WORD_BITS == 0);
1096         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1097           if (dfa->nodes[node].opr.sbcset[i])
1098             return;
1099         break;
1100       default:
1101         abort ();
1102       }
1103
1104   if (mb_chars || has_period)
1105     for (node = 0; node < dfa->nodes_len; ++node)
1106       {
1107         if (dfa->nodes[node].type == CHARACTER
1108             && dfa->nodes[node].opr.c >= 0x80)
1109           dfa->nodes[node].mb_partial = 0;
1110         else if (dfa->nodes[node].type == OP_PERIOD)
1111           dfa->nodes[node].type = OP_UTF8_PERIOD;
1112       }
1113
1114   /* The search can be in single byte locale.  */
1115   dfa->mb_cur_max = 1;
1116   dfa->is_utf8 = 0;
1117   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1118 }
1119 #endif
1120 \f
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122    "eclosure", and "inveclosure".  */
1123
1124 static reg_errcode_t
1125 analyze (preg)
1126      regex_t *preg;
1127 {
1128   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1129   reg_errcode_t ret;
1130
1131   /* Allocate arrays.  */
1132   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1133   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1134   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1135   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1136   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1137           || dfa->eclosures == NULL, 0))
1138     return REG_ESPACE;
1139
1140   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1141   if (dfa->subexp_map != NULL)
1142     {
1143       int i;
1144       for (i = 0; i < preg->re_nsub; i++)
1145         dfa->subexp_map[i] = i;
1146       preorder (dfa->str_tree, optimize_subexps, dfa);
1147       for (i = 0; i < preg->re_nsub; i++)
1148         if (dfa->subexp_map[i] != i)
1149           break;
1150       if (i == preg->re_nsub)
1151         {
1152           free (dfa->subexp_map);
1153           dfa->subexp_map = NULL;
1154         }
1155     }
1156
1157   ret = postorder (dfa->str_tree, lower_subexps, preg);
1158   if (BE (ret != REG_NOERROR, 0))
1159     return ret;
1160   ret = postorder (dfa->str_tree, calc_first, dfa);
1161   if (BE (ret != REG_NOERROR, 0))
1162     return ret;
1163   preorder (dfa->str_tree, calc_next, dfa);
1164   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1165   if (BE (ret != REG_NOERROR, 0))
1166     return ret;
1167   ret = calc_eclosure (dfa);
1168   if (BE (ret != REG_NOERROR, 0))
1169     return ret;
1170
1171   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1172      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1173   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1174       || dfa->nbackref)
1175     {
1176       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1177       if (BE (dfa->inveclosures == NULL, 0))
1178         return REG_ESPACE;
1179       ret = calc_inveclosure (dfa);
1180     }
1181
1182   return ret;
1183 }
1184
1185 /* Our parse trees are very unbalanced, so we cannot use a stack to
1186    implement parse tree visits.  Instead, we use parent pointers and
1187    some hairy code in these two functions.  */
1188 static reg_errcode_t
1189 postorder (root, fn, extra)
1190      bin_tree_t *root;
1191      reg_errcode_t (fn (void *, bin_tree_t *));
1192      void *extra;
1193 {
1194   bin_tree_t *node, *prev;
1195
1196   for (node = root; ; )
1197     {
1198       /* Descend down the tree, preferably to the left (or to the right
1199          if that's the only child).  */
1200       while (node->left || node->right)
1201         if (node->left)
1202           node = node->left;
1203         else
1204           node = node->right;
1205
1206       do
1207         {
1208           reg_errcode_t err = fn (extra, node);
1209           if (BE (err != REG_NOERROR, 0))
1210             return err;
1211           if (node->parent == NULL)
1212             return REG_NOERROR;
1213           prev = node;
1214           node = node->parent;
1215         }
1216       /* Go up while we have a node that is reached from the right.  */
1217       while (node->right == prev || node->right == NULL);
1218       node = node->right;
1219     }
1220 }
1221
1222 static reg_errcode_t
1223 preorder (root, fn, extra)
1224      bin_tree_t *root;
1225      reg_errcode_t (fn (void *, bin_tree_t *));
1226      void *extra;
1227 {
1228   bin_tree_t *node;
1229
1230   for (node = root; ; )
1231     {
1232       reg_errcode_t err = fn (extra, node);
1233       if (BE (err != REG_NOERROR, 0))
1234         return err;
1235
1236       /* Go to the left node, or up and to the right.  */
1237       if (node->left)
1238         node = node->left;
1239       else
1240         {
1241           bin_tree_t *prev = NULL;
1242           while (node->right == prev || node->right == NULL)
1243             {
1244               prev = node;
1245               node = node->parent;
1246               if (!node)
1247                 return REG_NOERROR;
1248             }
1249           node = node->right;
1250         }
1251     }
1252 }
1253
1254 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1255    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1256    backreferences as well.  Requires a preorder visit.  */
1257 static reg_errcode_t
1258 optimize_subexps (extra, node)
1259      void *extra;
1260      bin_tree_t *node;
1261 {
1262   re_dfa_t *dfa = (re_dfa_t *) extra;
1263
1264   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1265     {
1266       int idx = node->token.opr.idx;
1267       node->token.opr.idx = dfa->subexp_map[idx];
1268       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1269     }
1270
1271   else if (node->token.type == SUBEXP
1272            && node->left && node->left->token.type == SUBEXP)
1273     {
1274       int other_idx = node->left->token.opr.idx;
1275
1276       node->left = node->left->left;
1277       if (node->left)
1278         node->left->parent = node;
1279
1280       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1281       if (other_idx < BITSET_WORD_BITS)
1282           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1283     }
1284
1285   return REG_NOERROR;
1286 }
1287
1288 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1289    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1290 static reg_errcode_t
1291 lower_subexps (extra, node)
1292      void *extra;
1293      bin_tree_t *node;
1294 {
1295   regex_t *preg = (regex_t *) extra;
1296   reg_errcode_t err = REG_NOERROR;
1297
1298   if (node->left && node->left->token.type == SUBEXP)
1299     {
1300       node->left = lower_subexp (&err, preg, node->left);
1301       if (node->left)
1302         node->left->parent = node;
1303     }
1304   if (node->right && node->right->token.type == SUBEXP)
1305     {
1306       node->right = lower_subexp (&err, preg, node->right);
1307       if (node->right)
1308         node->right->parent = node;
1309     }
1310
1311   return err;
1312 }
1313
1314 static bin_tree_t *
1315 lower_subexp (err, preg, node)
1316      reg_errcode_t *err;
1317      regex_t *preg;
1318      bin_tree_t *node;
1319 {
1320   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1321   bin_tree_t *body = node->left;
1322   bin_tree_t *op, *cls, *tree1, *tree;
1323
1324   if (preg->no_sub
1325       /* We do not optimize empty subexpressions, because otherwise we may
1326          have bad CONCAT nodes with NULL children.  This is obviously not
1327          very common, so we do not lose much.  An example that triggers
1328          this case is the sed "script" /\(\)/x.  */
1329       && node->left != NULL
1330       && (node->token.opr.idx >= BITSET_WORD_BITS
1331           || !(dfa->used_bkref_map
1332                & ((bitset_word_t) 1 << node->token.opr.idx))))
1333     return node->left;
1334
1335   /* Convert the SUBEXP node to the concatenation of an
1336      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1337   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340   tree = create_tree (dfa, op, tree1, CONCAT);
1341   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1342     {
1343       *err = REG_ESPACE;
1344       return NULL;
1345     }
1346
1347   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349   return tree;
1350 }
1351
1352 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353    nodes.  Requires a postorder visit.  */
1354 static reg_errcode_t
1355 calc_first (extra, node)
1356      void *extra;
1357      bin_tree_t *node;
1358 {
1359   re_dfa_t *dfa = (re_dfa_t *) extra;
1360   if (node->token.type == CONCAT)
1361     {
1362       node->first = node->left->first;
1363       node->node_idx = node->left->node_idx;
1364     }
1365   else
1366     {
1367       node->first = node;
1368       node->node_idx = re_dfa_add_node (dfa, node->token);
1369       if (BE (node->node_idx == -1, 0))
1370         return REG_ESPACE;
1371     }
1372   return REG_NOERROR;
1373 }
1374
1375 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1376 static reg_errcode_t
1377 calc_next (extra, node)
1378      void *extra;
1379      bin_tree_t *node;
1380 {
1381   switch (node->token.type)
1382     {
1383     case OP_DUP_ASTERISK:
1384       node->left->next = node;
1385       break;
1386     case CONCAT:
1387       node->left->next = node->right->first;
1388       node->right->next = node->next;
1389       break;
1390     default:
1391       if (node->left)
1392         node->left->next = node->next;
1393       if (node->right)
1394         node->right->next = node->next;
1395       break;
1396     }
1397   return REG_NOERROR;
1398 }
1399
1400 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1401 static reg_errcode_t
1402 link_nfa_nodes (extra, node)
1403      void *extra;
1404      bin_tree_t *node;
1405 {
1406   re_dfa_t *dfa = (re_dfa_t *) extra;
1407   int idx = node->node_idx;
1408   reg_errcode_t err = REG_NOERROR;
1409
1410   switch (node->token.type)
1411     {
1412     case CONCAT:
1413       break;
1414
1415     case END_OF_RE:
1416       assert (node->next == NULL);
1417       break;
1418
1419     case OP_DUP_ASTERISK:
1420     case OP_ALT:
1421       {
1422         int left, right;
1423         dfa->has_plural_match = 1;
1424         if (node->left != NULL)
1425           left = node->left->first->node_idx;
1426         else
1427           left = node->next->node_idx;
1428         if (node->right != NULL)
1429           right = node->right->first->node_idx;
1430         else
1431           right = node->next->node_idx;
1432         assert (left > -1);
1433         assert (right > -1);
1434         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1435       }
1436       break;
1437
1438     case ANCHOR:
1439     case OP_OPEN_SUBEXP:
1440     case OP_CLOSE_SUBEXP:
1441       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442       break;
1443
1444     case OP_BACK_REF:
1445       dfa->nexts[idx] = node->next->node_idx;
1446       if (node->token.type == OP_BACK_REF)
1447         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448       break;
1449
1450     default:
1451       assert (!IS_EPSILON_NODE (node->token.type));
1452       dfa->nexts[idx] = node->next->node_idx;
1453       break;
1454     }
1455
1456   return err;
1457 }
1458
1459 /* Duplicate the epsilon closure of the node ROOT_NODE.
1460    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461    to their own constraint.  */
1462
1463 static reg_errcode_t
1464 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1465                         init_constraint)
1466      re_dfa_t *dfa;
1467      int top_org_node, top_clone_node, root_node;
1468      unsigned int init_constraint;
1469 {
1470   int org_node, clone_node, ret;
1471   unsigned int constraint = init_constraint;
1472   for (org_node = top_org_node, clone_node = top_clone_node;;)
1473     {
1474       int org_dest, clone_dest;
1475       if (dfa->nodes[org_node].type == OP_BACK_REF)
1476         {
1477           /* If the back reference epsilon-transit, its destination must
1478              also have the constraint.  Then duplicate the epsilon closure
1479              of the destination of the back reference, and store it in
1480              edests of the back reference.  */
1481           org_dest = dfa->nexts[org_node];
1482           re_node_set_empty (dfa->edests + clone_node);
1483           clone_dest = duplicate_node (dfa, org_dest, constraint);
1484           if (BE (clone_dest == -1, 0))
1485             return REG_ESPACE;
1486           dfa->nexts[clone_node] = dfa->nexts[org_node];
1487           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488           if (BE (ret < 0, 0))
1489             return REG_ESPACE;
1490         }
1491       else if (dfa->edests[org_node].nelem == 0)
1492         {
1493           /* In case of the node can't epsilon-transit, don't duplicate the
1494              destination and store the original destination as the
1495              destination of the node.  */
1496           dfa->nexts[clone_node] = dfa->nexts[org_node];
1497           break;
1498         }
1499       else if (dfa->edests[org_node].nelem == 1)
1500         {
1501           /* In case of the node can epsilon-transit, and it has only one
1502              destination.  */
1503           org_dest = dfa->edests[org_node].elems[0];
1504           re_node_set_empty (dfa->edests + clone_node);
1505           if (dfa->nodes[org_node].type == ANCHOR)
1506             {
1507               /* In case of the node has another constraint, append it.  */
1508               if (org_node == root_node && clone_node != org_node)
1509                 {
1510                   /* ...but if the node is root_node itself, it means the
1511                      epsilon closure have a loop, then tie it to the
1512                      destination of the root_node.  */
1513                   ret = re_node_set_insert (dfa->edests + clone_node,
1514                                             org_dest);
1515                   if (BE (ret < 0, 0))
1516                     return REG_ESPACE;
1517                   break;
1518                 }
1519               constraint |= dfa->nodes[org_node].opr.ctx_type;
1520             }
1521           clone_dest = duplicate_node (dfa, org_dest, constraint);
1522           if (BE (clone_dest == -1, 0))
1523             return REG_ESPACE;
1524           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525           if (BE (ret < 0, 0))
1526             return REG_ESPACE;
1527         }
1528       else /* dfa->edests[org_node].nelem == 2 */
1529         {
1530           /* In case of the node can epsilon-transit, and it has two
1531              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1532           org_dest = dfa->edests[org_node].elems[0];
1533           re_node_set_empty (dfa->edests + clone_node);
1534           /* Search for a duplicated node which satisfies the constraint.  */
1535           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536           if (clone_dest == -1)
1537             {
1538               /* There are no such a duplicated node, create a new one.  */
1539               reg_errcode_t err;
1540               clone_dest = duplicate_node (dfa, org_dest, constraint);
1541               if (BE (clone_dest == -1, 0))
1542                 return REG_ESPACE;
1543               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (ret < 0, 0))
1545                 return REG_ESPACE;
1546               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547                                             root_node, constraint);
1548               if (BE (err != REG_NOERROR, 0))
1549                 return err;
1550             }
1551           else
1552             {
1553               /* There are a duplicated node which satisfy the constraint,
1554                  use it to avoid infinite loop.  */
1555               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556               if (BE (ret < 0, 0))
1557                 return REG_ESPACE;
1558             }
1559
1560           org_dest = dfa->edests[org_node].elems[1];
1561           clone_dest = duplicate_node (dfa, org_dest, constraint);
1562           if (BE (clone_dest == -1, 0))
1563             return REG_ESPACE;
1564           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565           if (BE (ret < 0, 0))
1566             return REG_ESPACE;
1567         }
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576
1577 static int
1578 search_duplicated_node (dfa, org_node, constraint)
1579      const re_dfa_t *dfa;
1580      int org_node;
1581      unsigned int constraint;
1582 {
1583   int idx;
1584   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585     {
1586       if (org_node == dfa->org_indices[idx]
1587           && constraint == dfa->nodes[idx].constraint)
1588         return idx; /* Found.  */
1589     }
1590   return -1; /* Not found.  */
1591 }
1592
1593 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594    Return the index of the new node, or -1 if insufficient storage is
1595    available.  */
1596
1597 static int
1598 duplicate_node (dfa, org_idx, constraint)
1599      re_dfa_t *dfa;
1600      int org_idx;
1601      unsigned int constraint;
1602 {
1603   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604   if (BE (dup_idx != -1, 1))
1605     {
1606       dfa->nodes[dup_idx].constraint = constraint;
1607       if (dfa->nodes[org_idx].type == ANCHOR)
1608         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1609       dfa->nodes[dup_idx].duplicated = 1;
1610
1611       /* Store the index of the original node.  */
1612       dfa->org_indices[dup_idx] = org_idx;
1613     }
1614   return dup_idx;
1615 }
1616
1617 static reg_errcode_t
1618 calc_inveclosure (dfa)
1619      re_dfa_t *dfa;
1620 {
1621   int src, idx, ret;
1622   for (idx = 0; idx < dfa->nodes_len; ++idx)
1623     re_node_set_init_empty (dfa->inveclosures + idx);
1624
1625   for (src = 0; src < dfa->nodes_len; ++src)
1626     {
1627       int *elems = dfa->eclosures[src].elems;
1628       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1629         {
1630           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1631           if (BE (ret == -1, 0))
1632             return REG_ESPACE;
1633         }
1634     }
1635
1636   return REG_NOERROR;
1637 }
1638
1639 /* Calculate "eclosure" for all the node in DFA.  */
1640
1641 static reg_errcode_t
1642 calc_eclosure (dfa)
1643      re_dfa_t *dfa;
1644 {
1645   int node_idx, incomplete;
1646 #ifdef DEBUG
1647   assert (dfa->nodes_len > 0);
1648 #endif
1649   incomplete = 0;
1650   /* For each nodes, calculate epsilon closure.  */
1651   for (node_idx = 0; ; ++node_idx)
1652     {
1653       reg_errcode_t err;
1654       re_node_set eclosure_elem;
1655       if (node_idx == dfa->nodes_len)
1656         {
1657           if (!incomplete)
1658             break;
1659           incomplete = 0;
1660           node_idx = 0;
1661         }
1662
1663 #ifdef DEBUG
1664       assert (dfa->eclosures[node_idx].nelem != -1);
1665 #endif
1666
1667       /* If we have already calculated, skip it.  */
1668       if (dfa->eclosures[node_idx].nelem != 0)
1669         continue;
1670       /* Calculate epsilon closure of `node_idx'.  */
1671       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672       if (BE (err != REG_NOERROR, 0))
1673         return err;
1674
1675       if (dfa->eclosures[node_idx].nelem == 0)
1676         {
1677           incomplete = 1;
1678           re_node_set_free (&eclosure_elem);
1679         }
1680     }
1681   return REG_NOERROR;
1682 }
1683
1684 /* Calculate epsilon closure of NODE.  */
1685
1686 static reg_errcode_t
1687 calc_eclosure_iter (new_set, dfa, node, root)
1688      re_node_set *new_set;
1689      re_dfa_t *dfa;
1690      int node, root;
1691 {
1692   reg_errcode_t err;
1693   unsigned int constraint;
1694   int i, incomplete;
1695   re_node_set eclosure;
1696   incomplete = 0;
1697   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698   if (BE (err != REG_NOERROR, 0))
1699     return err;
1700
1701   /* This indicates that we are calculating this node now.
1702      We reference this value to avoid infinite loop.  */
1703   dfa->eclosures[node].nelem = -1;
1704
1705   constraint = ((dfa->nodes[node].type == ANCHOR)
1706                 ? dfa->nodes[node].opr.ctx_type : 0);
1707   /* If the current node has constraints, duplicate all nodes.
1708      Since they must inherit the constraints.  */
1709   if (constraint
1710       && dfa->edests[node].nelem
1711       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1712     {
1713       int org_node, cur_node;
1714       org_node = cur_node = node;
1715       err = duplicate_node_closure (dfa, node, node, node, constraint);
1716       if (BE (err != REG_NOERROR, 0))
1717         return err;
1718     }
1719
1720   /* Expand each epsilon destination nodes.  */
1721   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722     for (i = 0; i < dfa->edests[node].nelem; ++i)
1723       {
1724         re_node_set eclosure_elem;
1725         int edest = dfa->edests[node].elems[i];
1726         /* If calculating the epsilon closure of `edest' is in progress,
1727            return intermediate result.  */
1728         if (dfa->eclosures[edest].nelem == -1)
1729           {
1730             incomplete = 1;
1731             continue;
1732           }
1733         /* If we haven't calculated the epsilon closure of `edest' yet,
1734            calculate now. Otherwise use calculated epsilon closure.  */
1735         if (dfa->eclosures[edest].nelem == 0)
1736           {
1737             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1738             if (BE (err != REG_NOERROR, 0))
1739               return err;
1740           }
1741         else
1742           eclosure_elem = dfa->eclosures[edest];
1743         /* Merge the epsilon closure of `edest'.  */
1744         re_node_set_merge (&eclosure, &eclosure_elem);
1745         /* If the epsilon closure of `edest' is incomplete,
1746            the epsilon closure of this node is also incomplete.  */
1747         if (dfa->eclosures[edest].nelem == 0)
1748           {
1749             incomplete = 1;
1750             re_node_set_free (&eclosure_elem);
1751           }
1752       }
1753
1754   /* Epsilon closures include itself.  */
1755   re_node_set_insert (&eclosure, node);
1756   if (incomplete && !root)
1757     dfa->eclosures[node].nelem = 0;
1758   else
1759     dfa->eclosures[node] = eclosure;
1760   *new_set = eclosure;
1761   return REG_NOERROR;
1762 }
1763 \f
1764 /* Functions for token which are used in the parser.  */
1765
1766 /* Fetch a token from INPUT.
1767    We must not use this function inside bracket expressions.  */
1768
1769 static void
1770 fetch_token (result, input, syntax)
1771      re_token_t *result;
1772      re_string_t *input;
1773      reg_syntax_t syntax;
1774 {
1775   re_string_skip_bytes (input, peek_token (result, input, syntax));
1776 }
1777
1778 /* Peek a token from INPUT, and return the length of the token.
1779    We must not use this function inside bracket expressions.  */
1780
1781 static int
1782 peek_token (token, input, syntax)
1783      re_token_t *token;
1784      re_string_t *input;
1785      reg_syntax_t syntax;
1786 {
1787   unsigned char c;
1788
1789   if (re_string_eoi (input))
1790     {
1791       token->type = END_OF_RE;
1792       return 0;
1793     }
1794
1795   c = re_string_peek_byte (input, 0);
1796   token->opr.c = c;
1797
1798   token->word_char = 0;
1799 #ifdef RE_ENABLE_I18N
1800   token->mb_partial = 0;
1801   if (input->mb_cur_max > 1 &&
1802       !re_string_first_byte (input, re_string_cur_idx (input)))
1803     {
1804       token->type = CHARACTER;
1805       token->mb_partial = 1;
1806       return 1;
1807     }
1808 #endif
1809   if (c == '\\')
1810     {
1811       unsigned char c2;
1812       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1813         {
1814           token->type = BACK_SLASH;
1815           return 1;
1816         }
1817
1818       c2 = re_string_peek_byte_case (input, 1);
1819       token->opr.c = c2;
1820       token->type = CHARACTER;
1821 #ifdef RE_ENABLE_I18N
1822       if (input->mb_cur_max > 1)
1823         {
1824           wint_t wc = re_string_wchar_at (input,
1825                                           re_string_cur_idx (input) + 1);
1826           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1827         }
1828       else
1829 #endif
1830         token->word_char = IS_WORD_CHAR (c2) != 0;
1831
1832       switch (c2)
1833         {
1834         case '|':
1835           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1836             token->type = OP_ALT;
1837           break;
1838         case '1': case '2': case '3': case '4': case '5':
1839         case '6': case '7': case '8': case '9':
1840           if (!(syntax & RE_NO_BK_REFS))
1841             {
1842               token->type = OP_BACK_REF;
1843               token->opr.idx = c2 - '1';
1844             }
1845           break;
1846         case '<':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = WORD_FIRST;
1851             }
1852           break;
1853         case '>':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             {
1856               token->type = ANCHOR;
1857               token->opr.ctx_type = WORD_LAST;
1858             }
1859           break;
1860         case 'b':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             {
1863               token->type = ANCHOR;
1864               token->opr.ctx_type = WORD_DELIM;
1865             }
1866           break;
1867         case 'B':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             {
1870               token->type = ANCHOR;
1871               token->opr.ctx_type = NOT_WORD_DELIM;
1872             }
1873           break;
1874         case 'w':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_WORD;
1877           break;
1878         case 'W':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             token->type = OP_NOTWORD;
1881           break;
1882         case 's':
1883           if (!(syntax & RE_NO_GNU_OPS))
1884             token->type = OP_SPACE;
1885           break;
1886         case 'S':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             token->type = OP_NOTSPACE;
1889           break;
1890         case '`':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             {
1893               token->type = ANCHOR;
1894               token->opr.ctx_type = BUF_FIRST;
1895             }
1896           break;
1897         case '\'':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             {
1900               token->type = ANCHOR;
1901               token->opr.ctx_type = BUF_LAST;
1902             }
1903           break;
1904         case '(':
1905           if (!(syntax & RE_NO_BK_PARENS))
1906             token->type = OP_OPEN_SUBEXP;
1907           break;
1908         case ')':
1909           if (!(syntax & RE_NO_BK_PARENS))
1910             token->type = OP_CLOSE_SUBEXP;
1911           break;
1912         case '+':
1913           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914             token->type = OP_DUP_PLUS;
1915           break;
1916         case '?':
1917           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918             token->type = OP_DUP_QUESTION;
1919           break;
1920         case '{':
1921           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922             token->type = OP_OPEN_DUP_NUM;
1923           break;
1924         case '}':
1925           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926             token->type = OP_CLOSE_DUP_NUM;
1927           break;
1928         default:
1929           break;
1930         }
1931       return 2;
1932     }
1933
1934   token->type = CHARACTER;
1935 #ifdef RE_ENABLE_I18N
1936   if (input->mb_cur_max > 1)
1937     {
1938       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1939       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1940     }
1941   else
1942 #endif
1943     token->word_char = IS_WORD_CHAR (token->opr.c);
1944
1945   switch (c)
1946     {
1947     case '\n':
1948       if (syntax & RE_NEWLINE_ALT)
1949         token->type = OP_ALT;
1950       break;
1951     case '|':
1952       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1953         token->type = OP_ALT;
1954       break;
1955     case '*':
1956       token->type = OP_DUP_ASTERISK;
1957       break;
1958     case '+':
1959       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960         token->type = OP_DUP_PLUS;
1961       break;
1962     case '?':
1963       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964         token->type = OP_DUP_QUESTION;
1965       break;
1966     case '{':
1967       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968         token->type = OP_OPEN_DUP_NUM;
1969       break;
1970     case '}':
1971       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972         token->type = OP_CLOSE_DUP_NUM;
1973       break;
1974     case '(':
1975       if (syntax & RE_NO_BK_PARENS)
1976         token->type = OP_OPEN_SUBEXP;
1977       break;
1978     case ')':
1979       if (syntax & RE_NO_BK_PARENS)
1980         token->type = OP_CLOSE_SUBEXP;
1981       break;
1982     case '[':
1983       token->type = OP_OPEN_BRACKET;
1984       break;
1985     case '.':
1986       token->type = OP_PERIOD;
1987       break;
1988     case '^':
1989       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1990           re_string_cur_idx (input) != 0)
1991         {
1992           char prev = re_string_peek_byte (input, -1);
1993           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1994             break;
1995         }
1996       token->type = ANCHOR;
1997       token->opr.ctx_type = LINE_FIRST;
1998       break;
1999     case '$':
2000       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2001           re_string_cur_idx (input) + 1 != re_string_length (input))
2002         {
2003           re_token_t next;
2004           re_string_skip_bytes (input, 1);
2005           peek_token (&next, input, syntax);
2006           re_string_skip_bytes (input, -1);
2007           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2008             break;
2009         }
2010       token->type = ANCHOR;
2011       token->opr.ctx_type = LINE_LAST;
2012       break;
2013     default:
2014       break;
2015     }
2016   return 1;
2017 }
2018
2019 /* Peek a token from INPUT, and return the length of the token.
2020    We must not use this function out of bracket expressions.  */
2021
2022 static int
2023 peek_token_bracket (token, input, syntax)
2024      re_token_t *token;
2025      re_string_t *input;
2026      reg_syntax_t syntax;
2027 {
2028   unsigned char c;
2029   if (re_string_eoi (input))
2030     {
2031       token->type = END_OF_RE;
2032       return 0;
2033     }
2034   c = re_string_peek_byte (input, 0);
2035   token->opr.c = c;
2036
2037 #ifdef RE_ENABLE_I18N
2038   if (input->mb_cur_max > 1 &&
2039       !re_string_first_byte (input, re_string_cur_idx (input)))
2040     {
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044 #endif /* RE_ENABLE_I18N */
2045
2046   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2047       && re_string_cur_idx (input) + 1 < re_string_length (input))
2048     {
2049       /* In this case, '\' escape a character.  */
2050       unsigned char c2;
2051       re_string_skip_bytes (input, 1);
2052       c2 = re_string_peek_byte (input, 0);
2053       token->opr.c = c2;
2054       token->type = CHARACTER;
2055       return 1;
2056     }
2057   if (c == '[') /* '[' is a special char in a bracket exps.  */
2058     {
2059       unsigned char c2;
2060       int token_len;
2061       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2062         c2 = re_string_peek_byte (input, 1);
2063       else
2064         c2 = 0;
2065       token->opr.c = c2;
2066       token_len = 2;
2067       switch (c2)
2068         {
2069         case '.':
2070           token->type = OP_OPEN_COLL_ELEM;
2071           break;
2072         case '=':
2073           token->type = OP_OPEN_EQUIV_CLASS;
2074           break;
2075         case ':':
2076           if (syntax & RE_CHAR_CLASSES)
2077             {
2078               token->type = OP_OPEN_CHAR_CLASS;
2079               break;
2080             }
2081           /* else fall through.  */
2082         default:
2083           token->type = CHARACTER;
2084           token->opr.c = c;
2085           token_len = 1;
2086           break;
2087         }
2088       return token_len;
2089     }
2090   switch (c)
2091     {
2092     case '-':
2093       token->type = OP_CHARSET_RANGE;
2094       break;
2095     case ']':
2096       token->type = OP_CLOSE_BRACKET;
2097       break;
2098     case '^':
2099       token->type = OP_NON_MATCH_LIST;
2100       break;
2101     default:
2102       token->type = CHARACTER;
2103     }
2104   return 1;
2105 }
2106 \f
2107 /* Functions for parser.  */
2108
2109 /* Entry point of the parser.
2110    Parse the regular expression REGEXP and return the structure tree.
2111    If an error is occured, ERR is set by error code, and return NULL.
2112    This function build the following tree, from regular expression <reg_exp>:
2113            CAT
2114            / \
2115           /   \
2116    <reg_exp>  EOR
2117
2118    CAT means concatenation.
2119    EOR means end of regular expression.  */
2120
2121 static bin_tree_t *
2122 parse (regexp, preg, syntax, err)
2123      re_string_t *regexp;
2124      regex_t *preg;
2125      reg_syntax_t syntax;
2126      reg_errcode_t *err;
2127 {
2128   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129   bin_tree_t *tree, *eor, *root;
2130   re_token_t current_token;
2131   dfa->syntax = syntax;
2132   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2133   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2134   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2135     return NULL;
2136   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2137   if (tree != NULL)
2138     root = create_tree (dfa, tree, eor, CONCAT);
2139   else
2140     root = eor;
2141   if (BE (eor == NULL || root == NULL, 0))
2142     {
2143       *err = REG_ESPACE;
2144       return NULL;
2145     }
2146   return root;
2147 }
2148
2149 /* This function build the following tree, from regular expression
2150    <branch1>|<branch2>:
2151            ALT
2152            / \
2153           /   \
2154    <branch1> <branch2>
2155
2156    ALT means alternative, which represents the operator `|'.  */
2157
2158 static bin_tree_t *
2159 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2160      re_string_t *regexp;
2161      regex_t *preg;
2162      re_token_t *token;
2163      reg_syntax_t syntax;
2164      int nest;
2165      reg_errcode_t *err;
2166 {
2167   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2168   bin_tree_t *tree, *branch = NULL;
2169   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2170   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2171     return NULL;
2172
2173   while (token->type == OP_ALT)
2174     {
2175       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2176       if (token->type != OP_ALT && token->type != END_OF_RE
2177           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2178         {
2179           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2180           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2181             return NULL;
2182         }
2183       else
2184         branch = NULL;
2185       tree = create_tree (dfa, tree, branch, OP_ALT);
2186       if (BE (tree == NULL, 0))
2187         {
2188           *err = REG_ESPACE;
2189           return NULL;
2190         }
2191     }
2192   return tree;
2193 }
2194
2195 /* This function build the following tree, from regular expression
2196    <exp1><exp2>:
2197         CAT
2198         / \
2199        /   \
2200    <exp1> <exp2>
2201
2202    CAT means concatenation.  */
2203
2204 static bin_tree_t *
2205 parse_branch (regexp, preg, token, syntax, nest, err)
2206      re_string_t *regexp;
2207      regex_t *preg;
2208      re_token_t *token;
2209      reg_syntax_t syntax;
2210      int nest;
2211      reg_errcode_t *err;
2212 {
2213   bin_tree_t *tree, *exp;
2214   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2215   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2216   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217     return NULL;
2218
2219   while (token->type != OP_ALT && token->type != END_OF_RE
2220          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2221     {
2222       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2223       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2224         {
2225           return NULL;
2226         }
2227       if (tree != NULL && exp != NULL)
2228         {
2229           tree = create_tree (dfa, tree, exp, CONCAT);
2230           if (tree == NULL)
2231             {
2232               *err = REG_ESPACE;
2233               return NULL;
2234             }
2235         }
2236       else if (tree == NULL)
2237         tree = exp;
2238       /* Otherwise exp == NULL, we don't need to create new tree.  */
2239     }
2240   return tree;
2241 }
2242
2243 /* This function build the following tree, from regular expression a*:
2244          *
2245          |
2246          a
2247 */
2248
2249 static bin_tree_t *
2250 parse_expression (regexp, preg, token, syntax, nest, err)
2251      re_string_t *regexp;
2252      regex_t *preg;
2253      re_token_t *token;
2254      reg_syntax_t syntax;
2255      int nest;
2256      reg_errcode_t *err;
2257 {
2258   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2259   bin_tree_t *tree;
2260   switch (token->type)
2261     {
2262     case CHARACTER:
2263       tree = create_token_tree (dfa, NULL, NULL, token);
2264       if (BE (tree == NULL, 0))
2265         {
2266           *err = REG_ESPACE;
2267           return NULL;
2268         }
2269 #ifdef RE_ENABLE_I18N
2270       if (dfa->mb_cur_max > 1)
2271         {
2272           while (!re_string_eoi (regexp)
2273                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2274             {
2275               bin_tree_t *mbc_remain;
2276               fetch_token (token, regexp, syntax);
2277               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2278               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2279               if (BE (mbc_remain == NULL || tree == NULL, 0))
2280                 {
2281                   *err = REG_ESPACE;
2282                   return NULL;
2283                 }
2284             }
2285         }
2286 #endif
2287       break;
2288     case OP_OPEN_SUBEXP:
2289       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291         return NULL;
2292       break;
2293     case OP_OPEN_BRACKET:
2294       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2295       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2296         return NULL;
2297       break;
2298     case OP_BACK_REF:
2299       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2300         {
2301           *err = REG_ESUBREG;
2302           return NULL;
2303         }
2304       dfa->used_bkref_map |= 1 << token->opr.idx;
2305       tree = create_token_tree (dfa, NULL, NULL, token);
2306       if (BE (tree == NULL, 0))
2307         {
2308           *err = REG_ESPACE;
2309           return NULL;
2310         }
2311       ++dfa->nbackref;
2312       dfa->has_mb_node = 1;
2313       break;
2314     case OP_OPEN_DUP_NUM:
2315       if (syntax & RE_CONTEXT_INVALID_DUP)
2316         {
2317           *err = REG_BADRPT;
2318           return NULL;
2319         }
2320       /* FALLTHROUGH */
2321     case OP_DUP_ASTERISK:
2322     case OP_DUP_PLUS:
2323     case OP_DUP_QUESTION:
2324       if (syntax & RE_CONTEXT_INVALID_OPS)
2325         {
2326           *err = REG_BADRPT;
2327           return NULL;
2328         }
2329       else if (syntax & RE_CONTEXT_INDEP_OPS)
2330         {
2331           fetch_token (token, regexp, syntax);
2332           return parse_expression (regexp, preg, token, syntax, nest, err);
2333         }
2334       /* else fall through  */
2335     case OP_CLOSE_SUBEXP:
2336       if ((token->type == OP_CLOSE_SUBEXP) &&
2337           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338         {
2339           *err = REG_ERPAREN;
2340           return NULL;
2341         }
2342       /* else fall through  */
2343     case OP_CLOSE_DUP_NUM:
2344       /* We treat it as a normal character.  */
2345
2346       /* Then we can these characters as normal characters.  */
2347       token->type = CHARACTER;
2348       /* mb_partial and word_char bits should be initialized already
2349          by peek_token.  */
2350       tree = create_token_tree (dfa, NULL, NULL, token);
2351       if (BE (tree == NULL, 0))
2352         {
2353           *err = REG_ESPACE;
2354           return NULL;
2355         }
2356       break;
2357     case ANCHOR:
2358       if ((token->opr.ctx_type
2359            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360           && dfa->word_ops_used == 0)
2361         init_word_char (dfa);
2362       if (token->opr.ctx_type == WORD_DELIM
2363           || token->opr.ctx_type == NOT_WORD_DELIM)
2364         {
2365           bin_tree_t *tree_first, *tree_last;
2366           if (token->opr.ctx_type == WORD_DELIM)
2367             {
2368               token->opr.ctx_type = WORD_FIRST;
2369               tree_first = create_token_tree (dfa, NULL, NULL, token);
2370               token->opr.ctx_type = WORD_LAST;
2371             }
2372           else
2373             {
2374               token->opr.ctx_type = INSIDE_WORD;
2375               tree_first = create_token_tree (dfa, NULL, NULL, token);
2376               token->opr.ctx_type = INSIDE_NOTWORD;
2377             }
2378           tree_last = create_token_tree (dfa, NULL, NULL, token);
2379           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2381             {
2382               *err = REG_ESPACE;
2383               return NULL;
2384             }
2385         }
2386       else
2387         {
2388           tree = create_token_tree (dfa, NULL, NULL, token);
2389           if (BE (tree == NULL, 0))
2390             {
2391               *err = REG_ESPACE;
2392               return NULL;
2393             }
2394         }
2395       /* We must return here, since ANCHORs can't be followed
2396          by repetition operators.
2397          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2398              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2399       fetch_token (token, regexp, syntax);
2400       return tree;
2401     case OP_PERIOD:
2402       tree = create_token_tree (dfa, NULL, NULL, token);
2403       if (BE (tree == NULL, 0))
2404         {
2405           *err = REG_ESPACE;
2406           return NULL;
2407         }
2408       if (dfa->mb_cur_max > 1)
2409         dfa->has_mb_node = 1;
2410       break;
2411     case OP_WORD:
2412     case OP_NOTWORD:
2413       tree = build_charclass_op (dfa, regexp->trans,
2414                                  (const unsigned char *) "alnum",
2415                                  (const unsigned char *) "_",
2416                                  token->type == OP_NOTWORD, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418         return NULL;
2419       break;
2420     case OP_SPACE:
2421     case OP_NOTSPACE:
2422       tree = build_charclass_op (dfa, regexp->trans,
2423                                  (const unsigned char *) "space",
2424                                  (const unsigned char *) "",
2425                                  token->type == OP_NOTSPACE, err);
2426       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427         return NULL;
2428       break;
2429     case OP_ALT:
2430     case END_OF_RE:
2431       return NULL;
2432     case BACK_SLASH:
2433       *err = REG_EESCAPE;
2434       return NULL;
2435     default:
2436       /* Must not happen?  */
2437 #ifdef DEBUG
2438       assert (0);
2439 #endif
2440       return NULL;
2441     }
2442   fetch_token (token, regexp, syntax);
2443
2444   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446     {
2447       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2448       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2449         return NULL;
2450       /* In BRE consecutive duplications are not allowed.  */
2451       if ((syntax & RE_CONTEXT_INVALID_DUP)
2452           && (token->type == OP_DUP_ASTERISK
2453               || token->type == OP_OPEN_DUP_NUM))
2454         {
2455           *err = REG_BADRPT;
2456           return NULL;
2457         }
2458     }
2459
2460   return tree;
2461 }
2462
2463 /* This function build the following tree, from regular expression
2464    (<reg_exp>):
2465          SUBEXP
2466             |
2467         <reg_exp>
2468 */
2469
2470 static bin_tree_t *
2471 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2472      re_string_t *regexp;
2473      regex_t *preg;
2474      re_token_t *token;
2475      reg_syntax_t syntax;
2476      int nest;
2477      reg_errcode_t *err;
2478 {
2479   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2480   bin_tree_t *tree;
2481   size_t cur_nsub;
2482   cur_nsub = preg->re_nsub++;
2483
2484   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2485
2486   /* The subexpression may be a null string.  */
2487   if (token->type == OP_CLOSE_SUBEXP)
2488     tree = NULL;
2489   else
2490     {
2491       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2492       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2493         *err = REG_EPAREN;
2494       if (BE (*err != REG_NOERROR, 0))
2495         return NULL;
2496     }
2497
2498   if (cur_nsub <= '9' - '1')
2499     dfa->completed_bkref_map |= 1 << cur_nsub;
2500
2501   tree = create_tree (dfa, tree, NULL, SUBEXP);
2502   if (BE (tree == NULL, 0))
2503     {
2504       *err = REG_ESPACE;
2505       return NULL;
2506     }
2507   tree->token.opr.idx = cur_nsub;
2508   return tree;
2509 }
2510
2511 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2512
2513 static bin_tree_t *
2514 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2515      bin_tree_t *elem;
2516      re_string_t *regexp;
2517      re_dfa_t *dfa;
2518      re_token_t *token;
2519      reg_syntax_t syntax;
2520      reg_errcode_t *err;
2521 {
2522   bin_tree_t *tree = NULL, *old_tree = NULL;
2523   int i, start, end, start_idx = re_string_cur_idx (regexp);
2524   re_token_t start_token = *token;
2525
2526   if (token->type == OP_OPEN_DUP_NUM)
2527     {
2528       end = 0;
2529       start = fetch_number (regexp, token, syntax);
2530       if (start == -1)
2531         {
2532           if (token->type == CHARACTER && token->opr.c == ',')
2533             start = 0; /* We treat "{,m}" as "{0,m}".  */
2534           else
2535             {
2536               *err = REG_BADBR; /* <re>{} is invalid.  */
2537               return NULL;
2538             }
2539         }
2540       if (BE (start != -2, 1))
2541         {
2542           /* We treat "{n}" as "{n,n}".  */
2543           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2544                  : ((token->type == CHARACTER && token->opr.c == ',')
2545                     ? fetch_number (regexp, token, syntax) : -2));
2546         }
2547       if (BE (start == -2 || end == -2, 0))
2548         {
2549           /* Invalid sequence.  */
2550           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2551             {
2552               if (token->type == END_OF_RE)
2553                 *err = REG_EBRACE;
2554               else
2555                 *err = REG_BADBR;
2556
2557               return NULL;
2558             }
2559
2560           /* If the syntax bit is set, rollback.  */
2561           re_string_set_index (regexp, start_idx);
2562           *token = start_token;
2563           token->type = CHARACTER;
2564           /* mb_partial and word_char bits should be already initialized by
2565              peek_token.  */
2566           return elem;
2567         }
2568
2569       if (BE (end != -1 && start > end, 0))
2570         {
2571           /* First number greater than second.  */
2572           *err = REG_BADBR;
2573           return NULL;
2574         }
2575     }
2576   else
2577     {
2578       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2579       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2580     }
2581
2582   fetch_token (token, regexp, syntax);
2583
2584   if (BE (elem == NULL, 0))
2585     return NULL;
2586   if (BE (start == 0 && end == 0, 0))
2587     {
2588       postorder (elem, free_tree, NULL);
2589       return NULL;
2590     }
2591
2592   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2593   if (BE (start > 0, 0))
2594     {
2595       tree = elem;
2596       for (i = 2; i <= start; ++i)
2597         {
2598           elem = duplicate_tree (elem, dfa);
2599           tree = create_tree (dfa, tree, elem, CONCAT);
2600           if (BE (elem == NULL || tree == NULL, 0))
2601             goto parse_dup_op_espace;
2602         }
2603
2604       if (start == end)
2605         return tree;
2606
2607       /* Duplicate ELEM before it is marked optional.  */
2608       elem = duplicate_tree (elem, dfa);
2609       old_tree = tree;
2610     }
2611   else
2612     old_tree = NULL;
2613
2614   if (elem->token.type == SUBEXP)
2615     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2616
2617   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2618   if (BE (tree == NULL, 0))
2619     goto parse_dup_op_espace;
2620
2621   /* This loop is actually executed only when end != -1,
2622      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2623      already created the start+1-th copy.  */
2624   for (i = start + 2; i <= end; ++i)
2625     {
2626       elem = duplicate_tree (elem, dfa);
2627       tree = create_tree (dfa, tree, elem, CONCAT);
2628       if (BE (elem == NULL || tree == NULL, 0))
2629         goto parse_dup_op_espace;
2630
2631       tree = create_tree (dfa, tree, NULL, OP_ALT);
2632       if (BE (tree == NULL, 0))
2633         goto parse_dup_op_espace;
2634     }
2635
2636   if (old_tree)
2637     tree = create_tree (dfa, old_tree, tree, CONCAT);
2638
2639   return tree;
2640
2641  parse_dup_op_espace:
2642   *err = REG_ESPACE;
2643   return NULL;
2644 }
2645
2646 /* Size of the names for collating symbol/equivalence_class/character_class.
2647    I'm not sure, but maybe enough.  */
2648 #define BRACKET_NAME_BUF_SIZE 32
2649
2650 #ifndef _LIBC
2651   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2652      Build the range expression which starts from START_ELEM, and ends
2653      at END_ELEM.  The result are written to MBCSET and SBCSET.
2654      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2655      mbcset->range_ends, is a pointer argument sinse we may
2656      update it.  */
2657
2658 static reg_errcode_t
2659 # ifdef RE_ENABLE_I18N
2660 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2661      re_charset_t *mbcset;
2662      int *range_alloc;
2663 # else /* not RE_ENABLE_I18N */
2664 build_range_exp (sbcset, start_elem, end_elem)
2665 # endif /* not RE_ENABLE_I18N */
2666      bitset_t sbcset;
2667      bracket_elem_t *start_elem, *end_elem;
2668 {
2669   unsigned int start_ch, end_ch;
2670   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2671   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2672           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2673           0))
2674     return REG_ERANGE;
2675
2676   /* We can handle no multi character collating elements without libc
2677      support.  */
2678   if (BE ((start_elem->type == COLL_SYM
2679            && strlen ((char *) start_elem->opr.name) > 1)
2680           || (end_elem->type == COLL_SYM
2681               && strlen ((char *) end_elem->opr.name) > 1), 0))
2682     return REG_ECOLLATE;
2683
2684 # ifdef RE_ENABLE_I18N
2685   {
2686     wchar_t wc;
2687     wint_t start_wc;
2688     wint_t end_wc;
2689     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2690
2691     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2692                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2693                    : 0));
2694     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2695               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2696                  : 0));
2697     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2698                 ? __btowc (start_ch) : start_elem->opr.wch);
2699     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2700               ? __btowc (end_ch) : end_elem->opr.wch);
2701     if (start_wc == WEOF || end_wc == WEOF)
2702       return REG_ECOLLATE;
2703     cmp_buf[0] = start_wc;
2704     cmp_buf[4] = end_wc;
2705     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2706       return REG_ERANGE;
2707
2708     /* Got valid collation sequence values, add them as a new entry.
2709        However, for !_LIBC we have no collation elements: if the
2710        character set is single byte, the single byte character set
2711        that we build below suffices.  parse_bracket_exp passes
2712        no MBCSET if dfa->mb_cur_max == 1.  */
2713     if (mbcset)
2714       {
2715         /* Check the space of the arrays.  */
2716         if (BE (*range_alloc == mbcset->nranges, 0))
2717           {
2718             /* There is not enough space, need realloc.  */
2719             wchar_t *new_array_start, *new_array_end;
2720             int new_nranges;
2721
2722             /* +1 in case of mbcset->nranges is 0.  */
2723             new_nranges = 2 * mbcset->nranges + 1;
2724             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2725                are NULL if *range_alloc == 0.  */
2726             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2727                                           new_nranges);
2728             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2729                                         new_nranges);
2730
2731             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2732               return REG_ESPACE;
2733
2734             mbcset->range_starts = new_array_start;
2735             mbcset->range_ends = new_array_end;
2736             *range_alloc = new_nranges;
2737           }
2738
2739         mbcset->range_starts[mbcset->nranges] = start_wc;
2740         mbcset->range_ends[mbcset->nranges++] = end_wc;
2741       }
2742
2743     /* Build the table for single byte characters.  */
2744     for (wc = 0; wc < SBC_MAX; ++wc)
2745       {
2746         cmp_buf[2] = wc;
2747         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2748             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2749           bitset_set (sbcset, wc);
2750       }
2751   }
2752 # else /* not RE_ENABLE_I18N */
2753   {
2754     unsigned int ch;
2755     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2756                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2757                    : 0));
2758     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2759               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2760                  : 0));
2761     if (start_ch > end_ch)
2762       return REG_ERANGE;
2763     /* Build the table for single byte characters.  */
2764     for (ch = 0; ch < SBC_MAX; ++ch)
2765       if (start_ch <= ch  && ch <= end_ch)
2766         bitset_set (sbcset, ch);
2767   }
2768 # endif /* not RE_ENABLE_I18N */
2769   return REG_NOERROR;
2770 }
2771 #endif /* not _LIBC */
2772
2773 #ifndef _LIBC
2774 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2775    Build the collating element which is represented by NAME.
2776    The result are written to MBCSET and SBCSET.
2777    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2778    pointer argument since we may update it.  */
2779
2780 static reg_errcode_t
2781 # ifdef RE_ENABLE_I18N
2782 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2783      re_charset_t *mbcset;
2784      int *coll_sym_alloc;
2785 # else /* not RE_ENABLE_I18N */
2786 build_collating_symbol (sbcset, name)
2787 # endif /* not RE_ENABLE_I18N */
2788      bitset_t sbcset;
2789      const unsigned char *name;
2790 {
2791   size_t name_len = strlen ((const char *) name);
2792   if (BE (name_len != 1, 0))
2793     return REG_ECOLLATE;
2794   else
2795     {
2796       bitset_set (sbcset, name[0]);
2797       return REG_NOERROR;
2798     }
2799 }
2800 #endif /* not _LIBC */
2801
2802 /* This function parse bracket expression like "[abc]", "[a-c]",
2803    "[[.a-a.]]" etc.  */
2804
2805 static bin_tree_t *
2806 parse_bracket_exp (regexp, dfa, token, syntax, err)
2807      re_string_t *regexp;
2808      re_dfa_t *dfa;
2809      re_token_t *token;
2810      reg_syntax_t syntax;
2811      reg_errcode_t *err;
2812 {
2813 #ifdef _LIBC
2814   const unsigned char *collseqmb;
2815   const char *collseqwc;
2816   uint32_t nrules;
2817   int32_t table_size;
2818   const int32_t *symb_table;
2819   const unsigned char *extra;
2820
2821   /* Local function for parse_bracket_exp used in _LIBC environement.
2822      Seek the collating symbol entry correspondings to NAME.
2823      Return the index of the symbol in the SYMB_TABLE.  */
2824
2825   auto inline int32_t
2826   __attribute ((always_inline))
2827   seek_collating_symbol_entry (name, name_len)
2828          const unsigned char *name;
2829          size_t name_len;
2830     {
2831       int32_t hash = elem_hash ((const char *) name, name_len);
2832       int32_t elem = hash % table_size;
2833       int32_t second = hash % (table_size - 2);
2834       while (symb_table[2 * elem] != 0)
2835         {
2836           /* First compare the hashing value.  */
2837           if (symb_table[2 * elem] == hash
2838               /* Compare the length of the name.  */
2839               && name_len == extra[symb_table[2 * elem + 1]]
2840               /* Compare the name.  */
2841               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2842                          name_len) == 0)
2843             {
2844               /* Yep, this is the entry.  */
2845               break;
2846             }
2847
2848           /* Next entry.  */
2849           elem += second;
2850         }
2851       return elem;
2852     }
2853
2854   /* Local function for parse_bracket_exp used in _LIBC environement.
2855      Look up the collation sequence value of BR_ELEM.
2856      Return the value if succeeded, UINT_MAX otherwise.  */
2857
2858   auto inline unsigned int
2859   __attribute ((always_inline))
2860   lookup_collation_sequence_value (br_elem)
2861          bracket_elem_t *br_elem;
2862     {
2863       if (br_elem->type == SB_CHAR)
2864         {
2865           /*
2866           if (MB_CUR_MAX == 1)
2867           */
2868           if (nrules == 0)
2869             return collseqmb[br_elem->opr.ch];
2870           else
2871             {
2872               wint_t wc = __btowc (br_elem->opr.ch);
2873               return __collseq_table_lookup (collseqwc, wc);
2874             }
2875         }
2876       else if (br_elem->type == MB_CHAR)
2877         {
2878           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2879         }
2880       else if (br_elem->type == COLL_SYM)
2881         {
2882           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2883           if (nrules != 0)
2884             {
2885               int32_t elem, idx;
2886               elem = seek_collating_symbol_entry (br_elem->opr.name,
2887                                                   sym_name_len);
2888               if (symb_table[2 * elem] != 0)
2889                 {
2890                   /* We found the entry.  */
2891                   idx = symb_table[2 * elem + 1];
2892                   /* Skip the name of collating element name.  */
2893                   idx += 1 + extra[idx];
2894                   /* Skip the byte sequence of the collating element.  */
2895                   idx += 1 + extra[idx];
2896                   /* Adjust for the alignment.  */
2897                   idx = (idx + 3) & ~3;
2898                   /* Skip the multibyte collation sequence value.  */
2899                   idx += sizeof (unsigned int);
2900                   /* Skip the wide char sequence of the collating element.  */
2901                   idx += sizeof (unsigned int) *
2902                     (1 + *(unsigned int *) (extra + idx));
2903                   /* Return the collation sequence value.  */
2904                   return *(unsigned int *) (extra + idx);
2905                 }
2906               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2907                 {
2908                   /* No valid character.  Match it as a single byte
2909                      character.  */
2910                   return collseqmb[br_elem->opr.name[0]];
2911                 }
2912             }
2913           else if (sym_name_len == 1)
2914             return collseqmb[br_elem->opr.name[0]];
2915         }
2916       return UINT_MAX;
2917     }
2918
2919   /* Local function for parse_bracket_exp used in _LIBC environement.
2920      Build the range expression which starts from START_ELEM, and ends
2921      at END_ELEM.  The result are written to MBCSET and SBCSET.
2922      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2923      mbcset->range_ends, is a pointer argument sinse we may
2924      update it.  */
2925
2926   auto inline reg_errcode_t
2927   __attribute ((always_inline))
2928   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2929          re_charset_t *mbcset;
2930          int *range_alloc;
2931          bitset_t sbcset;
2932          bracket_elem_t *start_elem, *end_elem;
2933     {
2934       unsigned int ch;
2935       uint32_t start_collseq;
2936       uint32_t end_collseq;
2937
2938       /* Equivalence Classes and Character Classes can't be a range
2939          start/end.  */
2940       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2941               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2942               0))
2943         return REG_ERANGE;
2944
2945       start_collseq = lookup_collation_sequence_value (start_elem);
2946       end_collseq = lookup_collation_sequence_value (end_elem);
2947       /* Check start/end collation sequence values.  */
2948       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2949         return REG_ECOLLATE;
2950       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2951         return REG_ERANGE;
2952
2953       /* Got valid collation sequence values, add them as a new entry.
2954          However, if we have no collation elements, and the character set
2955          is single byte, the single byte character set that we
2956          build below suffices. */
2957       if (nrules > 0 || dfa->mb_cur_max > 1)
2958         {
2959           /* Check the space of the arrays.  */
2960           if (BE (*range_alloc == mbcset->nranges, 0))
2961             {
2962               /* There is not enough space, need realloc.  */
2963               uint32_t *new_array_start;
2964               uint32_t *new_array_end;
2965               int new_nranges;
2966
2967               /* +1 in case of mbcset->nranges is 0.  */
2968               new_nranges = 2 * mbcset->nranges + 1;
2969               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2970                                             new_nranges);
2971               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2972                                           new_nranges);
2973
2974               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2975                 return REG_ESPACE;
2976
2977               mbcset->range_starts = new_array_start;
2978               mbcset->range_ends = new_array_end;
2979               *range_alloc = new_nranges;
2980             }
2981
2982           mbcset->range_starts[mbcset->nranges] = start_collseq;
2983           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2984         }
2985
2986       /* Build the table for single byte characters.  */
2987       for (ch = 0; ch < SBC_MAX; ch++)
2988         {
2989           uint32_t ch_collseq;
2990           /*
2991           if (MB_CUR_MAX == 1)
2992           */
2993           if (nrules == 0)
2994             ch_collseq = collseqmb[ch];
2995           else
2996             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2997           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2998             bitset_set (sbcset, ch);
2999         }
3000       return REG_NOERROR;
3001     }
3002
3003   /* Local function for parse_bracket_exp used in _LIBC environement.
3004      Build the collating element which is represented by NAME.
3005      The result are written to MBCSET and SBCSET.
3006      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3007      pointer argument sinse we may update it.  */
3008
3009   auto inline reg_errcode_t
3010   __attribute ((always_inline))
3011   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3012          re_charset_t *mbcset;
3013          int *coll_sym_alloc;
3014          bitset_t sbcset;
3015          const unsigned char *name;
3016     {
3017       int32_t elem, idx;
3018       size_t name_len = strlen ((const char *) name);
3019       if (nrules != 0)
3020         {
3021           elem = seek_collating_symbol_entry (name, name_len);
3022           if (symb_table[2 * elem] != 0)
3023             {
3024               /* We found the entry.  */
3025               idx = symb_table[2 * elem + 1];
3026               /* Skip the name of collating element name.  */
3027               idx += 1 + extra[idx];
3028             }
3029           else if (symb_table[2 * elem] == 0 && name_len == 1)
3030             {
3031               /* No valid character, treat it as a normal
3032                  character.  */
3033               bitset_set (sbcset, name[0]);
3034               return REG_NOERROR;
3035             }
3036           else
3037             return REG_ECOLLATE;
3038
3039           /* Got valid collation sequence, add it as a new entry.  */
3040           /* Check the space of the arrays.  */
3041           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3042             {
3043               /* Not enough, realloc it.  */
3044               /* +1 in case of mbcset->ncoll_syms is 0.  */
3045               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3046               /* Use realloc since mbcset->coll_syms is NULL
3047                  if *alloc == 0.  */
3048               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3049                                                    new_coll_sym_alloc);
3050               if (BE (new_coll_syms == NULL, 0))
3051                 return REG_ESPACE;
3052               mbcset->coll_syms = new_coll_syms;
3053               *coll_sym_alloc = new_coll_sym_alloc;
3054             }
3055           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3056           return REG_NOERROR;
3057         }
3058       else
3059         {
3060           if (BE (name_len != 1, 0))
3061             return REG_ECOLLATE;
3062           else
3063             {
3064               bitset_set (sbcset, name[0]);
3065               return REG_NOERROR;
3066             }
3067         }
3068     }
3069 #endif
3070
3071   re_token_t br_token;
3072   re_bitset_ptr_t sbcset;
3073 #ifdef RE_ENABLE_I18N
3074   re_charset_t *mbcset;
3075   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3076   int equiv_class_alloc = 0, char_class_alloc = 0;
3077 #endif /* not RE_ENABLE_I18N */
3078   int non_match = 0;
3079   bin_tree_t *work_tree;
3080   int token_len;
3081   int first_round = 1;
3082 #ifdef _LIBC
3083   collseqmb = (const unsigned char *)
3084     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3085   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3086   if (nrules)
3087     {
3088       /*
3089       if (MB_CUR_MAX > 1)
3090       */
3091         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3092       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3093       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3094                                                   _NL_COLLATE_SYMB_TABLEMB);
3095       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3096                                                    _NL_COLLATE_SYMB_EXTRAMB);
3097     }
3098 #endif
3099   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3100 #ifdef RE_ENABLE_I18N
3101   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3102 #endif /* RE_ENABLE_I18N */
3103 #ifdef RE_ENABLE_I18N
3104   if (BE (sbcset == NULL || mbcset == NULL, 0))
3105 #else
3106   if (BE (sbcset == NULL, 0))
3107 #endif /* RE_ENABLE_I18N */
3108     {
3109       *err = REG_ESPACE;
3110       return NULL;
3111     }
3112
3113   token_len = peek_token_bracket (token, regexp, syntax);
3114   if (BE (token->type == END_OF_RE, 0))
3115     {
3116       *err = REG_BADPAT;
3117       goto parse_bracket_exp_free_return;
3118     }
3119   if (token->type == OP_NON_MATCH_LIST)
3120     {
3121 #ifdef RE_ENABLE_I18N
3122       mbcset->non_match = 1;
3123 #endif /* not RE_ENABLE_I18N */
3124       non_match = 1;
3125       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3126         bitset_set (sbcset, '\0');
3127       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3128       token_len = peek_token_bracket (token, regexp, syntax);
3129       if (BE (token->type == END_OF_RE, 0))
3130         {
3131           *err = REG_BADPAT;
3132           goto parse_bracket_exp_free_return;
3133         }
3134     }
3135
3136   /* We treat the first ']' as a normal character.  */
3137   if (token->type == OP_CLOSE_BRACKET)
3138     token->type = CHARACTER;
3139
3140   while (1)
3141     {
3142       bracket_elem_t start_elem, end_elem;
3143       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3144       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3145       reg_errcode_t ret;
3146       int token_len2 = 0, is_range_exp = 0;
3147       re_token_t token2;
3148
3149       start_elem.opr.name = start_name_buf;
3150       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3151                                    syntax, first_round);
3152       if (BE (ret != REG_NOERROR, 0))
3153         {
3154           *err = ret;
3155           goto parse_bracket_exp_free_return;
3156         }
3157       first_round = 0;
3158
3159       /* Get information about the next token.  We need it in any case.  */
3160       token_len = peek_token_bracket (token, regexp, syntax);
3161
3162       /* Do not check for ranges if we know they are not allowed.  */
3163       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3164         {
3165           if (BE (token->type == END_OF_RE, 0))
3166             {
3167               *err = REG_EBRACK;
3168               goto parse_bracket_exp_free_return;
3169             }
3170           if (token->type == OP_CHARSET_RANGE)
3171             {
3172               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3173               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3174               if (BE (token2.type == END_OF_RE, 0))
3175                 {
3176                   *err = REG_EBRACK;
3177                   goto parse_bracket_exp_free_return;
3178                 }
3179               if (token2.type == OP_CLOSE_BRACKET)
3180                 {
3181                   /* We treat the last '-' as a normal character.  */
3182                   re_string_skip_bytes (regexp, -token_len);
3183                   token->type = CHARACTER;
3184                 }
3185               else
3186                 is_range_exp = 1;
3187             }
3188         }
3189
3190       if (is_range_exp == 1)
3191         {
3192           end_elem.opr.name = end_name_buf;
3193           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3194                                        dfa, syntax, 1);
3195           if (BE (ret != REG_NOERROR, 0))
3196             {
3197               *err = ret;
3198               goto parse_bracket_exp_free_return;
3199             }
3200
3201           token_len = peek_token_bracket (token, regexp, syntax);
3202
3203 #ifdef _LIBC
3204           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3205                                   &start_elem, &end_elem);
3206 #else
3207 # ifdef RE_ENABLE_I18N
3208           *err = build_range_exp (sbcset,
3209                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3210                                   &range_alloc, &start_elem, &end_elem);
3211 # else
3212           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3213 # endif
3214 #endif /* RE_ENABLE_I18N */
3215           if (BE (*err != REG_NOERROR, 0))
3216             goto parse_bracket_exp_free_return;
3217         }
3218       else
3219         {
3220           switch (start_elem.type)
3221             {
3222             case SB_CHAR:
3223               bitset_set (sbcset, start_elem.opr.ch);
3224               break;
3225 #ifdef RE_ENABLE_I18N
3226             case MB_CHAR:
3227               /* Check whether the array has enough space.  */
3228               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3229                 {
3230                   wchar_t *new_mbchars;
3231                   /* Not enough, realloc it.  */
3232                   /* +1 in case of mbcset->nmbchars is 0.  */
3233                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3234                   /* Use realloc since array is NULL if *alloc == 0.  */
3235                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3236                                             mbchar_alloc);
3237                   if (BE (new_mbchars == NULL, 0))
3238                     goto parse_bracket_exp_espace;
3239                   mbcset->mbchars = new_mbchars;
3240                 }
3241               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3242               break;
3243 #endif /* RE_ENABLE_I18N */
3244             case EQUIV_CLASS:
3245               *err = build_equiv_class (sbcset,
3246 #ifdef RE_ENABLE_I18N
3247                                         mbcset, &equiv_class_alloc,
3248 #endif /* RE_ENABLE_I18N */
3249                                         start_elem.opr.name);
3250               if (BE (*err != REG_NOERROR, 0))
3251                 goto parse_bracket_exp_free_return;
3252               break;
3253             case COLL_SYM:
3254               *err = build_collating_symbol (sbcset,
3255 #ifdef RE_ENABLE_I18N
3256                                              mbcset, &coll_sym_alloc,
3257 #endif /* RE_ENABLE_I18N */
3258                                              start_elem.opr.name);
3259               if (BE (*err != REG_NOERROR, 0))
3260                 goto parse_bracket_exp_free_return;
3261               break;
3262             case CHAR_CLASS:
3263               *err = build_charclass (regexp->trans, sbcset,
3264 #ifdef RE_ENABLE_I18N
3265                                       mbcset, &char_class_alloc,
3266 #endif /* RE_ENABLE_I18N */
3267                                       start_elem.opr.name, syntax);
3268               if (BE (*err != REG_NOERROR, 0))
3269                goto parse_bracket_exp_free_return;
3270               break;
3271             default:
3272               assert (0);
3273               break;
3274             }
3275         }
3276       if (BE (token->type == END_OF_RE, 0))
3277         {
3278           *err = REG_EBRACK;
3279           goto parse_bracket_exp_free_return;
3280         }
3281       if (token->type == OP_CLOSE_BRACKET)
3282         break;
3283     }
3284
3285   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3286
3287   /* If it is non-matching list.  */
3288   if (non_match)
3289     bitset_not (sbcset);
3290
3291 #ifdef RE_ENABLE_I18N
3292   /* Ensure only single byte characters are set.  */
3293   if (dfa->mb_cur_max > 1)
3294     bitset_mask (sbcset, dfa->sb_char);
3295
3296   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3297       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3298                                                      || mbcset->non_match)))
3299     {
3300       bin_tree_t *mbc_tree;
3301       int sbc_idx;
3302       /* Build a tree for complex bracket.  */
3303       dfa->has_mb_node = 1;
3304       br_token.type = COMPLEX_BRACKET;
3305       br_token.opr.mbcset = mbcset;
3306       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307       if (BE (mbc_tree == NULL, 0))
3308         goto parse_bracket_exp_espace;
3309       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3310         if (sbcset[sbc_idx])
3311           break;
3312       /* If there are no bits set in sbcset, there is no point
3313          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3314       if (sbc_idx < BITSET_WORDS)
3315         {
3316           /* Build a tree for simple bracket.  */
3317           br_token.type = SIMPLE_BRACKET;
3318           br_token.opr.sbcset = sbcset;
3319           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3320           if (BE (work_tree == NULL, 0))
3321             goto parse_bracket_exp_espace;
3322
3323           /* Then join them by ALT node.  */
3324           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3325           if (BE (work_tree == NULL, 0))
3326             goto parse_bracket_exp_espace;
3327         }
3328       else
3329         {
3330           re_free (sbcset);
3331           work_tree = mbc_tree;
3332         }
3333     }
3334   else
3335 #endif /* not RE_ENABLE_I18N */
3336     {
3337 #ifdef RE_ENABLE_I18N
3338       free_charset (mbcset);
3339 #endif
3340       /* Build a tree for simple bracket.  */
3341       br_token.type = SIMPLE_BRACKET;
3342       br_token.opr.sbcset = sbcset;
3343       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3344       if (BE (work_tree == NULL, 0))
3345         goto parse_bracket_exp_espace;
3346     }
3347   return work_tree;
3348
3349  parse_bracket_exp_espace:
3350   *err = REG_ESPACE;
3351  parse_bracket_exp_free_return:
3352   re_free (sbcset);
3353 #ifdef RE_ENABLE_I18N
3354   free_charset (mbcset);
3355 #endif /* RE_ENABLE_I18N */
3356   return NULL;
3357 }
3358
3359 /* Parse an element in the bracket expression.  */
3360
3361 static reg_errcode_t
3362 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3363                        accept_hyphen)
3364      bracket_elem_t *elem;
3365      re_string_t *regexp;
3366      re_token_t *token;
3367      int token_len;
3368      re_dfa_t *dfa;
3369      reg_syntax_t syntax;
3370      int accept_hyphen;
3371 {
3372 #ifdef RE_ENABLE_I18N
3373   int cur_char_size;
3374   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3375   if (cur_char_size > 1)
3376     {
3377       elem->type = MB_CHAR;
3378       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3379       re_string_skip_bytes (regexp, cur_char_size);
3380       return REG_NOERROR;
3381     }
3382 #endif /* RE_ENABLE_I18N */
3383   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3384   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3385       || token->type == OP_OPEN_EQUIV_CLASS)
3386     return parse_bracket_symbol (elem, regexp, token);
3387   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3388     {
3389       /* A '-' must only appear as anything but a range indicator before
3390          the closing bracket.  Everything else is an error.  */
3391       re_token_t token2;
3392       (void) peek_token_bracket (&token2, regexp, syntax);
3393       if (token2.type != OP_CLOSE_BRACKET)
3394         /* The actual error value is not standardized since this whole
3395            case is undefined.  But ERANGE makes good sense.  */
3396         return REG_ERANGE;
3397     }
3398   elem->type = SB_CHAR;
3399   elem->opr.ch = token->opr.c;
3400   return REG_NOERROR;
3401 }
3402
3403 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3404    such as [:<character_class>:], [.<collating_element>.], and
3405    [=<equivalent_class>=].  */
3406
3407 static reg_errcode_t
3408 parse_bracket_symbol (elem, regexp, token)
3409      bracket_elem_t *elem;
3410      re_string_t *regexp;
3411      re_token_t *token;
3412 {
3413   unsigned char ch, delim = token->opr.c;
3414   int i = 0;
3415   if (re_string_eoi(regexp))
3416     return REG_EBRACK;
3417   for (;; ++i)
3418     {
3419       if (i >= BRACKET_NAME_BUF_SIZE)
3420         return REG_EBRACK;
3421       if (token->type == OP_OPEN_CHAR_CLASS)
3422         ch = re_string_fetch_byte_case (regexp);
3423       else
3424         ch = re_string_fetch_byte (regexp);
3425       if (re_string_eoi(regexp))
3426         return REG_EBRACK;
3427       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3428         break;
3429       elem->opr.name[i] = ch;
3430     }
3431   re_string_skip_bytes (regexp, 1);
3432   elem->opr.name[i] = '\0';
3433   switch (token->type)
3434     {
3435     case OP_OPEN_COLL_ELEM:
3436       elem->type = COLL_SYM;
3437       break;
3438     case OP_OPEN_EQUIV_CLASS:
3439       elem->type = EQUIV_CLASS;
3440       break;
3441     case OP_OPEN_CHAR_CLASS:
3442       elem->type = CHAR_CLASS;
3443       break;
3444     default:
3445       break;
3446     }
3447   return REG_NOERROR;
3448 }
3449
3450   /* Helper function for parse_bracket_exp.
3451      Build the equivalence class which is represented by NAME.
3452      The result are written to MBCSET and SBCSET.
3453      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3454      is a pointer argument sinse we may update it.  */
3455
3456 static reg_errcode_t
3457 #ifdef RE_ENABLE_I18N
3458 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3459      re_charset_t *mbcset;
3460      int *equiv_class_alloc;
3461 #else /* not RE_ENABLE_I18N */
3462 build_equiv_class (sbcset, name)
3463 #endif /* not RE_ENABLE_I18N */
3464      bitset_t sbcset;
3465      const unsigned char *name;
3466 {
3467 #if defined _LIBC
3468   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3469   if (nrules != 0)
3470     {
3471       const int32_t *table, *indirect;
3472       const unsigned char *weights, *extra, *cp;
3473       unsigned char char_buf[2];
3474       int32_t idx1, idx2;
3475       unsigned int ch;
3476       size_t len;
3477       /* This #include defines a local function!  */
3478 # include <locale/weight.h>
3479       /* Calculate the index for equivalence class.  */
3480       cp = name;
3481       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3482       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3483                                                _NL_COLLATE_WEIGHTMB);
3484       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3485                                                    _NL_COLLATE_EXTRAMB);
3486       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3487                                                 _NL_COLLATE_INDIRECTMB);
3488       idx1 = findidx (&cp);
3489       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3490         /* This isn't a valid character.  */
3491         return REG_ECOLLATE;
3492
3493       /* Build single byte matcing table for this equivalence class.  */
3494       char_buf[1] = (unsigned char) '\0';
3495       len = weights[idx1];
3496       for (ch = 0; ch < SBC_MAX; ++ch)
3497         {
3498           char_buf[0] = ch;
3499           cp = char_buf;
3500           idx2 = findidx (&cp);
3501 /*
3502           idx2 = table[ch];
3503 */
3504           if (idx2 == 0)
3505             /* This isn't a valid character.  */
3506             continue;
3507           if (len == weights[idx2])
3508             {
3509               int cnt = 0;
3510               while (cnt <= len &&
3511                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3512                 ++cnt;
3513
3514               if (cnt > len)
3515                 bitset_set (sbcset, ch);
3516             }
3517         }
3518       /* Check whether the array has enough space.  */
3519       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3520         {
3521           /* Not enough, realloc it.  */
3522           /* +1 in case of mbcset->nequiv_classes is 0.  */
3523           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3524           /* Use realloc since the array is NULL if *alloc == 0.  */
3525           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3526                                                    int32_t,
3527                                                    new_equiv_class_alloc);
3528           if (BE (new_equiv_classes == NULL, 0))
3529             return REG_ESPACE;
3530           mbcset->equiv_classes = new_equiv_classes;
3531           *equiv_class_alloc = new_equiv_class_alloc;
3532         }
3533       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3534     }
3535   else
3536 #endif /* _LIBC */
3537     {
3538       if (BE (strlen ((const char *) name) != 1, 0))
3539         return REG_ECOLLATE;
3540       bitset_set (sbcset, *name);
3541     }
3542   return REG_NOERROR;
3543 }
3544
3545   /* Helper function for parse_bracket_exp.
3546      Build the character class which is represented by NAME.
3547      The result are written to MBCSET and SBCSET.
3548      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3549      is a pointer argument sinse we may update it.  */
3550
3551 static reg_errcode_t
3552 #ifdef RE_ENABLE_I18N
3553 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3554      re_charset_t *mbcset;
3555      int *char_class_alloc;
3556 #else /* not RE_ENABLE_I18N */
3557 build_charclass (trans, sbcset, class_name, syntax)
3558 #endif /* not RE_ENABLE_I18N */
3559      RE_TRANSLATE_TYPE trans;
3560      bitset_t sbcset;
3561      const unsigned char *class_name;
3562      reg_syntax_t syntax;
3563 {
3564   int i;
3565   const char *name = (const char *) class_name;
3566
3567   /* In case of REG_ICASE "upper" and "lower" match the both of
3568      upper and lower cases.  */
3569   if ((syntax & RE_ICASE)
3570       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3571     name = "alpha";
3572
3573 #ifdef RE_ENABLE_I18N
3574   /* Check the space of the arrays.  */
3575   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3576     {
3577       /* Not enough, realloc it.  */
3578       /* +1 in case of mbcset->nchar_classes is 0.  */
3579       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3580       /* Use realloc since array is NULL if *alloc == 0.  */
3581       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3582                                                new_char_class_alloc);
3583       if (BE (new_char_classes == NULL, 0))
3584         return REG_ESPACE;
3585       mbcset->char_classes = new_char_classes;
3586       *char_class_alloc = new_char_class_alloc;
3587     }
3588   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3589 #endif /* RE_ENABLE_I18N */
3590
3591 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3592     for (i = 0; i < SBC_MAX; ++i)               \
3593       {                                         \
3594         if (ctype_func (i))                     \
3595           {                                     \
3596             int ch = trans ? trans[i] : i;      \
3597             bitset_set (sbcset, ch);            \
3598           }                                     \
3599       }
3600
3601   if (strcmp (name, "alnum") == 0)
3602     BUILD_CHARCLASS_LOOP (isalnum)
3603   else if (strcmp (name, "cntrl") == 0)
3604     BUILD_CHARCLASS_LOOP (iscntrl)
3605   else if (strcmp (name, "lower") == 0)
3606     BUILD_CHARCLASS_LOOP (islower)
3607   else if (strcmp (name, "space") == 0)
3608     BUILD_CHARCLASS_LOOP (isspace)
3609   else if (strcmp (name, "alpha") == 0)
3610     BUILD_CHARCLASS_LOOP (isalpha)
3611   else if (strcmp (name, "digit") == 0)
3612     BUILD_CHARCLASS_LOOP (isdigit)
3613   else if (strcmp (name, "print") == 0)
3614     BUILD_CHARCLASS_LOOP (isprint)
3615   else if (strcmp (name, "upper") == 0)
3616     BUILD_CHARCLASS_LOOP (isupper)
3617   else if (strcmp (name, "blank") == 0)
3618     BUILD_CHARCLASS_LOOP (isblank)
3619   else if (strcmp (name, "graph") == 0)
3620     BUILD_CHARCLASS_LOOP (isgraph)
3621   else if (strcmp (name, "punct") == 0)
3622     BUILD_CHARCLASS_LOOP (ispunct)
3623   else if (strcmp (name, "xdigit") == 0)
3624     BUILD_CHARCLASS_LOOP (isxdigit)
3625   else
3626     return REG_ECTYPE;
3627
3628   return REG_NOERROR;
3629 }
3630
3631 static bin_tree_t *
3632 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3633      re_dfa_t *dfa;
3634      RE_TRANSLATE_TYPE trans;
3635      const unsigned char *class_name;
3636      const unsigned char *extra;
3637      int non_match;
3638      reg_errcode_t *err;
3639 {
3640   re_bitset_ptr_t sbcset;
3641 #ifdef RE_ENABLE_I18N
3642   re_charset_t *mbcset;
3643   int alloc = 0;
3644 #endif /* not RE_ENABLE_I18N */
3645   reg_errcode_t ret;
3646   re_token_t br_token;
3647   bin_tree_t *tree;
3648
3649   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3650 #ifdef RE_ENABLE_I18N
3651   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3652 #endif /* RE_ENABLE_I18N */
3653
3654 #ifdef RE_ENABLE_I18N
3655   if (BE (sbcset == NULL || mbcset == NULL, 0))
3656 #else /* not RE_ENABLE_I18N */
3657   if (BE (sbcset == NULL, 0))
3658 #endif /* not RE_ENABLE_I18N */
3659     {
3660       *err = REG_ESPACE;
3661       return NULL;
3662     }
3663
3664   if (non_match)
3665     {
3666 #ifdef RE_ENABLE_I18N
3667       /*
3668       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3669         bitset_set(cset->sbcset, '\0');
3670       */
3671       mbcset->non_match = 1;
3672 #endif /* not RE_ENABLE_I18N */
3673     }
3674
3675   /* We don't care the syntax in this case.  */
3676   ret = build_charclass (trans, sbcset,
3677 #ifdef RE_ENABLE_I18N
3678                          mbcset, &alloc,
3679 #endif /* RE_ENABLE_I18N */
3680                          class_name, 0);
3681
3682   if (BE (ret != REG_NOERROR, 0))
3683     {
3684       re_free (sbcset);
3685 #ifdef RE_ENABLE_I18N
3686       free_charset (mbcset);
3687 #endif /* RE_ENABLE_I18N */
3688       *err = ret;
3689       return NULL;
3690     }
3691   /* \w match '_' also.  */
3692   for (; *extra; extra++)
3693     bitset_set (sbcset, *extra);
3694
3695   /* If it is non-matching list.  */
3696   if (non_match)
3697     bitset_not (sbcset);
3698
3699 #ifdef RE_ENABLE_I18N
3700   /* Ensure only single byte characters are set.  */
3701   if (dfa->mb_cur_max > 1)
3702     bitset_mask (sbcset, dfa->sb_char);
3703 #endif
3704
3705   /* Build a tree for simple bracket.  */
3706   br_token.type = SIMPLE_BRACKET;
3707   br_token.opr.sbcset = sbcset;
3708   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3709   if (BE (tree == NULL, 0))
3710     goto build_word_op_espace;
3711
3712 #ifdef RE_ENABLE_I18N
3713   if (dfa->mb_cur_max > 1)
3714     {
3715       bin_tree_t *mbc_tree;
3716       /* Build a tree for complex bracket.  */
3717       br_token.type = COMPLEX_BRACKET;
3718       br_token.opr.mbcset = mbcset;
3719       dfa->has_mb_node = 1;
3720       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3721       if (BE (mbc_tree == NULL, 0))
3722         goto build_word_op_espace;
3723       /* Then join them by ALT node.  */
3724       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3725       if (BE (mbc_tree != NULL, 1))
3726         return tree;
3727     }
3728   else
3729     {
3730       free_charset (mbcset);
3731       return tree;
3732     }
3733 #else /* not RE_ENABLE_I18N */
3734   return tree;
3735 #endif /* not RE_ENABLE_I18N */
3736
3737  build_word_op_espace:
3738   re_free (sbcset);
3739 #ifdef RE_ENABLE_I18N
3740   free_charset (mbcset);
3741 #endif /* RE_ENABLE_I18N */
3742   *err = REG_ESPACE;
3743   return NULL;
3744 }
3745
3746 /* This is intended for the expressions like "a{1,3}".
3747    Fetch a number from `input', and return the number.
3748    Return -1, if the number field is empty like "{,1}".
3749    Return -2, If an error is occured.  */
3750
3751 static int
3752 fetch_number (input, token, syntax)
3753      re_string_t *input;
3754      re_token_t *token;
3755      reg_syntax_t syntax;
3756 {
3757   int num = -1;
3758   unsigned char c;
3759   while (1)
3760     {
3761       fetch_token (token, input, syntax);
3762       c = token->opr.c;
3763       if (BE (token->type == END_OF_RE, 0))
3764         return -2;
3765       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3766         break;
3767       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3768              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3769       num = (num > RE_DUP_MAX) ? -2 : num;
3770     }
3771   return num;
3772 }
3773 \f
3774 #ifdef RE_ENABLE_I18N
3775 static void
3776 free_charset (re_charset_t *cset)
3777 {
3778   re_free (cset->mbchars);
3779 # ifdef _LIBC
3780   re_free (cset->coll_syms);
3781   re_free (cset->equiv_classes);
3782   re_free (cset->range_starts);
3783   re_free (cset->range_ends);
3784 # endif
3785   re_free (cset->char_classes);
3786   re_free (cset);
3787 }
3788 #endif /* RE_ENABLE_I18N */
3789 \f
3790 /* Functions for binary tree operation.  */
3791
3792 /* Create a tree node.  */
3793
3794 static bin_tree_t *
3795 create_tree (dfa, left, right, type)
3796      re_dfa_t *dfa;
3797      bin_tree_t *left;
3798      bin_tree_t *right;
3799      re_token_type_t type;
3800 {
3801   re_token_t t;
3802   t.type = type;
3803   return create_token_tree (dfa, left, right, &t);
3804 }
3805
3806 static bin_tree_t *
3807 create_token_tree (dfa, left, right, token)
3808      re_dfa_t *dfa;
3809      bin_tree_t *left;
3810      bin_tree_t *right;
3811      const re_token_t *token;
3812 {
3813   bin_tree_t *tree;
3814   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3815     {
3816       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3817
3818       if (storage == NULL)
3819         return NULL;
3820       storage->next = dfa->str_tree_storage;
3821       dfa->str_tree_storage = storage;
3822       dfa->str_tree_storage_idx = 0;
3823     }
3824   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3825
3826   tree->parent = NULL;
3827   tree->left = left;
3828   tree->right = right;
3829   tree->token = *token;
3830   tree->token.duplicated = 0;
3831   tree->token.opt_subexp = 0;
3832   tree->first = NULL;
3833   tree->next = NULL;
3834   tree->node_idx = -1;
3835
3836   if (left != NULL)
3837     left->parent = tree;
3838   if (right != NULL)
3839     right->parent = tree;
3840   return tree;
3841 }
3842
3843 /* Mark the tree SRC as an optional subexpression.
3844    To be called from preorder or postorder.  */
3845
3846 static reg_errcode_t
3847 mark_opt_subexp (extra, node)
3848      void *extra;
3849      bin_tree_t *node;
3850 {
3851   int idx = (int) (long) extra;
3852   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3853     node->token.opt_subexp = 1;
3854
3855   return REG_NOERROR;
3856 }
3857
3858 /* Free the allocated memory inside NODE. */
3859
3860 static void
3861 free_token (re_token_t *node)
3862 {
3863 #ifdef RE_ENABLE_I18N
3864   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3865     free_charset (node->opr.mbcset);
3866   else
3867 #endif /* RE_ENABLE_I18N */
3868     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3869       re_free (node->opr.sbcset);
3870 }
3871
3872 /* Worker function for tree walking.  Free the allocated memory inside NODE
3873    and its children. */
3874
3875 static reg_errcode_t
3876 free_tree (void *extra, bin_tree_t *node)
3877 {
3878   free_token (&node->token);
3879   return REG_NOERROR;
3880 }
3881
3882
3883 /* Duplicate the node SRC, and return new node.  This is a preorder
3884    visit similar to the one implemented by the generic visitor, but
3885    we need more infrastructure to maintain two parallel trees --- so,
3886    it's easier to duplicate.  */
3887
3888 static bin_tree_t *
3889 duplicate_tree (root, dfa)
3890      const bin_tree_t *root;
3891      re_dfa_t *dfa;
3892 {
3893   const bin_tree_t *node;
3894   bin_tree_t *dup_root;
3895   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3896
3897   for (node = root; ; )
3898     {
3899       /* Create a new tree and link it back to the current parent.  */
3900       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3901       if (*p_new == NULL)
3902         return NULL;
3903       (*p_new)->parent = dup_node;
3904       (*p_new)->token.duplicated = 1;
3905       dup_node = *p_new;
3906
3907       /* Go to the left node, or up and to the right.  */
3908       if (node->left)
3909         {
3910           node = node->left;
3911           p_new = &dup_node->left;
3912         }
3913       else
3914         {
3915           const bin_tree_t *prev = NULL;
3916           while (node->right == prev || node->right == NULL)
3917             {
3918               prev = node;
3919               node = node->parent;
3920               dup_node = dup_node->parent;
3921               if (!node)
3922                 return dup_root;
3923             }
3924           node = node->right;
3925           p_new = &dup_node->right;
3926         }
3927     }
3928 }