posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006,2007,2009
   3    Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  23                                           size_t length, reg_syntax_t syntax);
  24 static void re_compile_fastmap_iter (regex_t *bufp,
  25                                      const re_dfastate_t *init_state,
  26                                      char *fastmap);
  27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  51 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  52                                    unsigned int constraint);
  53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  55                                          int node, int root);
  56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  57 static int fetch_number (re_string_t *input, re_token_t *token,
  58                          reg_syntax_t syntax);
  59 static int peek_token (re_token_t *token, re_string_t *input,
  60                         reg_syntax_t syntax) internal_function;
  61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  62                           reg_syntax_t syntax, reg_errcode_t *err);
  63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  64                                   re_token_t *token, reg_syntax_t syntax,
  65                                   int nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  67                                  re_token_t *token, reg_syntax_t syntax,
  68                                  int nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  70                                      re_token_t *token, reg_syntax_t syntax,
  71                                      int nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  76                                  re_dfa_t *dfa, re_token_t *token,
  77                                  reg_syntax_t syntax, reg_errcode_t *err);
  78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  79                                       re_token_t *token, reg_syntax_t syntax,
  80                                       reg_errcode_t *err);
  81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  82                                             re_string_t *regexp,
  83                                             re_token_t *token, int token_len,
  84                                             re_dfa_t *dfa,
  85                                             reg_syntax_t syntax,
  86                                             int accept_hyphen);
  87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  88                                           re_string_t *regexp,
  89                                           re_token_t *token);
  90 #ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  92                                         re_charset_t *mbcset,
  93                                         int *equiv_class_alloc,
  94                                         const unsigned char *name);
  95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  96                                       bitset_t sbcset,
  97                                       re_charset_t *mbcset,
  98                                       int *char_class_alloc,
  99                                       const unsigned char *class_name,
 100                                       reg_syntax_t syntax);
 101 #else  /* not RE_ENABLE_I18N */
 102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 103                                         const unsigned char *name);
 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 105                                       bitset_t sbcset,
 106                                       const unsigned char *class_name,
 107                                       reg_syntax_t syntax);
 108 #endif /* not RE_ENABLE_I18N */
 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 110                                        RE_TRANSLATE_TYPE trans,
 111                                        const unsigned char *class_name,
 112                                        const unsigned char *extra,
 113                                        int non_match, reg_errcode_t *err);
 114 static bin_tree_t *create_tree (re_dfa_t *dfa,
 115                                 bin_tree_t *left, bin_tree_t *right,
 116                                 re_token_type_t type);
 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 118                                       bin_tree_t *left, bin_tree_t *right,
 119                                       const re_token_t *token);
 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 121 static void free_token (re_token_t *node);
 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 const char __re_error_msgid[] attribute_hidden =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 const size_t __re_error_msgid_idx[] attribute_hidden =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 212    are set in BUFP on entry.  */
 213
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
 240 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 241    also be assigned to arbitrarily: each pattern buffer stores its own
 242    syntax, so it can be changed between regex compilations.  */
 243 /* This has no initializer because initialized variables in Emacs
 244    become read-only after dumping.  */
 245 reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (syntax)
 257     reg_syntax_t syntax;
 258 {
 259   reg_syntax_t ret = re_syntax_options;
 260
 261   re_syntax_options = syntax;
 262   return ret;
 263 }
 264 #ifdef _LIBC
 265 weak_alias (__re_set_syntax, re_set_syntax)
 266 #endif
 267
 268 int
 269 re_compile_fastmap (bufp)
 270     struct re_pattern_buffer *bufp;
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 __attribute ((always_inline))
 292 re_set_fastmap (char *fastmap, int icase, int ch)
 293 {
 294   fastmap[ch] = 1;
 295   if (icase)
 296     fastmap[tolower (ch)] = 1;
 297 }
 298
 299 /* Helper function for re_compile_fastmap.
 300    Compile fastmap for the initial_state INIT_STATE.  */
 301
 302 static void
 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 304                          char *fastmap)
 305 {
 306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 307   int node_cnt;
 308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 310     {
 311       int node = init_state->nodes.elems[node_cnt];
 312       re_token_type_t type = dfa->nodes[node].type;
 313
 314       if (type == CHARACTER)
 315         {
 316           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317 #ifdef RE_ENABLE_I18N
 318           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 319             {
 320               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 321               wchar_t wc;
 322               mbstate_t state;
 323
 324               p = buf;
 325               *p++ = dfa->nodes[node].opr.c;
 326               while (++node < dfa->nodes_len
 327                      && dfa->nodes[node].type == CHARACTER
 328                      && dfa->nodes[node].mb_partial)
 329                 *p++ = dfa->nodes[node].opr.c;
 330               memset (&state, '\0', sizeof (state));
 331               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 332                              &state) == p - buf
 333                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 334                       != (size_t) -1))
 335                 re_set_fastmap (fastmap, 0, buf[0]);
 336             }
 337 #endif
 338         }
 339       else if (type == SIMPLE_BRACKET)
 340         {
 341           int i, ch;
 342           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 343             {
 344               int j;
 345               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 346               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 347                 if (w & ((bitset_word_t) 1 << j))
 348                   re_set_fastmap (fastmap, icase, ch);
 349             }
 350         }
 351 #ifdef RE_ENABLE_I18N
 352       else if (type == COMPLEX_BRACKET)
 353         {
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           int i;
 356
 357 # ifdef _LIBC
 358           /* See if we have to try all bytes which start multiple collation
 359              elements.
 360              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 361                   collation element, and don't catch 'b' since 'b' is
 362                   the only collation element which starts from 'b' (and
 363                   it is caught by SIMPLE_BRACKET).  */
 364               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 365                   && (cset->ncoll_syms || cset->nranges))
 366                 {
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # endif /* _LIBC */
 374
 375           /* See if we have to start the match at all multibyte characters,
 376              i.e. where we would not find an invalid sequence.  This only
 377              applies to multibyte character sets; for single byte character
 378              sets, the SIMPLE_BRACKET again suffices.  */
 379           if (dfa->mb_cur_max > 1
 380               && (cset->nchar_classes || cset->non_match
 381 # ifdef _LIBC
 382                   || cset->nequiv_classes
 383 # endif /* _LIBC */
 384                  ))
 385             {
 386               unsigned char c = 0;
 387               do
 388                 {
 389                   mbstate_t mbs;
 390                   memset (&mbs, 0, sizeof (mbs));
 391                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 392                     re_set_fastmap (fastmap, false, (int) c);
 393                 }
 394               while (++c != 0);
 395             }
 396
 397           else
 398             {
 399               /* ... Else catch all bytes which can start the mbchars.  */
 400               for (i = 0; i < cset->nmbchars; ++i)
 401                 {
 402                   char buf[256];
 403                   mbstate_t state;
 404                   memset (&state, '\0', sizeof (state));
 405                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                     {
 409                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 410                           != (size_t) -1)
 411                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 412                     }
 413                 }
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == OP_PERIOD
 418 #ifdef RE_ENABLE_I18N
 419                || type == OP_UTF8_PERIOD
 420 #endif /* RE_ENABLE_I18N */
 421                || type == END_OF_RE)
 422         {
 423           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 424           if (type == END_OF_RE)
 425             bufp->can_be_null = 1;
 426           return;
 427         }
 428     }
 429 }
 430 \f
 431 /* Entry point for POSIX code.  */
 432 /* regcomp takes a regular expression as a string and compiles it.
 433
 434    PREG is a regex_t *.  We do not expect any fields to be initialized,
 435    since POSIX says we shouldn't.  Thus, we set
 436
 437      `buffer' to the compiled pattern;
 438      `used' to the length of the compiled pattern;
 439      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 441        RE_SYNTAX_POSIX_BASIC;
 442      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 443      `fastmap' to an allocated space for the fastmap;
 444      `fastmap_accurate' to zero;
 445      `re_nsub' to the number of subexpressions in PATTERN.
 446
 447    PATTERN is the address of the pattern string.
 448
 449    CFLAGS is a series of bits which affect compilation.
 450
 451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 452      use POSIX basic syntax.
 453
 454      If REG_NEWLINE is set, then . and [^...] don't match newline.
 455      Also, regexec will try a match beginning after every newline.
 456
 457      If REG_ICASE is set, then we considers upper- and lowercase
 458      versions of letters to be equivalent when matching.
 459
 460      If REG_NOSUB is set, then when PREG is passed to regexec, that
 461      routine will report only success or failure, and nothing about the
 462      registers.
 463
 464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 465    the return codes and their meanings.)  */
 466
 467 int
 468 regcomp (preg, pattern, cflags)
 469     regex_t *__restrict preg;
 470     const char *__restrict pattern;
 471     int cflags;
 472 {
 473   reg_errcode_t ret;
 474   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 475                          : RE_SYNTAX_POSIX_BASIC);
 476
 477   preg->buffer = NULL;
 478   preg->allocated = 0;
 479   preg->used = 0;
 480
 481   /* Try to allocate space for the fastmap.  */
 482   preg->fastmap = re_malloc (char, SBC_MAX);
 483   if (BE (preg->fastmap == NULL, 0))
 484     return REG_ESPACE;
 485
 486   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 487
 488   /* If REG_NEWLINE is set, newlines are treated differently.  */
 489   if (cflags & REG_NEWLINE)
 490     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 491       syntax &= ~RE_DOT_NEWLINE;
 492       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 493       /* It also changes the matching behavior.  */
 494       preg->newline_anchor = 1;
 495     }
 496   else
 497     preg->newline_anchor = 0;
 498   preg->no_sub = !!(cflags & REG_NOSUB);
 499   preg->translate = NULL;
 500
 501   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 502
 503   /* POSIX doesn't distinguish between an unmatched open-group and an
 504      unmatched close-group: both are REG_EPAREN.  */
 505   if (ret == REG_ERPAREN)
 506     ret = REG_EPAREN;
 507
 508   /* We have already checked preg->fastmap != NULL.  */
 509   if (BE (ret == REG_NOERROR, 1))
 510     /* Compute the fastmap now, since regexec cannot modify the pattern
 511        buffer.  This function never fails in this implementation.  */
 512     (void) re_compile_fastmap (preg);
 513   else
 514     {
 515       /* Some error occurred while compiling the expression.  */
 516       re_free (preg->fastmap);
 517       preg->fastmap = NULL;
 518     }
 519
 520   return (int) ret;
 521 }
 522 #ifdef _LIBC
 523 weak_alias (__regcomp, regcomp)
 524 #endif
 525
 526 /* Returns a message corresponding to an error code, ERRCODE, returned
 527    from either regcomp or regexec.   We don't use PREG here.  */
 528
 529 size_t
 530 regerror (errcode, preg, errbuf, errbuf_size)
 531     int errcode;
 532     const regex_t *__restrict preg;
 533     char *__restrict errbuf;
 534     size_t errbuf_size;
 535 {
 536   const char *msg;
 537   size_t msg_size;
 538
 539   if (BE (errcode < 0
 540           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 541                                / sizeof (__re_error_msgid_idx[0])), 0))
 542     /* Only error codes returned by the rest of the code should be passed
 543        to this routine.  If we are given anything else, or if other regex
 544        code generates an invalid error code, then the program has a bug.
 545        Dump core so we can fix it.  */
 546     abort ();
 547
 548   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 549
 550   msg_size = strlen (msg) + 1; /* Includes the null.  */
 551
 552   if (BE (errbuf_size != 0, 1))
 553     {
 554       if (BE (msg_size > errbuf_size, 0))
 555         {
 556 #if defined HAVE_MEMPCPY || defined _LIBC
 557           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 558 #else
 559           memcpy (errbuf, msg, errbuf_size - 1);
 560           errbuf[errbuf_size - 1] = 0;
 561 #endif
 562         }
 563       else
 564         memcpy (errbuf, msg, msg_size);
 565     }
 566
 567   return msg_size;
 568 }
 569 #ifdef _LIBC
 570 weak_alias (__regerror, regerror)
 571 #endif
 572
 573
 574 #ifdef RE_ENABLE_I18N
 575 /* This static array is used for the map to single-byte characters when
 576    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 577    it the same all the time.  UTF-8 is the preferred encoding so this is
 578    a worthwhile optimization.  */
 579 static const bitset_t utf8_sb_map =
 580 {
 581   /* Set the first 128 bits.  */
 582   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 583 };
 584 #endif
 585
 586
 587 static void
 588 free_dfa_content (re_dfa_t *dfa)
 589 {
 590   int i, j;
 591
 592   if (dfa->nodes)
 593     for (i = 0; i < dfa->nodes_len; ++i)
 594       free_token (dfa->nodes + i);
 595   re_free (dfa->nexts);
 596   for (i = 0; i < dfa->nodes_len; ++i)
 597     {
 598       if (dfa->eclosures != NULL)
 599         re_node_set_free (dfa->eclosures + i);
 600       if (dfa->inveclosures != NULL)
 601         re_node_set_free (dfa->inveclosures + i);
 602       if (dfa->edests != NULL)
 603         re_node_set_free (dfa->edests + i);
 604     }
 605   re_free (dfa->edests);
 606   re_free (dfa->eclosures);
 607   re_free (dfa->inveclosures);
 608   re_free (dfa->nodes);
 609
 610   if (dfa->state_table)
 611     for (i = 0; i <= dfa->state_hash_mask; ++i)
 612       {
 613         struct re_state_table_entry *entry = dfa->state_table + i;
 614         for (j = 0; j < entry->num; ++j)
 615           {
 616             re_dfastate_t *state = entry->array[j];
 617             free_state (state);
 618           }
 619         re_free (entry->array);
 620       }
 621   re_free (dfa->state_table);
 622 #ifdef RE_ENABLE_I18N
 623   if (dfa->sb_char != utf8_sb_map)
 624     re_free (dfa->sb_char);
 625 #endif
 626   re_free (dfa->subexp_map);
 627 #ifdef DEBUG
 628   re_free (dfa->re_str);
 629 #endif
 630
 631   re_free (dfa);
 632 }
 633
 634
 635 /* Free dynamically allocated space used by PREG.  */
 636
 637 void
 638 regfree (preg)
 639     regex_t *preg;
 640 {
 641   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 642   if (BE (dfa != NULL, 1))
 643     free_dfa_content (dfa);
 644   preg->buffer = NULL;
 645   preg->allocated = 0;
 646
 647   re_free (preg->fastmap);
 648   preg->fastmap = NULL;
 649
 650   re_free (preg->translate);
 651   preg->translate = NULL;
 652 }
 653 #ifdef _LIBC
 654 weak_alias (__regfree, regfree)
 655 #endif
 656 \f
 657 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 658    them unless specifically requested.  */
 659
 660 #if defined _REGEX_RE_COMP || defined _LIBC
 661
 662 /* BSD has one and only one pattern buffer.  */
 663 static struct re_pattern_buffer re_comp_buf;
 664
 665 char *
 666 # ifdef _LIBC
 667 /* Make these definitions weak in libc, so POSIX programs can redefine
 668    these names if they don't use our functions, and still use
 669    regcomp/regexec above without link errors.  */
 670 weak_function
 671 # endif
 672 re_comp (s)
 673      const char *s;
 674 {
 675   reg_errcode_t ret;
 676   char *fastmap;
 677
 678   if (!s)
 679     {
 680       if (!re_comp_buf.buffer)
 681         return gettext ("No previous regular expression");
 682       return 0;
 683     }
 684
 685   if (re_comp_buf.buffer)
 686     {
 687       fastmap = re_comp_buf.fastmap;
 688       re_comp_buf.fastmap = NULL;
 689       __regfree (&re_comp_buf);
 690       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 691       re_comp_buf.fastmap = fastmap;
 692     }
 693
 694   if (re_comp_buf.fastmap == NULL)
 695     {
 696       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 697       if (re_comp_buf.fastmap == NULL)
 698         return (char *) gettext (__re_error_msgid
 699                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 700     }
 701
 702   /* Since `re_exec' always passes NULL for the `regs' argument, we
 703      don't need to initialize the pattern buffer fields which affect it.  */
 704
 705   /* Match anchors at newlines.  */
 706   re_comp_buf.newline_anchor = 1;
 707
 708   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 709
 710   if (!ret)
 711     return NULL;
 712
 713   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 714   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 715 }
 716
 717 #ifdef _LIBC
 718 libc_freeres_fn (free_mem)
 719 {
 720   __regfree (&re_comp_buf);
 721 }
 722 #endif
 723
 724 #endif /* _REGEX_RE_COMP */
 725 \f
 726 /* Internal entry point.
 727    Compile the regular expression PATTERN, whose length is LENGTH.
 728    SYNTAX indicate regular expression's syntax.  */
 729
 730 static reg_errcode_t
 731 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 732                      reg_syntax_t syntax)
 733 {
 734   reg_errcode_t err = REG_NOERROR;
 735   re_dfa_t *dfa;
 736   re_string_t regexp;
 737
 738   /* Initialize the pattern buffer.  */
 739   preg->fastmap_accurate = 0;
 740   preg->syntax = syntax;
 741   preg->not_bol = preg->not_eol = 0;
 742   preg->used = 0;
 743   preg->re_nsub = 0;
 744   preg->can_be_null = 0;
 745   preg->regs_allocated = REGS_UNALLOCATED;
 746
 747   /* Initialize the dfa.  */
 748   dfa = (re_dfa_t *) preg->buffer;
 749   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 750     {
 751       /* If zero allocated, but buffer is non-null, try to realloc
 752          enough space.  This loses if buffer's address is bogus, but
 753          that is the user's responsibility.  If ->buffer is NULL this
 754          is a simple allocation.  */
 755       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 756       if (dfa == NULL)
 757         return REG_ESPACE;
 758       preg->allocated = sizeof (re_dfa_t);
 759       preg->buffer = (unsigned char *) dfa;
 760     }
 761   preg->used = sizeof (re_dfa_t);
 762
 763   err = init_dfa (dfa, length);
 764   if (BE (err != REG_NOERROR, 0))
 765     {
 766       free_dfa_content (dfa);
 767       preg->buffer = NULL;
 768       preg->allocated = 0;
 769       return err;
 770     }
 771 #ifdef DEBUG
 772   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 773   dfa->re_str = re_malloc (char, length + 1);
 774   strncpy (dfa->re_str, pattern, length + 1);
 775 #endif
 776
 777   __libc_lock_init (dfa->lock);
 778
 779   err = re_string_construct (&regexp, pattern, length, preg->translate,
 780                              syntax & RE_ICASE, dfa);
 781   if (BE (err != REG_NOERROR, 0))
 782     {
 783     re_compile_internal_free_return:
 784       free_workarea_compile (preg);
 785       re_string_destruct (&regexp);
 786       free_dfa_content (dfa);
 787       preg->buffer = NULL;
 788       preg->allocated = 0;
 789       return err;
 790     }
 791
 792   /* Parse the regular expression, and build a structure tree.  */
 793   preg->re_nsub = 0;
 794   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 795   if (BE (dfa->str_tree == NULL, 0))
 796     goto re_compile_internal_free_return;
 797
 798   /* Analyze the tree and create the nfa.  */
 799   err = analyze (preg);
 800   if (BE (err != REG_NOERROR, 0))
 801     goto re_compile_internal_free_return;
 802
 803 #ifdef RE_ENABLE_I18N
 804   /* If possible, do searching in single byte encoding to speed things up.  */
 805   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 806     optimize_utf8 (dfa);
 807 #endif
 808
 809   /* Then create the initial state of the dfa.  */
 810   err = create_initial_state (dfa);
 811
 812   /* Release work areas.  */
 813   free_workarea_compile (preg);
 814   re_string_destruct (&regexp);
 815
 816   if (BE (err != REG_NOERROR, 0))
 817     {
 818       free_dfa_content (dfa);
 819       preg->buffer = NULL;
 820       preg->allocated = 0;
 821     }
 822
 823   return err;
 824 }
 825
 826 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 827    as the initial length of some arrays.  */
 828
 829 static reg_errcode_t
 830 init_dfa (re_dfa_t *dfa, size_t pat_len)
 831 {
 832   unsigned int table_size;
 833 #ifndef _LIBC
 834   char *codeset_name;
 835 #endif
 836
 837   memset (dfa, '\0', sizeof (re_dfa_t));
 838
 839   /* Force allocation of str_tree_storage the first time.  */
 840   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 841
 842   /* Avoid overflows.  */
 843   if (pat_len == SIZE_MAX)
 844     return REG_ESPACE;
 845
 846   dfa->nodes_alloc = pat_len + 1;
 847   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 848
 849   /*  table_size = 2 ^ ceil(log pat_len) */
 850   for (table_size = 1; ; table_size <<= 1)
 851     if (table_size > pat_len)
 852       break;
 853
 854   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 855   dfa->state_hash_mask = table_size - 1;
 856
 857   dfa->mb_cur_max = MB_CUR_MAX;
 858 #ifdef _LIBC
 859   if (dfa->mb_cur_max == 6
 860       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 861     dfa->is_utf8 = 1;
 862   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 863                        != 0);
 864 #else
 865 # ifdef HAVE_LANGINFO_CODESET
 866   codeset_name = nl_langinfo (CODESET);
 867 # else
 868   codeset_name = getenv ("LC_ALL");
 869   if (codeset_name == NULL || codeset_name[0] == '\0')
 870     codeset_name = getenv ("LC_CTYPE");
 871   if (codeset_name == NULL || codeset_name[0] == '\0')
 872     codeset_name = getenv ("LANG");
 873   if (codeset_name == NULL)
 874     codeset_name = "";
 875   else if (strchr (codeset_name, '.') !=  NULL)
 876     codeset_name = strchr (codeset_name, '.') + 1;
 877 # endif
 878
 879   if (strcasecmp (codeset_name, "UTF-8") == 0
 880       || strcasecmp (codeset_name, "UTF8") == 0)
 881     dfa->is_utf8 = 1;
 882
 883   /* We check exhaustively in the loop below if this charset is a
 884      superset of ASCII.  */
 885   dfa->map_notascii = 0;
 886 #endif
 887
 888 #ifdef RE_ENABLE_I18N
 889   if (dfa->mb_cur_max > 1)
 890     {
 891       if (dfa->is_utf8)
 892         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 893       else
 894         {
 895           int i, j, ch;
 896
 897           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 898           if (BE (dfa->sb_char == NULL, 0))
 899             return REG_ESPACE;
 900
 901           /* Set the bits corresponding to single byte chars.  */
 902           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 903             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 904               {
 905                 wint_t wch = __btowc (ch);
 906                 if (wch != WEOF)
 907                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 908 # ifndef _LIBC
 909                 if (isascii (ch) && wch != ch)
 910                   dfa->map_notascii = 1;
 911 # endif
 912               }
 913         }
 914     }
 915 #endif
 916
 917   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 918     return REG_ESPACE;
 919   return REG_NOERROR;
 920 }
 921
 922 /* Initialize WORD_CHAR table, which indicate which character is
 923    "word".  In this case "word" means that it is the word construction
 924    character used by some operators like "\<", "\>", etc.  */
 925
 926 static void
 927 internal_function
 928 init_word_char (re_dfa_t *dfa)
 929 {
 930   int i, j, ch;
 931   dfa->word_ops_used = 1;
 932   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 933     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 934       if (isalnum (ch) || ch == '_')
 935         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 936 }
 937
 938 /* Free the work area which are only used while compiling.  */
 939
 940 static void
 941 free_workarea_compile (regex_t *preg)
 942 {
 943   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 944   bin_tree_storage_t *storage, *next;
 945   for (storage = dfa->str_tree_storage; storage; storage = next)
 946     {
 947       next = storage->next;
 948       re_free (storage);
 949     }
 950   dfa->str_tree_storage = NULL;
 951   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 952   dfa->str_tree = NULL;
 953   re_free (dfa->org_indices);
 954   dfa->org_indices = NULL;
 955 }
 956
 957 /* Create initial states for all contexts.  */
 958
 959 static reg_errcode_t
 960 create_initial_state (re_dfa_t *dfa)
 961 {
 962   int first, i;
 963   reg_errcode_t err;
 964   re_node_set init_nodes;
 965
 966   /* Initial states have the epsilon closure of the node which is
 967      the first node of the regular expression.  */
 968   first = dfa->str_tree->first->node_idx;
 969   dfa->init_node = first;
 970   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 971   if (BE (err != REG_NOERROR, 0))
 972     return err;
 973
 974   /* The back-references which are in initial states can epsilon transit,
 975      since in this case all of the subexpressions can be null.
 976      Then we add epsilon closures of the nodes which are the next nodes of
 977      the back-references.  */
 978   if (dfa->nbackref > 0)
 979     for (i = 0; i < init_nodes.nelem; ++i)
 980       {
 981         int node_idx = init_nodes.elems[i];
 982         re_token_type_t type = dfa->nodes[node_idx].type;
 983
 984         int clexp_idx;
 985         if (type != OP_BACK_REF)
 986           continue;
 987         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 988           {
 989             re_token_t *clexp_node;
 990             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 991             if (clexp_node->type == OP_CLOSE_SUBEXP
 992                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 993               break;
 994           }
 995         if (clexp_idx == init_nodes.nelem)
 996           continue;
 997
 998         if (type == OP_BACK_REF)
 999           {
1000             int dest_idx = dfa->edests[node_idx].elems[0];
1001             if (!re_node_set_contains (&init_nodes, dest_idx))
1002               {
1003                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1004                 i = 0;
1005               }
1006           }
1007       }
1008
1009   /* It must be the first time to invoke acquire_state.  */
1010   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1011   /* We don't check ERR here, since the initial state must not be NULL.  */
1012   if (BE (dfa->init_state == NULL, 0))
1013     return err;
1014   if (dfa->init_state->has_constraint)
1015     {
1016       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1017                                                        CONTEXT_WORD);
1018       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1019                                                      CONTEXT_NEWLINE);
1020       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1021                                                          &init_nodes,
1022                                                          CONTEXT_NEWLINE
1023                                                          | CONTEXT_BEGBUF);
1024       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1025               || dfa->init_state_begbuf == NULL, 0))
1026         return err;
1027     }
1028   else
1029     dfa->init_state_word = dfa->init_state_nl
1030       = dfa->init_state_begbuf = dfa->init_state;
1031
1032   re_node_set_free (&init_nodes);
1033   return REG_NOERROR;
1034 }
1035 \f
1036 #ifdef RE_ENABLE_I18N
1037 /* If it is possible to do searching in single byte encoding instead of UTF-8
1038    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1039    DFA nodes where needed.  */
1040
1041 static void
1042 optimize_utf8 (re_dfa_t *dfa)
1043 {
1044   int node, i, mb_chars = 0, has_period = 0;
1045
1046   for (node = 0; node < dfa->nodes_len; ++node)
1047     switch (dfa->nodes[node].type)
1048       {
1049       case CHARACTER:
1050         if (dfa->nodes[node].opr.c >= 0x80)
1051           mb_chars = 1;
1052         break;
1053       case ANCHOR:
1054         switch (dfa->nodes[node].opr.ctx_type)
1055           {
1056           case LINE_FIRST:
1057           case LINE_LAST:
1058           case BUF_FIRST:
1059           case BUF_LAST:
1060             break;
1061           default:
1062             /* Word anchors etc. cannot be handled.  It's okay to test
1063                opr.ctx_type since constraints (for all DFA nodes) are
1064                created by ORing one or more opr.ctx_type values.  */
1065             return;
1066           }
1067         break;
1068       case OP_PERIOD:
1069         has_period = 1;
1070         break;
1071       case OP_BACK_REF:
1072       case OP_ALT:
1073       case END_OF_RE:
1074       case OP_DUP_ASTERISK:
1075       case OP_OPEN_SUBEXP:
1076       case OP_CLOSE_SUBEXP:
1077         break;
1078       case COMPLEX_BRACKET:
1079         return;
1080       case SIMPLE_BRACKET:
1081         /* Just double check.  The non-ASCII range starts at 0x80.  */
1082         assert (0x80 % BITSET_WORD_BITS == 0);
1083         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1084           if (dfa->nodes[node].opr.sbcset[i])
1085             return;
1086         break;
1087       default:
1088         abort ();
1089       }
1090
1091   if (mb_chars || has_period)
1092     for (node = 0; node < dfa->nodes_len; ++node)
1093       {
1094         if (dfa->nodes[node].type == CHARACTER
1095             && dfa->nodes[node].opr.c >= 0x80)
1096           dfa->nodes[node].mb_partial = 0;
1097         else if (dfa->nodes[node].type == OP_PERIOD)
1098           dfa->nodes[node].type = OP_UTF8_PERIOD;
1099       }
1100
1101   /* The search can be in single byte locale.  */
1102   dfa->mb_cur_max = 1;
1103   dfa->is_utf8 = 0;
1104   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1105 }
1106 #endif
1107 \f
1108 /* Analyze the structure tree, and calculate "first", "next", "edest",
1109    "eclosure", and "inveclosure".  */
1110
1111 static reg_errcode_t
1112 analyze (regex_t *preg)
1113 {
1114   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1115   reg_errcode_t ret;
1116
1117   /* Allocate arrays.  */
1118   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1119   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1120   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1121   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1122   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1123           || dfa->eclosures == NULL, 0))
1124     return REG_ESPACE;
1125
1126   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1127   if (dfa->subexp_map != NULL)
1128     {
1129       int i;
1130       for (i = 0; i < preg->re_nsub; i++)
1131         dfa->subexp_map[i] = i;
1132       preorder (dfa->str_tree, optimize_subexps, dfa);
1133       for (i = 0; i < preg->re_nsub; i++)
1134         if (dfa->subexp_map[i] != i)
1135           break;
1136       if (i == preg->re_nsub)
1137         {
1138           free (dfa->subexp_map);
1139           dfa->subexp_map = NULL;
1140         }
1141     }
1142
1143   ret = postorder (dfa->str_tree, lower_subexps, preg);
1144   if (BE (ret != REG_NOERROR, 0))
1145     return ret;
1146   ret = postorder (dfa->str_tree, calc_first, dfa);
1147   if (BE (ret != REG_NOERROR, 0))
1148     return ret;
1149   preorder (dfa->str_tree, calc_next, dfa);
1150   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1151   if (BE (ret != REG_NOERROR, 0))
1152     return ret;
1153   ret = calc_eclosure (dfa);
1154   if (BE (ret != REG_NOERROR, 0))
1155     return ret;
1156
1157   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1158      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1159   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1160       || dfa->nbackref)
1161     {
1162       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1163       if (BE (dfa->inveclosures == NULL, 0))
1164         return REG_ESPACE;
1165       ret = calc_inveclosure (dfa);
1166     }
1167
1168   return ret;
1169 }
1170
1171 /* Our parse trees are very unbalanced, so we cannot use a stack to
1172    implement parse tree visits.  Instead, we use parent pointers and
1173    some hairy code in these two functions.  */
1174 static reg_errcode_t
1175 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176            void *extra)
1177 {
1178   bin_tree_t *node, *prev;
1179
1180   for (node = root; ; )
1181     {
1182       /* Descend down the tree, preferably to the left (or to the right
1183          if that's the only child).  */
1184       while (node->left || node->right)
1185         if (node->left)
1186           node = node->left;
1187         else
1188           node = node->right;
1189
1190       do
1191         {
1192           reg_errcode_t err = fn (extra, node);
1193           if (BE (err != REG_NOERROR, 0))
1194             return err;
1195           if (node->parent == NULL)
1196             return REG_NOERROR;
1197           prev = node;
1198           node = node->parent;
1199         }
1200       /* Go up while we have a node that is reached from the right.  */
1201       while (node->right == prev || node->right == NULL);
1202       node = node->right;
1203     }
1204 }
1205
1206 static reg_errcode_t
1207 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208           void *extra)
1209 {
1210   bin_tree_t *node;
1211
1212   for (node = root; ; )
1213     {
1214       reg_errcode_t err = fn (extra, node);
1215       if (BE (err != REG_NOERROR, 0))
1216         return err;
1217
1218       /* Go to the left node, or up and to the right.  */
1219       if (node->left)
1220         node = node->left;
1221       else
1222         {
1223           bin_tree_t *prev = NULL;
1224           while (node->right == prev || node->right == NULL)
1225             {
1226               prev = node;
1227               node = node->parent;
1228               if (!node)
1229                 return REG_NOERROR;
1230             }
1231           node = node->right;
1232         }
1233     }
1234 }
1235
1236 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1237    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1238    backreferences as well.  Requires a preorder visit.  */
1239 static reg_errcode_t
1240 optimize_subexps (void *extra, bin_tree_t *node)
1241 {
1242   re_dfa_t *dfa = (re_dfa_t *) extra;
1243
1244   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1245     {
1246       int idx = node->token.opr.idx;
1247       node->token.opr.idx = dfa->subexp_map[idx];
1248       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1249     }
1250
1251   else if (node->token.type == SUBEXP
1252            && node->left && node->left->token.type == SUBEXP)
1253     {
1254       int other_idx = node->left->token.opr.idx;
1255
1256       node->left = node->left->left;
1257       if (node->left)
1258         node->left->parent = node;
1259
1260       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1261       if (other_idx < BITSET_WORD_BITS)
1262           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1263     }
1264
1265   return REG_NOERROR;
1266 }
1267
1268 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1269    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1270 static reg_errcode_t
1271 lower_subexps (void *extra, bin_tree_t *node)
1272 {
1273   regex_t *preg = (regex_t *) extra;
1274   reg_errcode_t err = REG_NOERROR;
1275
1276   if (node->left && node->left->token.type == SUBEXP)
1277     {
1278       node->left = lower_subexp (&err, preg, node->left);
1279       if (node->left)
1280         node->left->parent = node;
1281     }
1282   if (node->right && node->right->token.type == SUBEXP)
1283     {
1284       node->right = lower_subexp (&err, preg, node->right);
1285       if (node->right)
1286         node->right->parent = node;
1287     }
1288
1289   return err;
1290 }
1291
1292 static bin_tree_t *
1293 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1294 {
1295   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1296   bin_tree_t *body = node->left;
1297   bin_tree_t *op, *cls, *tree1, *tree;
1298
1299   if (preg->no_sub
1300       /* We do not optimize empty subexpressions, because otherwise we may
1301          have bad CONCAT nodes with NULL children.  This is obviously not
1302          very common, so we do not lose much.  An example that triggers
1303          this case is the sed "script" /\(\)/x.  */
1304       && node->left != NULL
1305       && (node->token.opr.idx >= BITSET_WORD_BITS
1306           || !(dfa->used_bkref_map
1307                & ((bitset_word_t) 1 << node->token.opr.idx))))
1308     return node->left;
1309
1310   /* Convert the SUBEXP node to the concatenation of an
1311      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1312   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1313   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1314   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1315   tree = create_tree (dfa, op, tree1, CONCAT);
1316   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1317     {
1318       *err = REG_ESPACE;
1319       return NULL;
1320     }
1321
1322   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1323   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1324   return tree;
1325 }
1326
1327 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1328    nodes.  Requires a postorder visit.  */
1329 static reg_errcode_t
1330 calc_first (void *extra, bin_tree_t *node)
1331 {
1332   re_dfa_t *dfa = (re_dfa_t *) extra;
1333   if (node->token.type == CONCAT)
1334     {
1335       node->first = node->left->first;
1336       node->node_idx = node->left->node_idx;
1337     }
1338   else
1339     {
1340       node->first = node;
1341       node->node_idx = re_dfa_add_node (dfa, node->token);
1342       if (BE (node->node_idx == -1, 0))
1343         return REG_ESPACE;
1344       if (node->token.type == ANCHOR)
1345         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1346     }
1347   return REG_NOERROR;
1348 }
1349
1350 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1351 static reg_errcode_t
1352 calc_next (void *extra, bin_tree_t *node)
1353 {
1354   switch (node->token.type)
1355     {
1356     case OP_DUP_ASTERISK:
1357       node->left->next = node;
1358       break;
1359     case CONCAT:
1360       node->left->next = node->right->first;
1361       node->right->next = node->next;
1362       break;
1363     default:
1364       if (node->left)
1365         node->left->next = node->next;
1366       if (node->right)
1367         node->right->next = node->next;
1368       break;
1369     }
1370   return REG_NOERROR;
1371 }
1372
1373 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1374 static reg_errcode_t
1375 link_nfa_nodes (void *extra, bin_tree_t *node)
1376 {
1377   re_dfa_t *dfa = (re_dfa_t *) extra;
1378   int idx = node->node_idx;
1379   reg_errcode_t err = REG_NOERROR;
1380
1381   switch (node->token.type)
1382     {
1383     case CONCAT:
1384       break;
1385
1386     case END_OF_RE:
1387       assert (node->next == NULL);
1388       break;
1389
1390     case OP_DUP_ASTERISK:
1391     case OP_ALT:
1392       {
1393         int left, right;
1394         dfa->has_plural_match = 1;
1395         if (node->left != NULL)
1396           left = node->left->first->node_idx;
1397         else
1398           left = node->next->node_idx;
1399         if (node->right != NULL)
1400           right = node->right->first->node_idx;
1401         else
1402           right = node->next->node_idx;
1403         assert (left > -1);
1404         assert (right > -1);
1405         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1406       }
1407       break;
1408
1409     case ANCHOR:
1410     case OP_OPEN_SUBEXP:
1411     case OP_CLOSE_SUBEXP:
1412       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1413       break;
1414
1415     case OP_BACK_REF:
1416       dfa->nexts[idx] = node->next->node_idx;
1417       if (node->token.type == OP_BACK_REF)
1418         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1419       break;
1420
1421     default:
1422       assert (!IS_EPSILON_NODE (node->token.type));
1423       dfa->nexts[idx] = node->next->node_idx;
1424       break;
1425     }
1426
1427   return err;
1428 }
1429
1430 /* Duplicate the epsilon closure of the node ROOT_NODE.
1431    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1432    to their own constraint.  */
1433
1434 static reg_errcode_t
1435 internal_function
1436 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1437                         int root_node, unsigned int init_constraint)
1438 {
1439   int org_node, clone_node, ret;
1440   unsigned int constraint = init_constraint;
1441   for (org_node = top_org_node, clone_node = top_clone_node;;)
1442     {
1443       int org_dest, clone_dest;
1444       if (dfa->nodes[org_node].type == OP_BACK_REF)
1445         {
1446           /* If the back reference epsilon-transit, its destination must
1447              also have the constraint.  Then duplicate the epsilon closure
1448              of the destination of the back reference, and store it in
1449              edests of the back reference.  */
1450           org_dest = dfa->nexts[org_node];
1451           re_node_set_empty (dfa->edests + clone_node);
1452           clone_dest = duplicate_node (dfa, org_dest, constraint);
1453           if (BE (clone_dest == -1, 0))
1454             return REG_ESPACE;
1455           dfa->nexts[clone_node] = dfa->nexts[org_node];
1456           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457           if (BE (ret < 0, 0))
1458             return REG_ESPACE;
1459         }
1460       else if (dfa->edests[org_node].nelem == 0)
1461         {
1462           /* In case of the node can't epsilon-transit, don't duplicate the
1463              destination and store the original destination as the
1464              destination of the node.  */
1465           dfa->nexts[clone_node] = dfa->nexts[org_node];
1466           break;
1467         }
1468       else if (dfa->edests[org_node].nelem == 1)
1469         {
1470           /* In case of the node can epsilon-transit, and it has only one
1471              destination.  */
1472           org_dest = dfa->edests[org_node].elems[0];
1473           re_node_set_empty (dfa->edests + clone_node);
1474           /* If the node is root_node itself, it means the epsilon clsoure
1475              has a loop.   Then tie it to the destination of the root_node.  */
1476           if (org_node == root_node && clone_node != org_node)
1477             {
1478               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1479               if (BE (ret < 0, 0))
1480                 return REG_ESPACE;
1481               break;
1482             }
1483           /* In case of the node has another constraint, add it.  */
1484           constraint |= dfa->nodes[org_node].constraint;
1485           clone_dest = duplicate_node (dfa, org_dest, constraint);
1486           if (BE (clone_dest == -1, 0))
1487             return REG_ESPACE;
1488           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489           if (BE (ret < 0, 0))
1490             return REG_ESPACE;
1491         }
1492       else /* dfa->edests[org_node].nelem == 2 */
1493         {
1494           /* In case of the node can epsilon-transit, and it has two
1495              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1496           org_dest = dfa->edests[org_node].elems[0];
1497           re_node_set_empty (dfa->edests + clone_node);
1498           /* Search for a duplicated node which satisfies the constraint.  */
1499           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1500           if (clone_dest == -1)
1501             {
1502               /* There is no such duplicated node, create a new one.  */
1503               reg_errcode_t err;
1504               clone_dest = duplicate_node (dfa, org_dest, constraint);
1505               if (BE (clone_dest == -1, 0))
1506                 return REG_ESPACE;
1507               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508               if (BE (ret < 0, 0))
1509                 return REG_ESPACE;
1510               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1511                                             root_node, constraint);
1512               if (BE (err != REG_NOERROR, 0))
1513                 return err;
1514             }
1515           else
1516             {
1517               /* There is a duplicated node which satisfies the constraint,
1518                  use it to avoid infinite loop.  */
1519               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520               if (BE (ret < 0, 0))
1521                 return REG_ESPACE;
1522             }
1523
1524           org_dest = dfa->edests[org_node].elems[1];
1525           clone_dest = duplicate_node (dfa, org_dest, constraint);
1526           if (BE (clone_dest == -1, 0))
1527             return REG_ESPACE;
1528           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529           if (BE (ret < 0, 0))
1530             return REG_ESPACE;
1531         }
1532       org_node = org_dest;
1533       clone_node = clone_dest;
1534     }
1535   return REG_NOERROR;
1536 }
1537
1538 /* Search for a node which is duplicated from the node ORG_NODE, and
1539    satisfies the constraint CONSTRAINT.  */
1540
1541 static int
1542 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1543                         unsigned int constraint)
1544 {
1545   int idx;
1546   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1547     {
1548       if (org_node == dfa->org_indices[idx]
1549           && constraint == dfa->nodes[idx].constraint)
1550         return idx; /* Found.  */
1551     }
1552   return -1; /* Not found.  */
1553 }
1554
1555 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1556    Return the index of the new node, or -1 if insufficient storage is
1557    available.  */
1558
1559 static int
1560 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1561 {
1562   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1563   if (BE (dup_idx != -1, 1))
1564     {
1565       dfa->nodes[dup_idx].constraint = constraint;
1566       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1567       dfa->nodes[dup_idx].duplicated = 1;
1568
1569       /* Store the index of the original node.  */
1570       dfa->org_indices[dup_idx] = org_idx;
1571     }
1572   return dup_idx;
1573 }
1574
1575 static reg_errcode_t
1576 calc_inveclosure (re_dfa_t *dfa)
1577 {
1578   int src, idx, ret;
1579   for (idx = 0; idx < dfa->nodes_len; ++idx)
1580     re_node_set_init_empty (dfa->inveclosures + idx);
1581
1582   for (src = 0; src < dfa->nodes_len; ++src)
1583     {
1584       int *elems = dfa->eclosures[src].elems;
1585       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1586         {
1587           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1588           if (BE (ret == -1, 0))
1589             return REG_ESPACE;
1590         }
1591     }
1592
1593   return REG_NOERROR;
1594 }
1595
1596 /* Calculate "eclosure" for all the node in DFA.  */
1597
1598 static reg_errcode_t
1599 calc_eclosure (re_dfa_t *dfa)
1600 {
1601   int node_idx, incomplete;
1602 #ifdef DEBUG
1603   assert (dfa->nodes_len > 0);
1604 #endif
1605   incomplete = 0;
1606   /* For each nodes, calculate epsilon closure.  */
1607   for (node_idx = 0; ; ++node_idx)
1608     {
1609       reg_errcode_t err;
1610       re_node_set eclosure_elem;
1611       if (node_idx == dfa->nodes_len)
1612         {
1613           if (!incomplete)
1614             break;
1615           incomplete = 0;
1616           node_idx = 0;
1617         }
1618
1619 #ifdef DEBUG
1620       assert (dfa->eclosures[node_idx].nelem != -1);
1621 #endif
1622
1623       /* If we have already calculated, skip it.  */
1624       if (dfa->eclosures[node_idx].nelem != 0)
1625         continue;
1626       /* Calculate epsilon closure of `node_idx'.  */
1627       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1628       if (BE (err != REG_NOERROR, 0))
1629         return err;
1630
1631       if (dfa->eclosures[node_idx].nelem == 0)
1632         {
1633           incomplete = 1;
1634           re_node_set_free (&eclosure_elem);
1635         }
1636     }
1637   return REG_NOERROR;
1638 }
1639
1640 /* Calculate epsilon closure of NODE.  */
1641
1642 static reg_errcode_t
1643 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1644 {
1645   reg_errcode_t err;
1646   int i, incomplete;
1647   re_node_set eclosure;
1648   incomplete = 0;
1649   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650   if (BE (err != REG_NOERROR, 0))
1651     return err;
1652
1653   /* This indicates that we are calculating this node now.
1654      We reference this value to avoid infinite loop.  */
1655   dfa->eclosures[node].nelem = -1;
1656
1657   /* If the current node has constraints, duplicate all nodes
1658      since they must inherit the constraints.  */
1659   if (dfa->nodes[node].constraint
1660       && dfa->edests[node].nelem
1661       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662     {
1663       err = duplicate_node_closure (dfa, node, node, node,
1664                                     dfa->nodes[node].constraint);
1665       if (BE (err != REG_NOERROR, 0))
1666         return err;
1667     }
1668
1669   /* Expand each epsilon destination nodes.  */
1670   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671     for (i = 0; i < dfa->edests[node].nelem; ++i)
1672       {
1673         re_node_set eclosure_elem;
1674         int edest = dfa->edests[node].elems[i];
1675         /* If calculating the epsilon closure of `edest' is in progress,
1676            return intermediate result.  */
1677         if (dfa->eclosures[edest].nelem == -1)
1678           {
1679             incomplete = 1;
1680             continue;
1681           }
1682         /* If we haven't calculated the epsilon closure of `edest' yet,
1683            calculate now. Otherwise use calculated epsilon closure.  */
1684         if (dfa->eclosures[edest].nelem == 0)
1685           {
1686             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687             if (BE (err != REG_NOERROR, 0))
1688               return err;
1689           }
1690         else
1691           eclosure_elem = dfa->eclosures[edest];
1692         /* Merge the epsilon closure of `edest'.  */
1693         re_node_set_merge (&eclosure, &eclosure_elem);
1694         /* If the epsilon closure of `edest' is incomplete,
1695            the epsilon closure of this node is also incomplete.  */
1696         if (dfa->eclosures[edest].nelem == 0)
1697           {
1698             incomplete = 1;
1699             re_node_set_free (&eclosure_elem);
1700           }
1701       }
1702
1703   /* Epsilon closures include itself.  */
1704   re_node_set_insert (&eclosure, node);
1705   if (incomplete && !root)
1706     dfa->eclosures[node].nelem = 0;
1707   else
1708     dfa->eclosures[node] = eclosure;
1709   *new_set = eclosure;
1710   return REG_NOERROR;
1711 }
1712 \f
1713 /* Functions for token which are used in the parser.  */
1714
1715 /* Fetch a token from INPUT.
1716    We must not use this function inside bracket expressions.  */
1717
1718 static void
1719 internal_function
1720 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1721 {
1722   re_string_skip_bytes (input, peek_token (result, input, syntax));
1723 }
1724
1725 /* Peek a token from INPUT, and return the length of the token.
1726    We must not use this function inside bracket expressions.  */
1727
1728 static int
1729 internal_function
1730 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1731 {
1732   unsigned char c;
1733
1734   if (re_string_eoi (input))
1735     {
1736       token->type = END_OF_RE;
1737       return 0;
1738     }
1739
1740   c = re_string_peek_byte (input, 0);
1741   token->opr.c = c;
1742
1743   token->word_char = 0;
1744 #ifdef RE_ENABLE_I18N
1745   token->mb_partial = 0;
1746   if (input->mb_cur_max > 1 &&
1747       !re_string_first_byte (input, re_string_cur_idx (input)))
1748     {
1749       token->type = CHARACTER;
1750       token->mb_partial = 1;
1751       return 1;
1752     }
1753 #endif
1754   if (c == '\\')
1755     {
1756       unsigned char c2;
1757       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1758         {
1759           token->type = BACK_SLASH;
1760           return 1;
1761         }
1762
1763       c2 = re_string_peek_byte_case (input, 1);
1764       token->opr.c = c2;
1765       token->type = CHARACTER;
1766 #ifdef RE_ENABLE_I18N
1767       if (input->mb_cur_max > 1)
1768         {
1769           wint_t wc = re_string_wchar_at (input,
1770                                           re_string_cur_idx (input) + 1);
1771           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1772         }
1773       else
1774 #endif
1775         token->word_char = IS_WORD_CHAR (c2) != 0;
1776
1777       switch (c2)
1778         {
1779         case '|':
1780           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1781             token->type = OP_ALT;
1782           break;
1783         case '1': case '2': case '3': case '4': case '5':
1784         case '6': case '7': case '8': case '9':
1785           if (!(syntax & RE_NO_BK_REFS))
1786             {
1787               token->type = OP_BACK_REF;
1788               token->opr.idx = c2 - '1';
1789             }
1790           break;
1791         case '<':
1792           if (!(syntax & RE_NO_GNU_OPS))
1793             {
1794               token->type = ANCHOR;
1795               token->opr.ctx_type = WORD_FIRST;
1796             }
1797           break;
1798         case '>':
1799           if (!(syntax & RE_NO_GNU_OPS))
1800             {
1801               token->type = ANCHOR;
1802               token->opr.ctx_type = WORD_LAST;
1803             }
1804           break;
1805         case 'b':
1806           if (!(syntax & RE_NO_GNU_OPS))
1807             {
1808               token->type = ANCHOR;
1809               token->opr.ctx_type = WORD_DELIM;
1810             }
1811           break;
1812         case 'B':
1813           if (!(syntax & RE_NO_GNU_OPS))
1814             {
1815               token->type = ANCHOR;
1816               token->opr.ctx_type = NOT_WORD_DELIM;
1817             }
1818           break;
1819         case 'w':
1820           if (!(syntax & RE_NO_GNU_OPS))
1821             token->type = OP_WORD;
1822           break;
1823         case 'W':
1824           if (!(syntax & RE_NO_GNU_OPS))
1825             token->type = OP_NOTWORD;
1826           break;
1827         case 's':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             token->type = OP_SPACE;
1830           break;
1831         case 'S':
1832           if (!(syntax & RE_NO_GNU_OPS))
1833             token->type = OP_NOTSPACE;
1834           break;
1835         case '`':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = BUF_FIRST;
1840             }
1841           break;
1842         case '\'':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = BUF_LAST;
1847             }
1848           break;
1849         case '(':
1850           if (!(syntax & RE_NO_BK_PARENS))
1851             token->type = OP_OPEN_SUBEXP;
1852           break;
1853         case ')':
1854           if (!(syntax & RE_NO_BK_PARENS))
1855             token->type = OP_CLOSE_SUBEXP;
1856           break;
1857         case '+':
1858           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1859             token->type = OP_DUP_PLUS;
1860           break;
1861         case '?':
1862           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1863             token->type = OP_DUP_QUESTION;
1864           break;
1865         case '{':
1866           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1867             token->type = OP_OPEN_DUP_NUM;
1868           break;
1869         case '}':
1870           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1871             token->type = OP_CLOSE_DUP_NUM;
1872           break;
1873         default:
1874           break;
1875         }
1876       return 2;
1877     }
1878
1879   token->type = CHARACTER;
1880 #ifdef RE_ENABLE_I18N
1881   if (input->mb_cur_max > 1)
1882     {
1883       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1884       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1885     }
1886   else
1887 #endif
1888     token->word_char = IS_WORD_CHAR (token->opr.c);
1889
1890   switch (c)
1891     {
1892     case '\n':
1893       if (syntax & RE_NEWLINE_ALT)
1894         token->type = OP_ALT;
1895       break;
1896     case '|':
1897       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1898         token->type = OP_ALT;
1899       break;
1900     case '*':
1901       token->type = OP_DUP_ASTERISK;
1902       break;
1903     case '+':
1904       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1905         token->type = OP_DUP_PLUS;
1906       break;
1907     case '?':
1908       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1909         token->type = OP_DUP_QUESTION;
1910       break;
1911     case '{':
1912       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1913         token->type = OP_OPEN_DUP_NUM;
1914       break;
1915     case '}':
1916       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1917         token->type = OP_CLOSE_DUP_NUM;
1918       break;
1919     case '(':
1920       if (syntax & RE_NO_BK_PARENS)
1921         token->type = OP_OPEN_SUBEXP;
1922       break;
1923     case ')':
1924       if (syntax & RE_NO_BK_PARENS)
1925         token->type = OP_CLOSE_SUBEXP;
1926       break;
1927     case '[':
1928       token->type = OP_OPEN_BRACKET;
1929       break;
1930     case '.':
1931       token->type = OP_PERIOD;
1932       break;
1933     case '^':
1934       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1935           re_string_cur_idx (input) != 0)
1936         {
1937           char prev = re_string_peek_byte (input, -1);
1938           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1939             break;
1940         }
1941       token->type = ANCHOR;
1942       token->opr.ctx_type = LINE_FIRST;
1943       break;
1944     case '$':
1945       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1946           re_string_cur_idx (input) + 1 != re_string_length (input))
1947         {
1948           re_token_t next;
1949           re_string_skip_bytes (input, 1);
1950           peek_token (&next, input, syntax);
1951           re_string_skip_bytes (input, -1);
1952           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1953             break;
1954         }
1955       token->type = ANCHOR;
1956       token->opr.ctx_type = LINE_LAST;
1957       break;
1958     default:
1959       break;
1960     }
1961   return 1;
1962 }
1963
1964 /* Peek a token from INPUT, and return the length of the token.
1965    We must not use this function out of bracket expressions.  */
1966
1967 static int
1968 internal_function
1969 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1970 {
1971   unsigned char c;
1972   if (re_string_eoi (input))
1973     {
1974       token->type = END_OF_RE;
1975       return 0;
1976     }
1977   c = re_string_peek_byte (input, 0);
1978   token->opr.c = c;
1979
1980 #ifdef RE_ENABLE_I18N
1981   if (input->mb_cur_max > 1 &&
1982       !re_string_first_byte (input, re_string_cur_idx (input)))
1983     {
1984       token->type = CHARACTER;
1985       return 1;
1986     }
1987 #endif /* RE_ENABLE_I18N */
1988
1989   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1990       && re_string_cur_idx (input) + 1 < re_string_length (input))
1991     {
1992       /* In this case, '\' escape a character.  */
1993       unsigned char c2;
1994       re_string_skip_bytes (input, 1);
1995       c2 = re_string_peek_byte (input, 0);
1996       token->opr.c = c2;
1997       token->type = CHARACTER;
1998       return 1;
1999     }
2000   if (c == '[') /* '[' is a special char in a bracket exps.  */
2001     {
2002       unsigned char c2;
2003       int token_len;
2004       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2005         c2 = re_string_peek_byte (input, 1);
2006       else
2007         c2 = 0;
2008       token->opr.c = c2;
2009       token_len = 2;
2010       switch (c2)
2011         {
2012         case '.':
2013           token->type = OP_OPEN_COLL_ELEM;
2014           break;
2015         case '=':
2016           token->type = OP_OPEN_EQUIV_CLASS;
2017           break;
2018         case ':':
2019           if (syntax & RE_CHAR_CLASSES)
2020             {
2021               token->type = OP_OPEN_CHAR_CLASS;
2022               break;
2023             }
2024           /* else fall through.  */
2025         default:
2026           token->type = CHARACTER;
2027           token->opr.c = c;
2028           token_len = 1;
2029           break;
2030         }
2031       return token_len;
2032     }
2033   switch (c)
2034     {
2035     case '-':
2036       token->type = OP_CHARSET_RANGE;
2037       break;
2038     case ']':
2039       token->type = OP_CLOSE_BRACKET;
2040       break;
2041     case '^':
2042       token->type = OP_NON_MATCH_LIST;
2043       break;
2044     default:
2045       token->type = CHARACTER;
2046     }
2047   return 1;
2048 }
2049 \f
2050 /* Functions for parser.  */
2051
2052 /* Entry point of the parser.
2053    Parse the regular expression REGEXP and return the structure tree.
2054    If an error is occured, ERR is set by error code, and return NULL.
2055    This function build the following tree, from regular expression <reg_exp>:
2056            CAT
2057            / \
2058           /   \
2059    <reg_exp>  EOR
2060
2061    CAT means concatenation.
2062    EOR means end of regular expression.  */
2063
2064 static bin_tree_t *
2065 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2066        reg_errcode_t *err)
2067 {
2068   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2069   bin_tree_t *tree, *eor, *root;
2070   re_token_t current_token;
2071   dfa->syntax = syntax;
2072   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2074   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075     return NULL;
2076   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2077   if (tree != NULL)
2078     root = create_tree (dfa, tree, eor, CONCAT);
2079   else
2080     root = eor;
2081   if (BE (eor == NULL || root == NULL, 0))
2082     {
2083       *err = REG_ESPACE;
2084       return NULL;
2085     }
2086   return root;
2087 }
2088
2089 /* This function build the following tree, from regular expression
2090    <branch1>|<branch2>:
2091            ALT
2092            / \
2093           /   \
2094    <branch1> <branch2>
2095
2096    ALT means alternative, which represents the operator `|'.  */
2097
2098 static bin_tree_t *
2099 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2100                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2101 {
2102   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103   bin_tree_t *tree, *branch = NULL;
2104   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2105   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106     return NULL;
2107
2108   while (token->type == OP_ALT)
2109     {
2110       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111       if (token->type != OP_ALT && token->type != END_OF_RE
2112           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2113         {
2114           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2115           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2116             return NULL;
2117         }
2118       else
2119         branch = NULL;
2120       tree = create_tree (dfa, tree, branch, OP_ALT);
2121       if (BE (tree == NULL, 0))
2122         {
2123           *err = REG_ESPACE;
2124           return NULL;
2125         }
2126     }
2127   return tree;
2128 }
2129
2130 /* This function build the following tree, from regular expression
2131    <exp1><exp2>:
2132         CAT
2133         / \
2134        /   \
2135    <exp1> <exp2>
2136
2137    CAT means concatenation.  */
2138
2139 static bin_tree_t *
2140 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2142 {
2143   bin_tree_t *tree, *exp;
2144   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2146   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147     return NULL;
2148
2149   while (token->type != OP_ALT && token->type != END_OF_RE
2150          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2151     {
2152       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2153       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2154         {
2155           return NULL;
2156         }
2157       if (tree != NULL && exp != NULL)
2158         {
2159           tree = create_tree (dfa, tree, exp, CONCAT);
2160           if (tree == NULL)
2161             {
2162               *err = REG_ESPACE;
2163               return NULL;
2164             }
2165         }
2166       else if (tree == NULL)
2167         tree = exp;
2168       /* Otherwise exp == NULL, we don't need to create new tree.  */
2169     }
2170   return tree;
2171 }
2172
2173 /* This function build the following tree, from regular expression a*:
2174          *
2175          |
2176          a
2177 */
2178
2179 static bin_tree_t *
2180 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2182 {
2183   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184   bin_tree_t *tree;
2185   switch (token->type)
2186     {
2187     case CHARACTER:
2188       tree = create_token_tree (dfa, NULL, NULL, token);
2189       if (BE (tree == NULL, 0))
2190         {
2191           *err = REG_ESPACE;
2192           return NULL;
2193         }
2194 #ifdef RE_ENABLE_I18N
2195       if (dfa->mb_cur_max > 1)
2196         {
2197           while (!re_string_eoi (regexp)
2198                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2199             {
2200               bin_tree_t *mbc_remain;
2201               fetch_token (token, regexp, syntax);
2202               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2203               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2204               if (BE (mbc_remain == NULL || tree == NULL, 0))
2205                 {
2206                   *err = REG_ESPACE;
2207                   return NULL;
2208                 }
2209             }
2210         }
2211 #endif
2212       break;
2213     case OP_OPEN_SUBEXP:
2214       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2215       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216         return NULL;
2217       break;
2218     case OP_OPEN_BRACKET:
2219       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2220       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221         return NULL;
2222       break;
2223     case OP_BACK_REF:
2224       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2225         {
2226           *err = REG_ESUBREG;
2227           return NULL;
2228         }
2229       dfa->used_bkref_map |= 1 << token->opr.idx;
2230       tree = create_token_tree (dfa, NULL, NULL, token);
2231       if (BE (tree == NULL, 0))
2232         {
2233           *err = REG_ESPACE;
2234           return NULL;
2235         }
2236       ++dfa->nbackref;
2237       dfa->has_mb_node = 1;
2238       break;
2239     case OP_OPEN_DUP_NUM:
2240       if (syntax & RE_CONTEXT_INVALID_DUP)
2241         {
2242           *err = REG_BADRPT;
2243           return NULL;
2244         }
2245       /* FALLTHROUGH */
2246     case OP_DUP_ASTERISK:
2247     case OP_DUP_PLUS:
2248     case OP_DUP_QUESTION:
2249       if (syntax & RE_CONTEXT_INVALID_OPS)
2250         {
2251           *err = REG_BADRPT;
2252           return NULL;
2253         }
2254       else if (syntax & RE_CONTEXT_INDEP_OPS)
2255         {
2256           fetch_token (token, regexp, syntax);
2257           return parse_expression (regexp, preg, token, syntax, nest, err);
2258         }
2259       /* else fall through  */
2260     case OP_CLOSE_SUBEXP:
2261       if ((token->type == OP_CLOSE_SUBEXP) &&
2262           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2263         {
2264           *err = REG_ERPAREN;
2265           return NULL;
2266         }
2267       /* else fall through  */
2268     case OP_CLOSE_DUP_NUM:
2269       /* We treat it as a normal character.  */
2270
2271       /* Then we can these characters as normal characters.  */
2272       token->type = CHARACTER;
2273       /* mb_partial and word_char bits should be initialized already
2274          by peek_token.  */
2275       tree = create_token_tree (dfa, NULL, NULL, token);
2276       if (BE (tree == NULL, 0))
2277         {
2278           *err = REG_ESPACE;
2279           return NULL;
2280         }
2281       break;
2282     case ANCHOR:
2283       if ((token->opr.ctx_type
2284            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2285           && dfa->word_ops_used == 0)
2286         init_word_char (dfa);
2287       if (token->opr.ctx_type == WORD_DELIM
2288           || token->opr.ctx_type == NOT_WORD_DELIM)
2289         {
2290           bin_tree_t *tree_first, *tree_last;
2291           if (token->opr.ctx_type == WORD_DELIM)
2292             {
2293               token->opr.ctx_type = WORD_FIRST;
2294               tree_first = create_token_tree (dfa, NULL, NULL, token);
2295               token->opr.ctx_type = WORD_LAST;
2296             }
2297           else
2298             {
2299               token->opr.ctx_type = INSIDE_WORD;
2300               tree_first = create_token_tree (dfa, NULL, NULL, token);
2301               token->opr.ctx_type = INSIDE_NOTWORD;
2302             }
2303           tree_last = create_token_tree (dfa, NULL, NULL, token);
2304           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2305           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2306             {
2307               *err = REG_ESPACE;
2308               return NULL;
2309             }
2310         }
2311       else
2312         {
2313           tree = create_token_tree (dfa, NULL, NULL, token);
2314           if (BE (tree == NULL, 0))
2315             {
2316               *err = REG_ESPACE;
2317               return NULL;
2318             }
2319         }
2320       /* We must return here, since ANCHORs can't be followed
2321          by repetition operators.
2322          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2323              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2324       fetch_token (token, regexp, syntax);
2325       return tree;
2326     case OP_PERIOD:
2327       tree = create_token_tree (dfa, NULL, NULL, token);
2328       if (BE (tree == NULL, 0))
2329         {
2330           *err = REG_ESPACE;
2331           return NULL;
2332         }
2333       if (dfa->mb_cur_max > 1)
2334         dfa->has_mb_node = 1;
2335       break;
2336     case OP_WORD:
2337     case OP_NOTWORD:
2338       tree = build_charclass_op (dfa, regexp->trans,
2339                                  (const unsigned char *) "alnum",
2340                                  (const unsigned char *) "_",
2341                                  token->type == OP_NOTWORD, err);
2342       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343         return NULL;
2344       break;
2345     case OP_SPACE:
2346     case OP_NOTSPACE:
2347       tree = build_charclass_op (dfa, regexp->trans,
2348                                  (const unsigned char *) "space",
2349                                  (const unsigned char *) "",
2350                                  token->type == OP_NOTSPACE, err);
2351       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352         return NULL;
2353       break;
2354     case OP_ALT:
2355     case END_OF_RE:
2356       return NULL;
2357     case BACK_SLASH:
2358       *err = REG_EESCAPE;
2359       return NULL;
2360     default:
2361       /* Must not happen?  */
2362 #ifdef DEBUG
2363       assert (0);
2364 #endif
2365       return NULL;
2366     }
2367   fetch_token (token, regexp, syntax);
2368
2369   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2370          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2371     {
2372       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2373       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2374         return NULL;
2375       /* In BRE consecutive duplications are not allowed.  */
2376       if ((syntax & RE_CONTEXT_INVALID_DUP)
2377           && (token->type == OP_DUP_ASTERISK
2378               || token->type == OP_OPEN_DUP_NUM))
2379         {
2380           *err = REG_BADRPT;
2381           return NULL;
2382         }
2383     }
2384
2385   return tree;
2386 }
2387
2388 /* This function build the following tree, from regular expression
2389    (<reg_exp>):
2390          SUBEXP
2391             |
2392         <reg_exp>
2393 */
2394
2395 static bin_tree_t *
2396 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2397                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2398 {
2399   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2400   bin_tree_t *tree;
2401   size_t cur_nsub;
2402   cur_nsub = preg->re_nsub++;
2403
2404   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2405
2406   /* The subexpression may be a null string.  */
2407   if (token->type == OP_CLOSE_SUBEXP)
2408     tree = NULL;
2409   else
2410     {
2411       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2412       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2413         *err = REG_EPAREN;
2414       if (BE (*err != REG_NOERROR, 0))
2415         return NULL;
2416     }
2417
2418   if (cur_nsub <= '9' - '1')
2419     dfa->completed_bkref_map |= 1 << cur_nsub;
2420
2421   tree = create_tree (dfa, tree, NULL, SUBEXP);
2422   if (BE (tree == NULL, 0))
2423     {
2424       *err = REG_ESPACE;
2425       return NULL;
2426     }
2427   tree->token.opr.idx = cur_nsub;
2428   return tree;
2429 }
2430
2431 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2432
2433 static bin_tree_t *
2434 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2435               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2436 {
2437   bin_tree_t *tree = NULL, *old_tree = NULL;
2438   int i, start, end, start_idx = re_string_cur_idx (regexp);
2439   re_token_t start_token = *token;
2440
2441   if (token->type == OP_OPEN_DUP_NUM)
2442     {
2443       end = 0;
2444       start = fetch_number (regexp, token, syntax);
2445       if (start == -1)
2446         {
2447           if (token->type == CHARACTER && token->opr.c == ',')
2448             start = 0; /* We treat "{,m}" as "{0,m}".  */
2449           else
2450             {
2451               *err = REG_BADBR; /* <re>{} is invalid.  */
2452               return NULL;
2453             }
2454         }
2455       if (BE (start != -2, 1))
2456         {
2457           /* We treat "{n}" as "{n,n}".  */
2458           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2459                  : ((token->type == CHARACTER && token->opr.c == ',')
2460                     ? fetch_number (regexp, token, syntax) : -2));
2461         }
2462       if (BE (start == -2 || end == -2, 0))
2463         {
2464           /* Invalid sequence.  */
2465           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2466             {
2467               if (token->type == END_OF_RE)
2468                 *err = REG_EBRACE;
2469               else
2470                 *err = REG_BADBR;
2471
2472               return NULL;
2473             }
2474
2475           /* If the syntax bit is set, rollback.  */
2476           re_string_set_index (regexp, start_idx);
2477           *token = start_token;
2478           token->type = CHARACTER;
2479           /* mb_partial and word_char bits should be already initialized by
2480              peek_token.  */
2481           return elem;
2482         }
2483
2484       if (BE (end != -1 && start > end, 0))
2485         {
2486           /* First number greater than second.  */
2487           *err = REG_BADBR;
2488           return NULL;
2489         }
2490     }
2491   else
2492     {
2493       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2494       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2495     }
2496
2497   fetch_token (token, regexp, syntax);
2498
2499   if (BE (elem == NULL, 0))
2500     return NULL;
2501   if (BE (start == 0 && end == 0, 0))
2502     {
2503       postorder (elem, free_tree, NULL);
2504       return NULL;
2505     }
2506
2507   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2508   if (BE (start > 0, 0))
2509     {
2510       tree = elem;
2511       for (i = 2; i <= start; ++i)
2512         {
2513           elem = duplicate_tree (elem, dfa);
2514           tree = create_tree (dfa, tree, elem, CONCAT);
2515           if (BE (elem == NULL || tree == NULL, 0))
2516             goto parse_dup_op_espace;
2517         }
2518
2519       if (start == end)
2520         return tree;
2521
2522       /* Duplicate ELEM before it is marked optional.  */
2523       elem = duplicate_tree (elem, dfa);
2524       old_tree = tree;
2525     }
2526   else
2527     old_tree = NULL;
2528
2529   if (elem->token.type == SUBEXP)
2530     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2531
2532   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2533   if (BE (tree == NULL, 0))
2534     goto parse_dup_op_espace;
2535
2536   /* This loop is actually executed only when end != -1,
2537      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2538      already created the start+1-th copy.  */
2539   for (i = start + 2; i <= end; ++i)
2540     {
2541       elem = duplicate_tree (elem, dfa);
2542       tree = create_tree (dfa, tree, elem, CONCAT);
2543       if (BE (elem == NULL || tree == NULL, 0))
2544         goto parse_dup_op_espace;
2545
2546       tree = create_tree (dfa, tree, NULL, OP_ALT);
2547       if (BE (tree == NULL, 0))
2548         goto parse_dup_op_espace;
2549     }
2550
2551   if (old_tree)
2552     tree = create_tree (dfa, old_tree, tree, CONCAT);
2553
2554   return tree;
2555
2556  parse_dup_op_espace:
2557   *err = REG_ESPACE;
2558   return NULL;
2559 }
2560
2561 /* Size of the names for collating symbol/equivalence_class/character_class.
2562    I'm not sure, but maybe enough.  */
2563 #define BRACKET_NAME_BUF_SIZE 32
2564
2565 #ifndef _LIBC
2566   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2567      Build the range expression which starts from START_ELEM, and ends
2568      at END_ELEM.  The result are written to MBCSET and SBCSET.
2569      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2570      mbcset->range_ends, is a pointer argument sinse we may
2571      update it.  */
2572
2573 static reg_errcode_t
2574 internal_function
2575 # ifdef RE_ENABLE_I18N
2576 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2577                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2578 # else /* not RE_ENABLE_I18N */
2579 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2580                  bracket_elem_t *end_elem)
2581 # endif /* not RE_ENABLE_I18N */
2582 {
2583   unsigned int start_ch, end_ch;
2584   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2585   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2586           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2587           0))
2588     return REG_ERANGE;
2589
2590   /* We can handle no multi character collating elements without libc
2591      support.  */
2592   if (BE ((start_elem->type == COLL_SYM
2593            && strlen ((char *) start_elem->opr.name) > 1)
2594           || (end_elem->type == COLL_SYM
2595               && strlen ((char *) end_elem->opr.name) > 1), 0))
2596     return REG_ECOLLATE;
2597
2598 # ifdef RE_ENABLE_I18N
2599   {
2600     wchar_t wc;
2601     wint_t start_wc;
2602     wint_t end_wc;
2603     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2604
2605     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2606                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2607                    : 0));
2608     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2609               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2610                  : 0));
2611     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2612                 ? __btowc (start_ch) : start_elem->opr.wch);
2613     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2614               ? __btowc (end_ch) : end_elem->opr.wch);
2615     if (start_wc == WEOF || end_wc == WEOF)
2616       return REG_ECOLLATE;
2617     cmp_buf[0] = start_wc;
2618     cmp_buf[4] = end_wc;
2619     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2620       return REG_ERANGE;
2621
2622     /* Got valid collation sequence values, add them as a new entry.
2623        However, for !_LIBC we have no collation elements: if the
2624        character set is single byte, the single byte character set
2625        that we build below suffices.  parse_bracket_exp passes
2626        no MBCSET if dfa->mb_cur_max == 1.  */
2627     if (mbcset)
2628       {
2629         /* Check the space of the arrays.  */
2630         if (BE (*range_alloc == mbcset->nranges, 0))
2631           {
2632             /* There is not enough space, need realloc.  */
2633             wchar_t *new_array_start, *new_array_end;
2634             int new_nranges;
2635
2636             /* +1 in case of mbcset->nranges is 0.  */
2637             new_nranges = 2 * mbcset->nranges + 1;
2638             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639                are NULL if *range_alloc == 0.  */
2640             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2641                                           new_nranges);
2642             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2643                                         new_nranges);
2644
2645             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2646               return REG_ESPACE;
2647
2648             mbcset->range_starts = new_array_start;
2649             mbcset->range_ends = new_array_end;
2650             *range_alloc = new_nranges;
2651           }
2652
2653         mbcset->range_starts[mbcset->nranges] = start_wc;
2654         mbcset->range_ends[mbcset->nranges++] = end_wc;
2655       }
2656
2657     /* Build the table for single byte characters.  */
2658     for (wc = 0; wc < SBC_MAX; ++wc)
2659       {
2660         cmp_buf[2] = wc;
2661         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2662             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2663           bitset_set (sbcset, wc);
2664       }
2665   }
2666 # else /* not RE_ENABLE_I18N */
2667   {
2668     unsigned int ch;
2669     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2670                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2671                    : 0));
2672     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2673               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2674                  : 0));
2675     if (start_ch > end_ch)
2676       return REG_ERANGE;
2677     /* Build the table for single byte characters.  */
2678     for (ch = 0; ch < SBC_MAX; ++ch)
2679       if (start_ch <= ch  && ch <= end_ch)
2680         bitset_set (sbcset, ch);
2681   }
2682 # endif /* not RE_ENABLE_I18N */
2683   return REG_NOERROR;
2684 }
2685 #endif /* not _LIBC */
2686
2687 #ifndef _LIBC
2688 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2689    Build the collating element which is represented by NAME.
2690    The result are written to MBCSET and SBCSET.
2691    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2692    pointer argument since we may update it.  */
2693
2694 static reg_errcode_t
2695 internal_function
2696 # ifdef RE_ENABLE_I18N
2697 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2698                         int *coll_sym_alloc, const unsigned char *name)
2699 # else /* not RE_ENABLE_I18N */
2700 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2701 # endif /* not RE_ENABLE_I18N */
2702 {
2703   size_t name_len = strlen ((const char *) name);
2704   if (BE (name_len != 1, 0))
2705     return REG_ECOLLATE;
2706   else
2707     {
2708       bitset_set (sbcset, name[0]);
2709       return REG_NOERROR;
2710     }
2711 }
2712 #endif /* not _LIBC */
2713
2714 /* This function parse bracket expression like "[abc]", "[a-c]",
2715    "[[.a-a.]]" etc.  */
2716
2717 static bin_tree_t *
2718 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2719                    reg_syntax_t syntax, reg_errcode_t *err)
2720 {
2721 #ifdef _LIBC
2722   const unsigned char *collseqmb;
2723   const char *collseqwc;
2724   uint32_t nrules;
2725   int32_t table_size;
2726   const int32_t *symb_table;
2727   const unsigned char *extra;
2728
2729   /* Local function for parse_bracket_exp used in _LIBC environement.
2730      Seek the collating symbol entry correspondings to NAME.
2731      Return the index of the symbol in the SYMB_TABLE.  */
2732
2733   auto inline int32_t
2734   __attribute ((always_inline))
2735   seek_collating_symbol_entry (name, name_len)
2736          const unsigned char *name;
2737          size_t name_len;
2738     {
2739       int32_t hash = elem_hash ((const char *) name, name_len);
2740       int32_t elem = hash % table_size;
2741       if (symb_table[2 * elem] != 0)
2742         {
2743           int32_t second = hash % (table_size - 2) + 1;
2744
2745           do
2746             {
2747               /* First compare the hashing value.  */
2748               if (symb_table[2 * elem] == hash
2749                   /* Compare the length of the name.  */
2750                   && name_len == extra[symb_table[2 * elem + 1]]
2751                   /* Compare the name.  */
2752                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2753                              name_len) == 0)
2754                 {
2755                   /* Yep, this is the entry.  */
2756                   break;
2757                 }
2758
2759               /* Next entry.  */
2760               elem += second;
2761             }
2762           while (symb_table[2 * elem] != 0);
2763         }
2764       return elem;
2765     }
2766
2767   /* Local function for parse_bracket_exp used in _LIBC environment.
2768      Look up the collation sequence value of BR_ELEM.
2769      Return the value if succeeded, UINT_MAX otherwise.  */
2770
2771   auto inline unsigned int
2772   __attribute ((always_inline))
2773   lookup_collation_sequence_value (br_elem)
2774          bracket_elem_t *br_elem;
2775     {
2776       if (br_elem->type == SB_CHAR)
2777         {
2778           /*
2779           if (MB_CUR_MAX == 1)
2780           */
2781           if (nrules == 0)
2782             return collseqmb[br_elem->opr.ch];
2783           else
2784             {
2785               wint_t wc = __btowc (br_elem->opr.ch);
2786               return __collseq_table_lookup (collseqwc, wc);
2787             }
2788         }
2789       else if (br_elem->type == MB_CHAR)
2790         {
2791           if (nrules != 0)
2792             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2793         }
2794       else if (br_elem->type == COLL_SYM)
2795         {
2796           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2797           if (nrules != 0)
2798             {
2799               int32_t elem, idx;
2800               elem = seek_collating_symbol_entry (br_elem->opr.name,
2801                                                   sym_name_len);
2802               if (symb_table[2 * elem] != 0)
2803                 {
2804                   /* We found the entry.  */
2805                   idx = symb_table[2 * elem + 1];
2806                   /* Skip the name of collating element name.  */
2807                   idx += 1 + extra[idx];
2808                   /* Skip the byte sequence of the collating element.  */
2809                   idx += 1 + extra[idx];
2810                   /* Adjust for the alignment.  */
2811                   idx = (idx + 3) & ~3;
2812                   /* Skip the multibyte collation sequence value.  */
2813                   idx += sizeof (unsigned int);
2814                   /* Skip the wide char sequence of the collating element.  */
2815                   idx += sizeof (unsigned int) *
2816                     (1 + *(unsigned int *) (extra + idx));
2817                   /* Return the collation sequence value.  */
2818                   return *(unsigned int *) (extra + idx);
2819                 }
2820               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2821                 {
2822                   /* No valid character.  Match it as a single byte
2823                      character.  */
2824                   return collseqmb[br_elem->opr.name[0]];
2825                 }
2826             }
2827           else if (sym_name_len == 1)
2828             return collseqmb[br_elem->opr.name[0]];
2829         }
2830       return UINT_MAX;
2831     }
2832
2833   /* Local function for parse_bracket_exp used in _LIBC environement.
2834      Build the range expression which starts from START_ELEM, and ends
2835      at END_ELEM.  The result are written to MBCSET and SBCSET.
2836      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2837      mbcset->range_ends, is a pointer argument sinse we may
2838      update it.  */
2839
2840   auto inline reg_errcode_t
2841   __attribute ((always_inline))
2842   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2843          re_charset_t *mbcset;
2844          int *range_alloc;
2845          bitset_t sbcset;
2846          bracket_elem_t *start_elem, *end_elem;
2847     {
2848       unsigned int ch;
2849       uint32_t start_collseq;
2850       uint32_t end_collseq;
2851
2852       /* Equivalence Classes and Character Classes can't be a range
2853          start/end.  */
2854       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2855               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2856               0))
2857         return REG_ERANGE;
2858
2859       start_collseq = lookup_collation_sequence_value (start_elem);
2860       end_collseq = lookup_collation_sequence_value (end_elem);
2861       /* Check start/end collation sequence values.  */
2862       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2863         return REG_ECOLLATE;
2864       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2865         return REG_ERANGE;
2866
2867       /* Got valid collation sequence values, add them as a new entry.
2868          However, if we have no collation elements, and the character set
2869          is single byte, the single byte character set that we
2870          build below suffices. */
2871       if (nrules > 0 || dfa->mb_cur_max > 1)
2872         {
2873           /* Check the space of the arrays.  */
2874           if (BE (*range_alloc == mbcset->nranges, 0))
2875             {
2876               /* There is not enough space, need realloc.  */
2877               uint32_t *new_array_start;
2878               uint32_t *new_array_end;
2879               int new_nranges;
2880
2881               /* +1 in case of mbcset->nranges is 0.  */
2882               new_nranges = 2 * mbcset->nranges + 1;
2883               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2884                                             new_nranges);
2885               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2886                                           new_nranges);
2887
2888               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2889                 return REG_ESPACE;
2890
2891               mbcset->range_starts = new_array_start;
2892               mbcset->range_ends = new_array_end;
2893               *range_alloc = new_nranges;
2894             }
2895
2896           mbcset->range_starts[mbcset->nranges] = start_collseq;
2897           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2898         }
2899
2900       /* Build the table for single byte characters.  */
2901       for (ch = 0; ch < SBC_MAX; ch++)
2902         {
2903           uint32_t ch_collseq;
2904           /*
2905           if (MB_CUR_MAX == 1)
2906           */
2907           if (nrules == 0)
2908             ch_collseq = collseqmb[ch];
2909           else
2910             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2911           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2912             bitset_set (sbcset, ch);
2913         }
2914       return REG_NOERROR;
2915     }
2916
2917   /* Local function for parse_bracket_exp used in _LIBC environement.
2918      Build the collating element which is represented by NAME.
2919      The result are written to MBCSET and SBCSET.
2920      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2921      pointer argument sinse we may update it.  */
2922
2923   auto inline reg_errcode_t
2924   __attribute ((always_inline))
2925   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2926          re_charset_t *mbcset;
2927          int *coll_sym_alloc;
2928          bitset_t sbcset;
2929          const unsigned char *name;
2930     {
2931       int32_t elem, idx;
2932       size_t name_len = strlen ((const char *) name);
2933       if (nrules != 0)
2934         {
2935           elem = seek_collating_symbol_entry (name, name_len);
2936           if (symb_table[2 * elem] != 0)
2937             {
2938               /* We found the entry.  */
2939               idx = symb_table[2 * elem + 1];
2940               /* Skip the name of collating element name.  */
2941               idx += 1 + extra[idx];
2942             }
2943           else if (symb_table[2 * elem] == 0 && name_len == 1)
2944             {
2945               /* No valid character, treat it as a normal
2946                  character.  */
2947               bitset_set (sbcset, name[0]);
2948               return REG_NOERROR;
2949             }
2950           else
2951             return REG_ECOLLATE;
2952
2953           /* Got valid collation sequence, add it as a new entry.  */
2954           /* Check the space of the arrays.  */
2955           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2956             {
2957               /* Not enough, realloc it.  */
2958               /* +1 in case of mbcset->ncoll_syms is 0.  */
2959               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2960               /* Use realloc since mbcset->coll_syms is NULL
2961                  if *alloc == 0.  */
2962               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2963                                                    new_coll_sym_alloc);
2964               if (BE (new_coll_syms == NULL, 0))
2965                 return REG_ESPACE;
2966               mbcset->coll_syms = new_coll_syms;
2967               *coll_sym_alloc = new_coll_sym_alloc;
2968             }
2969           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2970           return REG_NOERROR;
2971         }
2972       else
2973         {
2974           if (BE (name_len != 1, 0))
2975             return REG_ECOLLATE;
2976           else
2977             {
2978               bitset_set (sbcset, name[0]);
2979               return REG_NOERROR;
2980             }
2981         }
2982     }
2983 #endif
2984
2985   re_token_t br_token;
2986   re_bitset_ptr_t sbcset;
2987 #ifdef RE_ENABLE_I18N
2988   re_charset_t *mbcset;
2989   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2990   int equiv_class_alloc = 0, char_class_alloc = 0;
2991 #endif /* not RE_ENABLE_I18N */
2992   int non_match = 0;
2993   bin_tree_t *work_tree;
2994   int token_len;
2995   int first_round = 1;
2996 #ifdef _LIBC
2997   collseqmb = (const unsigned char *)
2998     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2999   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3000   if (nrules)
3001     {
3002       /*
3003       if (MB_CUR_MAX > 1)
3004       */
3005       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3006       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3007       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3008                                                   _NL_COLLATE_SYMB_TABLEMB);
3009       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3010                                                    _NL_COLLATE_SYMB_EXTRAMB);
3011     }
3012 #endif
3013   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3014 #ifdef RE_ENABLE_I18N
3015   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3016 #endif /* RE_ENABLE_I18N */
3017 #ifdef RE_ENABLE_I18N
3018   if (BE (sbcset == NULL || mbcset == NULL, 0))
3019 #else
3020   if (BE (sbcset == NULL, 0))
3021 #endif /* RE_ENABLE_I18N */
3022     {
3023       *err = REG_ESPACE;
3024       return NULL;
3025     }
3026
3027   token_len = peek_token_bracket (token, regexp, syntax);
3028   if (BE (token->type == END_OF_RE, 0))
3029     {
3030       *err = REG_BADPAT;
3031       goto parse_bracket_exp_free_return;
3032     }
3033   if (token->type == OP_NON_MATCH_LIST)
3034     {
3035 #ifdef RE_ENABLE_I18N
3036       mbcset->non_match = 1;
3037 #endif /* not RE_ENABLE_I18N */
3038       non_match = 1;
3039       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3040         bitset_set (sbcset, '\n');
3041       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3042       token_len = peek_token_bracket (token, regexp, syntax);
3043       if (BE (token->type == END_OF_RE, 0))
3044         {
3045           *err = REG_BADPAT;
3046           goto parse_bracket_exp_free_return;
3047         }
3048     }
3049
3050   /* We treat the first ']' as a normal character.  */
3051   if (token->type == OP_CLOSE_BRACKET)
3052     token->type = CHARACTER;
3053
3054   while (1)
3055     {
3056       bracket_elem_t start_elem, end_elem;
3057       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3058       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3059       reg_errcode_t ret;
3060       int token_len2 = 0, is_range_exp = 0;
3061       re_token_t token2;
3062
3063       start_elem.opr.name = start_name_buf;
3064       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3065                                    syntax, first_round);
3066       if (BE (ret != REG_NOERROR, 0))
3067         {
3068           *err = ret;
3069           goto parse_bracket_exp_free_return;
3070         }
3071       first_round = 0;
3072
3073       /* Get information about the next token.  We need it in any case.  */
3074       token_len = peek_token_bracket (token, regexp, syntax);
3075
3076       /* Do not check for ranges if we know they are not allowed.  */
3077       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3078         {
3079           if (BE (token->type == END_OF_RE, 0))
3080             {
3081               *err = REG_EBRACK;
3082               goto parse_bracket_exp_free_return;
3083             }
3084           if (token->type == OP_CHARSET_RANGE)
3085             {
3086               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3087               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3088               if (BE (token2.type == END_OF_RE, 0))
3089                 {
3090                   *err = REG_EBRACK;
3091                   goto parse_bracket_exp_free_return;
3092                 }
3093               if (token2.type == OP_CLOSE_BRACKET)
3094                 {
3095                   /* We treat the last '-' as a normal character.  */
3096                   re_string_skip_bytes (regexp, -token_len);
3097                   token->type = CHARACTER;
3098                 }
3099               else
3100                 is_range_exp = 1;
3101             }
3102         }
3103
3104       if (is_range_exp == 1)
3105         {
3106           end_elem.opr.name = end_name_buf;
3107           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3108                                        dfa, syntax, 1);
3109           if (BE (ret != REG_NOERROR, 0))
3110             {
3111               *err = ret;
3112               goto parse_bracket_exp_free_return;
3113             }
3114
3115           token_len = peek_token_bracket (token, regexp, syntax);
3116
3117 #ifdef _LIBC
3118           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3119                                   &start_elem, &end_elem);
3120 #else
3121 # ifdef RE_ENABLE_I18N
3122           *err = build_range_exp (sbcset,
3123                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3124                                   &range_alloc, &start_elem, &end_elem);
3125 # else
3126           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3127 # endif
3128 #endif /* RE_ENABLE_I18N */
3129           if (BE (*err != REG_NOERROR, 0))
3130             goto parse_bracket_exp_free_return;
3131         }
3132       else
3133         {
3134           switch (start_elem.type)
3135             {
3136             case SB_CHAR:
3137               bitset_set (sbcset, start_elem.opr.ch);
3138               break;
3139 #ifdef RE_ENABLE_I18N
3140             case MB_CHAR:
3141               /* Check whether the array has enough space.  */
3142               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3143                 {
3144                   wchar_t *new_mbchars;
3145                   /* Not enough, realloc it.  */
3146                   /* +1 in case of mbcset->nmbchars is 0.  */
3147                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3148                   /* Use realloc since array is NULL if *alloc == 0.  */
3149                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3150                                             mbchar_alloc);
3151                   if (BE (new_mbchars == NULL, 0))
3152                     goto parse_bracket_exp_espace;
3153                   mbcset->mbchars = new_mbchars;
3154                 }
3155               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3156               break;
3157 #endif /* RE_ENABLE_I18N */
3158             case EQUIV_CLASS:
3159               *err = build_equiv_class (sbcset,
3160 #ifdef RE_ENABLE_I18N
3161                                         mbcset, &equiv_class_alloc,
3162 #endif /* RE_ENABLE_I18N */
3163                                         start_elem.opr.name);
3164               if (BE (*err != REG_NOERROR, 0))
3165                 goto parse_bracket_exp_free_return;
3166               break;
3167             case COLL_SYM:
3168               *err = build_collating_symbol (sbcset,
3169 #ifdef RE_ENABLE_I18N
3170                                              mbcset, &coll_sym_alloc,
3171 #endif /* RE_ENABLE_I18N */
3172                                              start_elem.opr.name);
3173               if (BE (*err != REG_NOERROR, 0))
3174                 goto parse_bracket_exp_free_return;
3175               break;
3176             case CHAR_CLASS:
3177               *err = build_charclass (regexp->trans, sbcset,
3178 #ifdef RE_ENABLE_I18N
3179                                       mbcset, &char_class_alloc,
3180 #endif /* RE_ENABLE_I18N */
3181                                       start_elem.opr.name, syntax);
3182               if (BE (*err != REG_NOERROR, 0))
3183                goto parse_bracket_exp_free_return;
3184               break;
3185             default:
3186               assert (0);
3187               break;
3188             }
3189         }
3190       if (BE (token->type == END_OF_RE, 0))
3191         {
3192           *err = REG_EBRACK;
3193           goto parse_bracket_exp_free_return;
3194         }
3195       if (token->type == OP_CLOSE_BRACKET)
3196         break;
3197     }
3198
3199   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3200
3201   /* If it is non-matching list.  */
3202   if (non_match)
3203     bitset_not (sbcset);
3204
3205 #ifdef RE_ENABLE_I18N
3206   /* Ensure only single byte characters are set.  */
3207   if (dfa->mb_cur_max > 1)
3208     bitset_mask (sbcset, dfa->sb_char);
3209
3210   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3211       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3212                                                      || mbcset->non_match)))
3213     {
3214       bin_tree_t *mbc_tree;
3215       int sbc_idx;
3216       /* Build a tree for complex bracket.  */
3217       dfa->has_mb_node = 1;
3218       br_token.type = COMPLEX_BRACKET;
3219       br_token.opr.mbcset = mbcset;
3220       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3221       if (BE (mbc_tree == NULL, 0))
3222         goto parse_bracket_exp_espace;
3223       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3224         if (sbcset[sbc_idx])
3225           break;
3226       /* If there are no bits set in sbcset, there is no point
3227          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3228       if (sbc_idx < BITSET_WORDS)
3229         {
3230           /* Build a tree for simple bracket.  */
3231           br_token.type = SIMPLE_BRACKET;
3232           br_token.opr.sbcset = sbcset;
3233           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3234           if (BE (work_tree == NULL, 0))
3235             goto parse_bracket_exp_espace;
3236
3237           /* Then join them by ALT node.  */
3238           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3239           if (BE (work_tree == NULL, 0))
3240             goto parse_bracket_exp_espace;
3241         }
3242       else
3243         {
3244           re_free (sbcset);
3245           work_tree = mbc_tree;
3246         }
3247     }
3248   else
3249 #endif /* not RE_ENABLE_I18N */
3250     {
3251 #ifdef RE_ENABLE_I18N
3252       free_charset (mbcset);
3253 #endif
3254       /* Build a tree for simple bracket.  */
3255       br_token.type = SIMPLE_BRACKET;
3256       br_token.opr.sbcset = sbcset;
3257       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258       if (BE (work_tree == NULL, 0))
3259         goto parse_bracket_exp_espace;
3260     }
3261   return work_tree;
3262
3263  parse_bracket_exp_espace:
3264   *err = REG_ESPACE;
3265  parse_bracket_exp_free_return:
3266   re_free (sbcset);
3267 #ifdef RE_ENABLE_I18N
3268   free_charset (mbcset);
3269 #endif /* RE_ENABLE_I18N */
3270   return NULL;
3271 }
3272
3273 /* Parse an element in the bracket expression.  */
3274
3275 static reg_errcode_t
3276 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3277                        re_token_t *token, int token_len, re_dfa_t *dfa,
3278                        reg_syntax_t syntax, int accept_hyphen)
3279 {
3280 #ifdef RE_ENABLE_I18N
3281   int cur_char_size;
3282   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3283   if (cur_char_size > 1)
3284     {
3285       elem->type = MB_CHAR;
3286       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3287       re_string_skip_bytes (regexp, cur_char_size);
3288       return REG_NOERROR;
3289     }
3290 #endif /* RE_ENABLE_I18N */
3291   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3292   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3293       || token->type == OP_OPEN_EQUIV_CLASS)
3294     return parse_bracket_symbol (elem, regexp, token);
3295   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3296     {
3297       /* A '-' must only appear as anything but a range indicator before
3298          the closing bracket.  Everything else is an error.  */
3299       re_token_t token2;
3300       (void) peek_token_bracket (&token2, regexp, syntax);
3301       if (token2.type != OP_CLOSE_BRACKET)
3302         /* The actual error value is not standardized since this whole
3303            case is undefined.  But ERANGE makes good sense.  */
3304         return REG_ERANGE;
3305     }
3306   elem->type = SB_CHAR;
3307   elem->opr.ch = token->opr.c;
3308   return REG_NOERROR;
3309 }
3310
3311 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3312    such as [:<character_class>:], [.<collating_element>.], and
3313    [=<equivalent_class>=].  */
3314
3315 static reg_errcode_t
3316 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3317                       re_token_t *token)
3318 {
3319   unsigned char ch, delim = token->opr.c;
3320   int i = 0;
3321   if (re_string_eoi(regexp))
3322     return REG_EBRACK;
3323   for (;; ++i)
3324     {
3325       if (i >= BRACKET_NAME_BUF_SIZE)
3326         return REG_EBRACK;
3327       if (token->type == OP_OPEN_CHAR_CLASS)
3328         ch = re_string_fetch_byte_case (regexp);
3329       else
3330         ch = re_string_fetch_byte (regexp);
3331       if (re_string_eoi(regexp))
3332         return REG_EBRACK;
3333       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3334         break;
3335       elem->opr.name[i] = ch;
3336     }
3337   re_string_skip_bytes (regexp, 1);
3338   elem->opr.name[i] = '\0';
3339   switch (token->type)
3340     {
3341     case OP_OPEN_COLL_ELEM:
3342       elem->type = COLL_SYM;
3343       break;
3344     case OP_OPEN_EQUIV_CLASS:
3345       elem->type = EQUIV_CLASS;
3346       break;
3347     case OP_OPEN_CHAR_CLASS:
3348       elem->type = CHAR_CLASS;
3349       break;
3350     default:
3351       break;
3352     }
3353   return REG_NOERROR;
3354 }
3355
3356   /* Helper function for parse_bracket_exp.
3357      Build the equivalence class which is represented by NAME.
3358      The result are written to MBCSET and SBCSET.
3359      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3360      is a pointer argument sinse we may update it.  */
3361
3362 static reg_errcode_t
3363 #ifdef RE_ENABLE_I18N
3364 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3365                    int *equiv_class_alloc, const unsigned char *name)
3366 #else /* not RE_ENABLE_I18N */
3367 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3368 #endif /* not RE_ENABLE_I18N */
3369 {
3370 #ifdef _LIBC
3371   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3372   if (nrules != 0)
3373     {
3374       const int32_t *table, *indirect;
3375       const unsigned char *weights, *extra, *cp;
3376       unsigned char char_buf[2];
3377       int32_t idx1, idx2;
3378       unsigned int ch;
3379       size_t len;
3380       /* This #include defines a local function!  */
3381 # include <locale/weight.h>
3382       /* Calculate the index for equivalence class.  */
3383       cp = name;
3384       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3385       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3386                                                _NL_COLLATE_WEIGHTMB);
3387       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388                                                    _NL_COLLATE_EXTRAMB);
3389       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3390                                                 _NL_COLLATE_INDIRECTMB);
3391       idx1 = findidx (&cp);
3392       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3393         /* This isn't a valid character.  */
3394         return REG_ECOLLATE;
3395
3396       /* Build single byte matcing table for this equivalence class.  */
3397       char_buf[1] = (unsigned char) '\0';
3398       len = weights[idx1 & 0xffffff];
3399       for (ch = 0; ch < SBC_MAX; ++ch)
3400         {
3401           char_buf[0] = ch;
3402           cp = char_buf;
3403           idx2 = findidx (&cp);
3404 /*
3405           idx2 = table[ch];
3406 */
3407           if (idx2 == 0)
3408             /* This isn't a valid character.  */
3409             continue;
3410           /* Compare only if the length matches and the collation rule
3411              index is the same.  */
3412           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3413             {
3414               int cnt = 0;
3415
3416               while (cnt <= len &&
3417                      weights[(idx1 & 0xffffff) + 1 + cnt]
3418                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3419                 ++cnt;
3420
3421               if (cnt > len)
3422                 bitset_set (sbcset, ch);
3423             }
3424         }
3425       /* Check whether the array has enough space.  */
3426       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3427         {
3428           /* Not enough, realloc it.  */
3429           /* +1 in case of mbcset->nequiv_classes is 0.  */
3430           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3431           /* Use realloc since the array is NULL if *alloc == 0.  */
3432           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3433                                                    int32_t,
3434                                                    new_equiv_class_alloc);
3435           if (BE (new_equiv_classes == NULL, 0))
3436             return REG_ESPACE;
3437           mbcset->equiv_classes = new_equiv_classes;
3438           *equiv_class_alloc = new_equiv_class_alloc;
3439         }
3440       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3441     }
3442   else
3443 #endif /* _LIBC */
3444     {
3445       if (BE (strlen ((const char *) name) != 1, 0))
3446         return REG_ECOLLATE;
3447       bitset_set (sbcset, *name);
3448     }
3449   return REG_NOERROR;
3450 }
3451
3452   /* Helper function for parse_bracket_exp.
3453      Build the character class which is represented by NAME.
3454      The result are written to MBCSET and SBCSET.
3455      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3456      is a pointer argument sinse we may update it.  */
3457
3458 static reg_errcode_t
3459 #ifdef RE_ENABLE_I18N
3460 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3461                  re_charset_t *mbcset, int *char_class_alloc,
3462                  const unsigned char *class_name, reg_syntax_t syntax)
3463 #else /* not RE_ENABLE_I18N */
3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465                  const unsigned char *class_name, reg_syntax_t syntax)
3466 #endif /* not RE_ENABLE_I18N */
3467 {
3468   int i;
3469   const char *name = (const char *) class_name;
3470
3471   /* In case of REG_ICASE "upper" and "lower" match the both of
3472      upper and lower cases.  */
3473   if ((syntax & RE_ICASE)
3474       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475     name = "alpha";
3476
3477 #ifdef RE_ENABLE_I18N
3478   /* Check the space of the arrays.  */
3479   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3480     {
3481       /* Not enough, realloc it.  */
3482       /* +1 in case of mbcset->nchar_classes is 0.  */
3483       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3484       /* Use realloc since array is NULL if *alloc == 0.  */
3485       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3486                                                new_char_class_alloc);
3487       if (BE (new_char_classes == NULL, 0))
3488         return REG_ESPACE;
3489       mbcset->char_classes = new_char_classes;
3490       *char_class_alloc = new_char_class_alloc;
3491     }
3492   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3493 #endif /* RE_ENABLE_I18N */
3494
3495 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3496   do {                                          \
3497     if (BE (trans != NULL, 0))                  \
3498       {                                         \
3499         for (i = 0; i < SBC_MAX; ++i)           \
3500           if (ctype_func (i))                   \
3501             bitset_set (sbcset, trans[i]);      \
3502       }                                         \
3503     else                                        \
3504       {                                         \
3505         for (i = 0; i < SBC_MAX; ++i)           \
3506           if (ctype_func (i))                   \
3507             bitset_set (sbcset, i);             \
3508       }                                         \
3509   } while (0)
3510
3511   if (strcmp (name, "alnum") == 0)
3512     BUILD_CHARCLASS_LOOP (isalnum);
3513   else if (strcmp (name, "cntrl") == 0)
3514     BUILD_CHARCLASS_LOOP (iscntrl);
3515   else if (strcmp (name, "lower") == 0)
3516     BUILD_CHARCLASS_LOOP (islower);
3517   else if (strcmp (name, "space") == 0)
3518     BUILD_CHARCLASS_LOOP (isspace);
3519   else if (strcmp (name, "alpha") == 0)
3520     BUILD_CHARCLASS_LOOP (isalpha);
3521   else if (strcmp (name, "digit") == 0)
3522     BUILD_CHARCLASS_LOOP (isdigit);
3523   else if (strcmp (name, "print") == 0)
3524     BUILD_CHARCLASS_LOOP (isprint);
3525   else if (strcmp (name, "upper") == 0)
3526     BUILD_CHARCLASS_LOOP (isupper);
3527   else if (strcmp (name, "blank") == 0)
3528     BUILD_CHARCLASS_LOOP (isblank);
3529   else if (strcmp (name, "graph") == 0)
3530     BUILD_CHARCLASS_LOOP (isgraph);
3531   else if (strcmp (name, "punct") == 0)
3532     BUILD_CHARCLASS_LOOP (ispunct);
3533   else if (strcmp (name, "xdigit") == 0)
3534     BUILD_CHARCLASS_LOOP (isxdigit);
3535   else
3536     return REG_ECTYPE;
3537
3538   return REG_NOERROR;
3539 }
3540
3541 static bin_tree_t *
3542 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3543                     const unsigned char *class_name,
3544                     const unsigned char *extra, int non_match,
3545                     reg_errcode_t *err)
3546 {
3547   re_bitset_ptr_t sbcset;
3548 #ifdef RE_ENABLE_I18N
3549   re_charset_t *mbcset;
3550   int alloc = 0;
3551 #endif /* not RE_ENABLE_I18N */
3552   reg_errcode_t ret;
3553   re_token_t br_token;
3554   bin_tree_t *tree;
3555
3556   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3557 #ifdef RE_ENABLE_I18N
3558   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3559 #endif /* RE_ENABLE_I18N */
3560
3561 #ifdef RE_ENABLE_I18N
3562   if (BE (sbcset == NULL || mbcset == NULL, 0))
3563 #else /* not RE_ENABLE_I18N */
3564   if (BE (sbcset == NULL, 0))
3565 #endif /* not RE_ENABLE_I18N */
3566     {
3567       *err = REG_ESPACE;
3568       return NULL;
3569     }
3570
3571   if (non_match)
3572     {
3573 #ifdef RE_ENABLE_I18N
3574       mbcset->non_match = 1;
3575 #endif /* not RE_ENABLE_I18N */
3576     }
3577
3578   /* We don't care the syntax in this case.  */
3579   ret = build_charclass (trans, sbcset,
3580 #ifdef RE_ENABLE_I18N
3581                          mbcset, &alloc,
3582 #endif /* RE_ENABLE_I18N */
3583                          class_name, 0);
3584
3585   if (BE (ret != REG_NOERROR, 0))
3586     {
3587       re_free (sbcset);
3588 #ifdef RE_ENABLE_I18N
3589       free_charset (mbcset);
3590 #endif /* RE_ENABLE_I18N */
3591       *err = ret;
3592       return NULL;
3593     }
3594   /* \w match '_' also.  */
3595   for (; *extra; extra++)
3596     bitset_set (sbcset, *extra);
3597
3598   /* If it is non-matching list.  */
3599   if (non_match)
3600     bitset_not (sbcset);
3601
3602 #ifdef RE_ENABLE_I18N
3603   /* Ensure only single byte characters are set.  */
3604   if (dfa->mb_cur_max > 1)
3605     bitset_mask (sbcset, dfa->sb_char);
3606 #endif
3607
3608   /* Build a tree for simple bracket.  */
3609   br_token.type = SIMPLE_BRACKET;
3610   br_token.opr.sbcset = sbcset;
3611   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3612   if (BE (tree == NULL, 0))
3613     goto build_word_op_espace;
3614
3615 #ifdef RE_ENABLE_I18N
3616   if (dfa->mb_cur_max > 1)
3617     {
3618       bin_tree_t *mbc_tree;
3619       /* Build a tree for complex bracket.  */
3620       br_token.type = COMPLEX_BRACKET;
3621       br_token.opr.mbcset = mbcset;
3622       dfa->has_mb_node = 1;
3623       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624       if (BE (mbc_tree == NULL, 0))
3625         goto build_word_op_espace;
3626       /* Then join them by ALT node.  */
3627       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3628       if (BE (mbc_tree != NULL, 1))
3629         return tree;
3630     }
3631   else
3632     {
3633       free_charset (mbcset);
3634       return tree;
3635     }
3636 #else /* not RE_ENABLE_I18N */
3637   return tree;
3638 #endif /* not RE_ENABLE_I18N */
3639
3640  build_word_op_espace:
3641   re_free (sbcset);
3642 #ifdef RE_ENABLE_I18N
3643   free_charset (mbcset);
3644 #endif /* RE_ENABLE_I18N */
3645   *err = REG_ESPACE;
3646   return NULL;
3647 }
3648
3649 /* This is intended for the expressions like "a{1,3}".
3650    Fetch a number from `input', and return the number.
3651    Return -1, if the number field is empty like "{,1}".
3652    Return -2, If an error is occured.  */
3653
3654 static int
3655 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3656 {
3657   int num = -1;
3658   unsigned char c;
3659   while (1)
3660     {
3661       fetch_token (token, input, syntax);
3662       c = token->opr.c;
3663       if (BE (token->type == END_OF_RE, 0))
3664         return -2;
3665       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3666         break;
3667       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3668              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3669       num = (num > RE_DUP_MAX) ? -2 : num;
3670     }
3671   return num;
3672 }
3673 \f
3674 #ifdef RE_ENABLE_I18N
3675 static void
3676 free_charset (re_charset_t *cset)
3677 {
3678   re_free (cset->mbchars);
3679 # ifdef _LIBC
3680   re_free (cset->coll_syms);
3681   re_free (cset->equiv_classes);
3682   re_free (cset->range_starts);
3683   re_free (cset->range_ends);
3684 # endif
3685   re_free (cset->char_classes);
3686   re_free (cset);
3687 }
3688 #endif /* RE_ENABLE_I18N */
3689 \f
3690 /* Functions for binary tree operation.  */
3691
3692 /* Create a tree node.  */
3693
3694 static bin_tree_t *
3695 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3696              re_token_type_t type)
3697 {
3698   re_token_t t;
3699   t.type = type;
3700   return create_token_tree (dfa, left, right, &t);
3701 }
3702
3703 static bin_tree_t *
3704 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3705                    const re_token_t *token)
3706 {
3707   bin_tree_t *tree;
3708   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3709     {
3710       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3711
3712       if (storage == NULL)
3713         return NULL;
3714       storage->next = dfa->str_tree_storage;
3715       dfa->str_tree_storage = storage;
3716       dfa->str_tree_storage_idx = 0;
3717     }
3718   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3719
3720   tree->parent = NULL;
3721   tree->left = left;
3722   tree->right = right;
3723   tree->token = *token;
3724   tree->token.duplicated = 0;
3725   tree->token.opt_subexp = 0;
3726   tree->first = NULL;
3727   tree->next = NULL;
3728   tree->node_idx = -1;
3729
3730   if (left != NULL)
3731     left->parent = tree;
3732   if (right != NULL)
3733     right->parent = tree;
3734   return tree;
3735 }
3736
3737 /* Mark the tree SRC as an optional subexpression.
3738    To be called from preorder or postorder.  */
3739
3740 static reg_errcode_t
3741 mark_opt_subexp (void *extra, bin_tree_t *node)
3742 {
3743   int idx = (int) (long) extra;
3744   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3745     node->token.opt_subexp = 1;
3746
3747   return REG_NOERROR;
3748 }
3749
3750 /* Free the allocated memory inside NODE. */
3751
3752 static void
3753 free_token (re_token_t *node)
3754 {
3755 #ifdef RE_ENABLE_I18N
3756   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3757     free_charset (node->opr.mbcset);
3758   else
3759 #endif /* RE_ENABLE_I18N */
3760     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3761       re_free (node->opr.sbcset);
3762 }
3763
3764 /* Worker function for tree walking.  Free the allocated memory inside NODE
3765    and its children. */
3766
3767 static reg_errcode_t
3768 free_tree (void *extra, bin_tree_t *node)
3769 {
3770   free_token (&node->token);
3771   return REG_NOERROR;
3772 }
3773
3774
3775 /* Duplicate the node SRC, and return new node.  This is a preorder
3776    visit similar to the one implemented by the generic visitor, but
3777    we need more infrastructure to maintain two parallel trees --- so,
3778    it's easier to duplicate.  */
3779
3780 static bin_tree_t *
3781 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3782 {
3783   const bin_tree_t *node;
3784   bin_tree_t *dup_root;
3785   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3786
3787   for (node = root; ; )
3788     {
3789       /* Create a new tree and link it back to the current parent.  */
3790       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3791       if (*p_new == NULL)
3792         return NULL;
3793       (*p_new)->parent = dup_node;
3794       (*p_new)->token.duplicated = 1;
3795       dup_node = *p_new;
3796
3797       /* Go to the left node, or up and to the right.  */
3798       if (node->left)
3799         {
3800           node = node->left;
3801           p_new = &dup_node->left;
3802         }
3803       else
3804         {
3805           const bin_tree_t *prev = NULL;
3806           while (node->right == prev || node->right == NULL)
3807             {
3808               prev = node;
3809               node = node->parent;
3810               dup_node = dup_node->parent;
3811               if (!node)
3812                 return dup_root;
3813             }
3814           node = node->right;
3815           p_new = &dup_node->right;
3816         }
3817     }
3818 }