posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 #ifdef _LIBC
  23 # include <locale/weight.h>
  24 #endif
  25
  26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  27                                           size_t length, reg_syntax_t syntax);
  28 static void re_compile_fastmap_iter (regex_t *bufp,
  29                                      const re_dfastate_t *init_state,
  30                                      char *fastmap);
  31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  32 #ifdef RE_ENABLE_I18N
  33 static void free_charset (re_charset_t *cset);
  34 #endif /* RE_ENABLE_I18N */
  35 static void free_workarea_compile (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 #ifdef RE_ENABLE_I18N
  38 static void optimize_utf8 (re_dfa_t *dfa);
  39 #endif
  40 static reg_errcode_t analyze (regex_t *preg);
  41 static reg_errcode_t preorder (bin_tree_t *root,
  42                                reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                void *extra);
  44 static reg_errcode_t postorder (bin_tree_t *root,
  45                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  46                                 void *extra);
  47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  50                                  bin_tree_t *node);
  51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static int peek_token (re_token_t *token, re_string_t *input,
  64                         reg_syntax_t syntax);
  65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  66                           reg_syntax_t syntax, reg_errcode_t *err);
  67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  68                                   re_token_t *token, reg_syntax_t syntax,
  69                                   int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  71                                  re_token_t *token, reg_syntax_t syntax,
  72                                  int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  74                                      re_token_t *token, reg_syntax_t syntax,
  75                                      int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  77                                   re_token_t *token, reg_syntax_t syntax,
  78                                   int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  80                                  re_dfa_t *dfa, re_token_t *token,
  81                                  reg_syntax_t syntax, reg_errcode_t *err);
  82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  83                                       re_token_t *token, reg_syntax_t syntax,
  84                                       reg_errcode_t *err);
  85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  86                                             re_string_t *regexp,
  87                                             re_token_t *token, int token_len,
  88                                             re_dfa_t *dfa,
  89                                             reg_syntax_t syntax,
  90                                             int accept_hyphen);
  91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  92                                           re_string_t *regexp,
  93                                           re_token_t *token);
  94 #ifdef RE_ENABLE_I18N
  95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  96                                         re_charset_t *mbcset,
  97                                         int *equiv_class_alloc,
  98                                         const unsigned char *name);
  99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 100                                       bitset_t sbcset,
 101                                       re_charset_t *mbcset,
 102                                       int *char_class_alloc,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #else  /* not RE_ENABLE_I18N */
 106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 107                                         const unsigned char *name);
 108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 109                                       bitset_t sbcset,
 110                                       const unsigned char *class_name,
 111                                       reg_syntax_t syntax);
 112 #endif /* not RE_ENABLE_I18N */
 113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 114                                        RE_TRANSLATE_TYPE trans,
 115                                        const unsigned char *class_name,
 116                                        const unsigned char *extra,
 117                                        int non_match, reg_errcode_t *err);
 118 static bin_tree_t *create_tree (re_dfa_t *dfa,
 119                                 bin_tree_t *left, bin_tree_t *right,
 120                                 re_token_type_t type);
 121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 122                                       bin_tree_t *left, bin_tree_t *right,
 123                                       const re_token_t *token);
 124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 125 static void free_token (re_token_t *node);
 126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 128 \f
 129 /* This table gives an error message for each of the error codes listed
 130    in regex.h.  The order here has to be the same as there.
 131    POSIX doesn't require that we do anything for REG_NOERROR,
 132    but why not be nice?

        The messages are stored back to back in one char array, separated
        by explicit "\0" bytes.  Each REG_*_IDX macro is the byte offset of
        its message within the array: the previous offset plus sizeof of
        the previous message literal, which counts that literal's
        terminating NUL.  The text passed to sizeof must therefore stay
        identical to the text of the message it follows.  */
 133
 134 const char __re_error_msgid[] attribute_hidden =
 135   {
 136 #define REG_NOERROR_IDX 0
 137     gettext_noop ("Success")    /* REG_NOERROR */
 138     "\0"
 139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 140     gettext_noop ("No match")   /* REG_NOMATCH */
 141     "\0"
 142 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 143     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 144     "\0"
 145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 146     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 147     "\0"
 148 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 149     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 150     "\0"
 151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 152     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 153     "\0"
 154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 155     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 156     "\0"
 157 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 158     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 159     "\0"
 160 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 161     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 162     "\0"
 163 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 164     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 165     "\0"
 166 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 167     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 168     "\0"
 169 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 170     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 171     "\0"
 172 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 173     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 174     "\0"
 175 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 176     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 177     "\0"
 178 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 179     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 180     "\0"
 181 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 182     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 183     "\0"
 184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 185     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 186   };
 187 /* Byte offsets into __re_error_msgid, indexed by error code (reg_errcode_t).  */
 188 const size_t __re_error_msgid_idx[] attribute_hidden =
 189   {
 190     REG_NOERROR_IDX,
 191     REG_NOMATCH_IDX,
 192     REG_BADPAT_IDX,
 193     REG_ECOLLATE_IDX,
 194     REG_ECTYPE_IDX,
 195     REG_EESCAPE_IDX,
 196     REG_ESUBREG_IDX,
 197     REG_EBRACK_IDX,
 198     REG_EPAREN_IDX,
 199     REG_EBRACE_IDX,
 200     REG_BADBR_IDX,
 201     REG_ERANGE_IDX,
 202     REG_ESPACE_IDX,
 203     REG_BADRPT_IDX,
 204     REG_EEND_IDX,
 205     REG_ESIZE_IDX,
 206     REG_ERPAREN_IDX
 207   };
 208 \f
 209 /* Entry points for GNU code.  */
 210
 211 /* re_compile_pattern is the GNU regular expression compiler: it
 212    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 213    Returns 0 if the pattern was valid, otherwise an error string.
 214
 215    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 216    are set in BUFP on entry.  */
 217
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 {
 222   reg_errcode_t ret;
 223
 224   /* And GNU code determines whether or not to get register information
 225      by passing null for the REGS argument to re_match, etc., not by
 226      setting no_sub, unless RE_NO_SUB is set.  */
 227   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 228
 229   /* Match anchors at newline.  */
 230   bufp->newline_anchor = 1;
 231
 232   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 233
 234   if (!ret)
 235     return NULL;
 236   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 237 }
 238 #ifdef _LIBC
 239 weak_alias (__re_compile_pattern, re_compile_pattern)
 240 #endif
 241
 242 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 243    also be assigned to arbitrarily: each pattern buffer stores its own
 244    syntax, so it can be changed between regex compilations.  */
 245 /* This has no initializer because initialized variables in Emacs
 246    become read-only after dumping.  */
 247 reg_syntax_t re_syntax_options;
 248
 249
 250 /* Specify the precise syntax of regexps for compilation.  This provides
 251    for compatibility for various utilities which historically have
 252    different, incompatible syntaxes.
 253
 254    The argument SYNTAX is a bit mask comprised of the various bits
 255    defined in regex.h.  We return the old syntax.  */
 256
 257 reg_syntax_t
 258 re_set_syntax (reg_syntax_t syntax)
 259 {
 260   reg_syntax_t ret = re_syntax_options;
 261
 262   re_syntax_options = syntax;
 263   return ret;
 264 }
 265 #ifdef _LIBC
 266 weak_alias (__re_set_syntax, re_set_syntax)
 267 #endif
 268
 269 int
 270 re_compile_fastmap (struct re_pattern_buffer *bufp)
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 __attribute__ ((always_inline))
 292 re_set_fastmap (char *fastmap, bool icase, int ch)
 293 {
 294   fastmap[ch] = 1;
 295   if (icase)
 296     fastmap[tolower (ch)] = 1;
 297 }
 298
 299 /* Helper function for re_compile_fastmap.
 300    Compile fastmap for the initial_state INIT_STATE.  */
 301
 302 static void
 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 304                          char *fastmap)
 305 {
 306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 307   int node_cnt;
 308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 310     {
 311       int node = init_state->nodes.elems[node_cnt];
 312       re_token_type_t type = dfa->nodes[node].type;
 313
 314       if (type == CHARACTER)
 315         {
 316           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317 #ifdef RE_ENABLE_I18N
 318           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 319             {
 320               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 321               wchar_t wc;
 322               mbstate_t state;
 323
 324               p = buf;
 325               *p++ = dfa->nodes[node].opr.c;
 326               while (++node < dfa->nodes_len
 327                      && dfa->nodes[node].type == CHARACTER
 328                      && dfa->nodes[node].mb_partial)
 329                 *p++ = dfa->nodes[node].opr.c;
 330               memset (&state, '\0', sizeof (state));
 331               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 332                              &state) == p - buf
 333                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 334                       != (size_t) -1))
 335                 re_set_fastmap (fastmap, 0, buf[0]);
 336             }
 337 #endif
 338         }
 339       else if (type == SIMPLE_BRACKET)
 340         {
 341           int i, ch;
 342           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 343             {
 344               int j;
 345               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 346               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 347                 if (w & ((bitset_word_t) 1 << j))
 348                   re_set_fastmap (fastmap, icase, ch);
 349             }
 350         }
 351 #ifdef RE_ENABLE_I18N
 352       else if (type == COMPLEX_BRACKET)
 353         {
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           int i;
 356
 357 # ifdef _LIBC
 358           /* See if we have to try all bytes which start multiple collation
 359              elements.
 360              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 361                   collation element, and don't catch 'b' since 'b' is
 362                   the only collation element which starts from 'b' (and
 363                   it is caught by SIMPLE_BRACKET).  */
 364               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 365                   && (cset->ncoll_syms || cset->nranges))
 366                 {
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # endif /* _LIBC */
 374
 375           /* See if we have to start the match at all multibyte characters,
 376              i.e. where we would not find an invalid sequence.  This only
 377              applies to multibyte character sets; for single byte character
 378              sets, the SIMPLE_BRACKET again suffices.  */
 379           if (dfa->mb_cur_max > 1
 380               && (cset->nchar_classes || cset->non_match || cset->nranges
 381 # ifdef _LIBC
 382                   || cset->nequiv_classes
 383 # endif /* _LIBC */
 384                  ))
 385             {
 386               unsigned char c = 0;
 387               do
 388                 {
 389                   mbstate_t mbs;
 390                   memset (&mbs, 0, sizeof (mbs));
 391                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 392                     re_set_fastmap (fastmap, false, (int) c);
 393                 }
 394               while (++c != 0);
 395             }
 396
 397           else
 398             {
 399               /* ... Else catch all bytes which can start the mbchars.  */
 400               for (i = 0; i < cset->nmbchars; ++i)
 401                 {
 402                   char buf[256];
 403                   mbstate_t state;
 404                   memset (&state, '\0', sizeof (state));
 405                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                     {
 409                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 410                           != (size_t) -1)
 411                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 412                     }
 413                 }
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == OP_PERIOD
 418 #ifdef RE_ENABLE_I18N
 419                || type == OP_UTF8_PERIOD
 420 #endif /* RE_ENABLE_I18N */
 421                || type == END_OF_RE)
 422         {
 423           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 424           if (type == END_OF_RE)
 425             bufp->can_be_null = 1;
 426           return;
 427         }
 428     }
 429 }
 430 \f
 431 /* Entry point for POSIX code.  */
 432 /* regcomp takes a regular expression as a string and compiles it.
 433
 434    PREG is a regex_t *.  We do not expect any fields to be initialized,
 435    since POSIX says we shouldn't.  Thus, we set
 436
 437      'buffer' to the compiled pattern;
 438      'used' to the length of the compiled pattern;
 439      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 441        RE_SYNTAX_POSIX_BASIC;
 442      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 443      'fastmap' to an allocated space for the fastmap;
 444      'fastmap_accurate' to zero;
 445      're_nsub' to the number of subexpressions in PATTERN.
 446
 447    PATTERN is the address of the pattern string.
 448
 449    CFLAGS is a series of bits which affect compilation.
 450
 451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 452      use POSIX basic syntax.
 453
 454      If REG_NEWLINE is set, then . and [^...] don't match newline.
 455      Also, regexec will try a match beginning after every newline.
 456
 457      If REG_ICASE is set, then we consider upper- and lowercase
 458      versions of letters to be equivalent when matching.
 459
 460      If REG_NOSUB is set, then when PREG is passed to regexec, that
 461      routine will report only success or failure, and nothing about the
 462      registers.
 463
 464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 465    the return codes and their meanings.)  */
 466
 467 int
 468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 469 {
 470   reg_errcode_t ret;
 471   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 472                          : RE_SYNTAX_POSIX_BASIC);
 473
 474   preg->buffer = NULL;
 475   preg->allocated = 0;
 476   preg->used = 0;
 477
 478   /* Try to allocate space for the fastmap.  */
 479   preg->fastmap = re_malloc (char, SBC_MAX);
 480   if (BE (preg->fastmap == NULL, 0))
 481     return REG_ESPACE;
 482
 483   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 484
 485   /* If REG_NEWLINE is set, newlines are treated differently.  */
 486   if (cflags & REG_NEWLINE)
 487     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 488       syntax &= ~RE_DOT_NEWLINE;
 489       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 490       /* It also changes the matching behavior.  */
 491       preg->newline_anchor = 1;
 492     }
 493   else
 494     preg->newline_anchor = 0;
 495   preg->no_sub = !!(cflags & REG_NOSUB);
 496   preg->translate = NULL;
 497
 498   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 499
 500   /* POSIX doesn't distinguish between an unmatched open-group and an
 501      unmatched close-group: both are REG_EPAREN.  */
 502   if (ret == REG_ERPAREN)
 503     ret = REG_EPAREN;
 504
 505   /* We have already checked preg->fastmap != NULL.  */
 506   if (BE (ret == REG_NOERROR, 1))
 507     /* Compute the fastmap now, since regexec cannot modify the pattern
 508        buffer.  This function never fails in this implementation.  */
 509     (void) re_compile_fastmap (preg);
 510   else
 511     {
 512       /* Some error occurred while compiling the expression.  */
 513       re_free (preg->fastmap);
 514       preg->fastmap = NULL;
 515     }
 516
 517   return (int) ret;
 518 }
 519 #ifdef _LIBC
 520 libc_hidden_def (__regcomp)
 521 weak_alias (__regcomp, regcomp)
 522 #endif
 523
 524 /* Returns a message corresponding to an error code, ERRCODE, returned
 525    from either regcomp or regexec.   We don't use PREG here.  */
 526
 527 size_t
 528 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 529           size_t errbuf_size)
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568
 569 #ifdef RE_ENABLE_I18N
 570 /* This static array is used for the map to single-byte characters when
 571    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 572    it the same all the time.  UTF-8 is the preferred encoding so this is
 573    a worthwhile optimization.  */
 574 static const bitset_t utf8_sb_map =
 575 {
 576   /* Set the first 128 bits.  */
 577   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 578 };
 579 #endif
 580
 581
 582 static void
 583 free_dfa_content (re_dfa_t *dfa)
 584 {
 585   int i, j;
 586
 587   if (dfa->nodes)
 588     for (i = 0; i < dfa->nodes_len; ++i)
 589       free_token (dfa->nodes + i);
 590   re_free (dfa->nexts);
 591   for (i = 0; i < dfa->nodes_len; ++i)
 592     {
 593       if (dfa->eclosures != NULL)
 594         re_node_set_free (dfa->eclosures + i);
 595       if (dfa->inveclosures != NULL)
 596         re_node_set_free (dfa->inveclosures + i);
 597       if (dfa->edests != NULL)
 598         re_node_set_free (dfa->edests + i);
 599     }
 600   re_free (dfa->edests);
 601   re_free (dfa->eclosures);
 602   re_free (dfa->inveclosures);
 603   re_free (dfa->nodes);
 604
 605   if (dfa->state_table)
 606     for (i = 0; i <= dfa->state_hash_mask; ++i)
 607       {
 608         struct re_state_table_entry *entry = dfa->state_table + i;
 609         for (j = 0; j < entry->num; ++j)
 610           {
 611             re_dfastate_t *state = entry->array[j];
 612             free_state (state);
 613           }
 614         re_free (entry->array);
 615       }
 616   re_free (dfa->state_table);
 617 #ifdef RE_ENABLE_I18N
 618   if (dfa->sb_char != utf8_sb_map)
 619     re_free (dfa->sb_char);
 620 #endif
 621   re_free (dfa->subexp_map);
 622 #ifdef DEBUG
 623   re_free (dfa->re_str);
 624 #endif
 625
 626   re_free (dfa);
 627 }
 628
 629
 630 /* Free dynamically allocated space used by PREG.  */
 631
 632 void
 633 regfree (regex_t *preg)
 634 {
 635   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 636   if (BE (dfa != NULL, 1))
 637     free_dfa_content (dfa);
 638   preg->buffer = NULL;
 639   preg->allocated = 0;
 640
 641   re_free (preg->fastmap);
 642   preg->fastmap = NULL;
 643
 644   re_free (preg->translate);
 645   preg->translate = NULL;
 646 }
 647 #ifdef _LIBC
 648 libc_hidden_def (__regfree)
 649 weak_alias (__regfree, regfree)
 650 #endif
 651 \f
 652 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 653    them unless specifically requested.  */
 654
 655 #if defined _REGEX_RE_COMP || defined _LIBC
 656
 657 /* BSD has one and only one pattern buffer.  */
 658 static struct re_pattern_buffer re_comp_buf;
 659
 660 char *
 661 # ifdef _LIBC
 662 /* Make these definitions weak in libc, so POSIX programs can redefine
 663    these names if they don't use our functions, and still use
 664    regcomp/regexec above without link errors.  */
 665 weak_function
 666 # endif
 667 re_comp (const char *s)
 668 {
 669   reg_errcode_t ret;
 670   char *fastmap;
 671
 672   if (!s)
 673     {
 674       if (!re_comp_buf.buffer)
 675         return gettext ("No previous regular expression");
 676       return 0;
 677     }
 678
 679   if (re_comp_buf.buffer)
 680     {
 681       fastmap = re_comp_buf.fastmap;
 682       re_comp_buf.fastmap = NULL;
 683       __regfree (&re_comp_buf);
 684       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 685       re_comp_buf.fastmap = fastmap;
 686     }
 687
 688   if (re_comp_buf.fastmap == NULL)
 689     {
 690       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 691       if (re_comp_buf.fastmap == NULL)
 692         return (char *) gettext (__re_error_msgid
 693                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 694     }
 695
 696   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 697      don't need to initialize the pattern buffer fields which affect it.  */
 698
 699   /* Match anchors at newlines.  */
 700   re_comp_buf.newline_anchor = 1;
 701
 702   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 703
 704   if (!ret)
 705     return NULL;
 706
 707   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 708   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 709 }
 710
 711 #ifdef _LIBC
 712 libc_freeres_fn (free_mem)
 713 {
 714   __regfree (&re_comp_buf);
 715 }
 716 #endif
 717
 718 #endif /* _REGEX_RE_COMP */
 719 \f
 720 /* Internal entry point.
 721    Compile the regular expression PATTERN, whose length is LENGTH.
 722    SYNTAX indicate regular expression's syntax.  */
 723
 724 static reg_errcode_t
 725 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 726                      reg_syntax_t syntax)
 727 {
 728   reg_errcode_t err = REG_NOERROR;
 729   re_dfa_t *dfa;
 730   re_string_t regexp;
 731
 732   /* Initialize the pattern buffer.  */
 733   preg->fastmap_accurate = 0;
 734   preg->syntax = syntax;
 735   preg->not_bol = preg->not_eol = 0;
 736   preg->used = 0;
 737   preg->re_nsub = 0;
 738   preg->can_be_null = 0;
 739   preg->regs_allocated = REGS_UNALLOCATED;
 740
 741   /* Initialize the dfa.  */
 742   dfa = (re_dfa_t *) preg->buffer;
 743   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 744     {
 745       /* If zero allocated, but buffer is non-null, try to realloc
 746          enough space.  This loses if buffer's address is bogus, but
 747          that is the user's responsibility.  If ->buffer is NULL this
 748          is a simple allocation.  */
 749       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 750       if (dfa == NULL)
 751         return REG_ESPACE;
 752       preg->allocated = sizeof (re_dfa_t);
 753       preg->buffer = (unsigned char *) dfa;
 754     }
 755   preg->used = sizeof (re_dfa_t);
 756
 757   err = init_dfa (dfa, length);
 758   if (BE (err != REG_NOERROR, 0))
 759     {
 760       free_dfa_content (dfa);
 761       preg->buffer = NULL;
 762       preg->allocated = 0;
 763       return err;
 764     }
 765 #ifdef DEBUG
 766   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 767   dfa->re_str = re_malloc (char, length + 1);
 768   strncpy (dfa->re_str, pattern, length + 1);
 769 #endif
 770
 771   __libc_lock_init (dfa->lock);
 772
 773   err = re_string_construct (&regexp, pattern, length, preg->translate,
 774                              syntax & RE_ICASE, dfa);
 775   if (BE (err != REG_NOERROR, 0))
 776     {
 777     re_compile_internal_free_return:
 778       free_workarea_compile (preg);
 779       re_string_destruct (&regexp);
 780       free_dfa_content (dfa);
 781       preg->buffer = NULL;
 782       preg->allocated = 0;
 783       return err;
 784     }
 785
 786   /* Parse the regular expression, and build a structure tree.  */
 787   preg->re_nsub = 0;
 788   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 789   if (BE (dfa->str_tree == NULL, 0))
 790     goto re_compile_internal_free_return;
 791
 792   /* Analyze the tree and create the nfa.  */
 793   err = analyze (preg);
 794   if (BE (err != REG_NOERROR, 0))
 795     goto re_compile_internal_free_return;
 796
 797 #ifdef RE_ENABLE_I18N
 798   /* If possible, do searching in single byte encoding to speed things up.  */
 799   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 800     optimize_utf8 (dfa);
 801 #endif
 802
 803   /* Then create the initial state of the dfa.  */
 804   err = create_initial_state (dfa);
 805
 806   /* Release work areas.  */
 807   free_workarea_compile (preg);
 808   re_string_destruct (&regexp);
 809
 810   if (BE (err != REG_NOERROR, 0))
 811     {
 812       free_dfa_content (dfa);
 813       preg->buffer = NULL;
 814       preg->allocated = 0;
 815     }
 816
 817   return err;
 818 }
 819
 820 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 821    as the initial length of some arrays.  */
 822
 823 static reg_errcode_t
 824 init_dfa (re_dfa_t *dfa, size_t pat_len)
 825 {
 826   unsigned int table_size;
 827 #ifndef _LIBC
 828   char *codeset_name;
 829 #endif
 830
 831   memset (dfa, '\0', sizeof (re_dfa_t));
 832
 833   /* Force allocation of str_tree_storage the first time.  */
 834   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 835
 836   /* Avoid overflows.  */
 837   if (pat_len == SIZE_MAX)
 838     return REG_ESPACE;
 839
 840   dfa->nodes_alloc = pat_len + 1;
 841   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 842
 843   /*  table_size = 2 ^ ceil(log pat_len) */
 844   for (table_size = 1; ; table_size <<= 1)
 845     if (table_size > pat_len)
 846       break;
 847
 848   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 849   dfa->state_hash_mask = table_size - 1;
 850
 851   dfa->mb_cur_max = MB_CUR_MAX;
 852 #ifdef _LIBC
 853   if (dfa->mb_cur_max == 6
 854       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 855     dfa->is_utf8 = 1;
 856   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 857                        != 0);
 858 #else
 859 # ifdef HAVE_LANGINFO_CODESET
 860   codeset_name = nl_langinfo (CODESET);
 861 # else
 862   codeset_name = getenv ("LC_ALL");
 863   if (codeset_name == NULL || codeset_name[0] == '\0')
 864     codeset_name = getenv ("LC_CTYPE");
 865   if (codeset_name == NULL || codeset_name[0] == '\0')
 866     codeset_name = getenv ("LANG");
 867   if (codeset_name == NULL)
 868     codeset_name = "";
 869   else if (strchr (codeset_name, '.') !=  NULL)
 870     codeset_name = strchr (codeset_name, '.') + 1;
 871 # endif
 872
 873   if (strcasecmp (codeset_name, "UTF-8") == 0
 874       || strcasecmp (codeset_name, "UTF8") == 0)
 875     dfa->is_utf8 = 1;
 876
 877   /* We check exhaustively in the loop below if this charset is a
 878      superset of ASCII.  */
 879   dfa->map_notascii = 0;
 880 #endif
 881
 882 #ifdef RE_ENABLE_I18N
 883   if (dfa->mb_cur_max > 1)
 884     {
 885       if (dfa->is_utf8)
 886         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 887       else
 888         {
 889           int i, j, ch;
 890
 891           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 892           if (BE (dfa->sb_char == NULL, 0))
 893             return REG_ESPACE;
 894
 895           /* Set the bits corresponding to single byte chars.  */
 896           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 897             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 898               {
 899                 wint_t wch = __btowc (ch);
 900                 if (wch != WEOF)
 901                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 902 # ifndef _LIBC
 903                 if (isascii (ch) && wch != ch)
 904                   dfa->map_notascii = 1;
 905 # endif
 906               }
 907         }
 908     }
 909 #endif
 910
 911   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 912     return REG_ESPACE;
 913   return REG_NOERROR;
 914 }
 915
 916 /* Initialize WORD_CHAR table, which indicate which character is
 917    "word".  In this case "word" means that it is the word construction
 918    character used by some operators like "\<", "\>", etc.  */
 919
 920 static void
 921 init_word_char (re_dfa_t *dfa)
 922 {
 923   dfa->word_ops_used = 1;
 924   int i = 0;
 925   int ch = 0;
 926   if (BE (dfa->map_notascii == 0, 1))
 927     {
 928       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 929          them, an issue when this code is used in Gnulib.  */
 930       bitset_word_t bits0 = 0x00000000;
 931       bitset_word_t bits1 = 0x03ff0000;
 932       bitset_word_t bits2 = 0x87fffffe;
 933       bitset_word_t bits3 = 0x07fffffe;
 934       if (BITSET_WORD_BITS == 64)
 935         {
 936           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 937           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 938           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 939           i = 2;
 940         }
 941       else if (BITSET_WORD_BITS == 32)
 942         {
 943           dfa->word_char[0] = bits0;
 944           dfa->word_char[1] = bits1;
 945           dfa->word_char[2] = bits2;
 946           dfa->word_char[3] = bits3;
 947           i = 4;
 948         }
 949       else
 950         goto general_case;
 951       ch = 128;
 952
 953       if (BE (dfa->is_utf8, 1))
 954         {
 955           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 956           return;
 957         }
 958     }
 959
 960  general_case:
 961   for (; i < BITSET_WORDS; ++i)
 962     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 963       if (isalnum (ch) || ch == '_')
 964         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 965 }
 966
 967 /* Free the work area which are only used while compiling.  */
 968
 969 static void
 970 free_workarea_compile (regex_t *preg)
 971 {
 972   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 973   bin_tree_storage_t *storage, *next;
 974   for (storage = dfa->str_tree_storage; storage; storage = next)
 975     {
 976       next = storage->next;
 977       re_free (storage);
 978     }
 979   dfa->str_tree_storage = NULL;
 980   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 981   dfa->str_tree = NULL;
 982   re_free (dfa->org_indices);
 983   dfa->org_indices = NULL;
 984 }
 985
 986 /* Create initial states for all contexts.  */
 987
 988 static reg_errcode_t
 989 create_initial_state (re_dfa_t *dfa)
 990 {
 991   int first, i;
 992   reg_errcode_t err;
 993   re_node_set init_nodes;
 994
 995   /* Initial states have the epsilon closure of the node which is
 996      the first node of the regular expression.  */
 997   first = dfa->str_tree->first->node_idx;
 998   dfa->init_node = first;
 999   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1000   if (BE (err != REG_NOERROR, 0))
1001     return err;
1002
1003   /* The back-references which are in initial states can epsilon transit,
1004      since in this case all of the subexpressions can be null.
1005      Then we add epsilon closures of the nodes which are the next nodes of
1006      the back-references.  */
1007   if (dfa->nbackref > 0)
1008     for (i = 0; i < init_nodes.nelem; ++i)
1009       {
1010         int node_idx = init_nodes.elems[i];
1011         re_token_type_t type = dfa->nodes[node_idx].type;
1012
1013         int clexp_idx;
1014         if (type != OP_BACK_REF)
1015           continue;
1016         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1017           {
1018             re_token_t *clexp_node;
1019             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1020             if (clexp_node->type == OP_CLOSE_SUBEXP
1021                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1022               break;
1023           }
1024         if (clexp_idx == init_nodes.nelem)
1025           continue;
1026
1027         if (type == OP_BACK_REF)
1028           {
1029             int dest_idx = dfa->edests[node_idx].elems[0];
1030             if (!re_node_set_contains (&init_nodes, dest_idx))
1031               {
1032                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1033                                                        dfa->eclosures
1034                                                        + dest_idx);
1035                 if (err != REG_NOERROR)
1036                   return err;
1037                 i = 0;
1038               }
1039           }
1040       }
1041
1042   /* It must be the first time to invoke acquire_state.  */
1043   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1044   /* We don't check ERR here, since the initial state must not be NULL.  */
1045   if (BE (dfa->init_state == NULL, 0))
1046     return err;
1047   if (dfa->init_state->has_constraint)
1048     {
1049       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1050                                                        CONTEXT_WORD);
1051       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1052                                                      CONTEXT_NEWLINE);
1053       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1054                                                          &init_nodes,
1055                                                          CONTEXT_NEWLINE
1056                                                          | CONTEXT_BEGBUF);
1057       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1058               || dfa->init_state_begbuf == NULL, 0))
1059         return err;
1060     }
1061   else
1062     dfa->init_state_word = dfa->init_state_nl
1063       = dfa->init_state_begbuf = dfa->init_state;
1064
1065   re_node_set_free (&init_nodes);
1066   return REG_NOERROR;
1067 }
1068 \f
#ifdef RE_ENABLE_I18N
/* Switch DFA to single-byte searching when the compiled pattern allows
   it, to speed things up compared with UTF-8 matching.

   The node array is scanned once.  The function returns without
   changing anything if it meets an anchor other than a line or buffer
   boundary, a COMPLEX_BRACKET, or a SIMPLE_BRACKET with any bit set for
   a byte at or above 0x80; it aborts on a node type it does not expect.
   Otherwise mb_cur_max is set to 1, is_utf8 is cleared, and the nodes
   that need it are adjusted: non-ASCII CHARACTER nodes get mb_partial
   cleared and OP_PERIOD nodes become OP_UTF8_PERIOD.  */

static void
optimize_utf8 (re_dfa_t *dfa)
{
  int idx, word;
  int saw_mb_char = 0, saw_period = 0;

  for (idx = 0; idx < dfa->nodes_len; ++idx)
    {
      const re_token_t *tok = dfa->nodes + idx;

      switch (tok->type)
        {
        case CHARACTER:
          if (tok->opr.c >= 0x80)
            saw_mb_char = 1;
          break;

        case ANCHOR:
          /* Word anchors etc. cannot be handled.  It's okay to test
             opr.ctx_type since constraints (for all DFA nodes) are
             created by ORing one or more opr.ctx_type values.  */
          if (tok->opr.ctx_type != LINE_FIRST
              && tok->opr.ctx_type != LINE_LAST
              && tok->opr.ctx_type != BUF_FIRST
              && tok->opr.ctx_type != BUF_LAST)
            return;
          break;

        case OP_PERIOD:
          saw_period = 1;
          break;

        case OP_BACK_REF:
        case OP_ALT:
        case END_OF_RE:
        case OP_DUP_ASTERISK:
        case OP_OPEN_SUBEXP:
        case OP_CLOSE_SUBEXP:
          break;

        case COMPLEX_BRACKET:
          return;

        case SIMPLE_BRACKET:
          /* Just double check.  The non-ASCII range starts at 0x80.  */
          assert (0x80 % BITSET_WORD_BITS == 0);
          for (word = 0x80 / BITSET_WORD_BITS; word < BITSET_WORDS; ++word)
            if (tok->opr.sbcset[word])
              return;
          break;

        default:
          abort ();
        }
    }

  /* Second pass, needed only if something has to be rewritten.  */
  if (saw_mb_char || saw_period)
    for (idx = 0; idx < dfa->nodes_len; ++idx)
      {
        re_token_t *tok = dfa->nodes + idx;

        if (tok->type == CHARACTER)
          {
            if (tok->opr.c >= 0x80)
              tok->mb_partial = 0;
          }
        else if (tok->type == OP_PERIOD)
          tok->type = OP_UTF8_PERIOD;
      }

  /* The search can be in single byte locale.  */
  dfa->mb_cur_max = 1;
  dfa->is_utf8 = 0;
  dfa->has_mb_node = dfa->nbackref > 0 || saw_period;
}
#endif
1140 \f
1141 /* Analyze the structure tree, and calculate "first", "next", "edest",
1142    "eclosure", and "inveclosure".  */
1143
1144 static reg_errcode_t
1145 analyze (regex_t *preg)
1146 {
1147   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1148   reg_errcode_t ret;
1149
1150   /* Allocate arrays.  */
1151   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1152   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1153   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1154   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1155   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1156           || dfa->eclosures == NULL, 0))
1157     return REG_ESPACE;
1158
1159   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1160   if (dfa->subexp_map != NULL)
1161     {
1162       int i;
1163       for (i = 0; i < preg->re_nsub; i++)
1164         dfa->subexp_map[i] = i;
1165       preorder (dfa->str_tree, optimize_subexps, dfa);
1166       for (i = 0; i < preg->re_nsub; i++)
1167         if (dfa->subexp_map[i] != i)
1168           break;
1169       if (i == preg->re_nsub)
1170         {
1171           free (dfa->subexp_map);
1172           dfa->subexp_map = NULL;
1173         }
1174     }
1175
1176   ret = postorder (dfa->str_tree, lower_subexps, preg);
1177   if (BE (ret != REG_NOERROR, 0))
1178     return ret;
1179   ret = postorder (dfa->str_tree, calc_first, dfa);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   preorder (dfa->str_tree, calc_next, dfa);
1183   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1184   if (BE (ret != REG_NOERROR, 0))
1185     return ret;
1186   ret = calc_eclosure (dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189
1190   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1191      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1192   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1193       || dfa->nbackref)
1194     {
1195       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1196       if (BE (dfa->inveclosures == NULL, 0))
1197         return REG_ESPACE;
1198       ret = calc_inveclosure (dfa);
1199     }
1200
1201   return ret;
1202 }
1203
1204 /* Our parse trees are very unbalanced, so we cannot use a stack to
1205    implement parse tree visits.  Instead, we use parent pointers and
1206    some hairy code in these two functions.  */
1207 static reg_errcode_t
1208 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1209            void *extra)
1210 {
1211   bin_tree_t *node, *prev;
1212
1213   for (node = root; ; )
1214     {
1215       /* Descend down the tree, preferably to the left (or to the right
1216          if that's the only child).  */
1217       while (node->left || node->right)
1218         if (node->left)
1219           node = node->left;
1220         else
1221           node = node->right;
1222
1223       do
1224         {
1225           reg_errcode_t err = fn (extra, node);
1226           if (BE (err != REG_NOERROR, 0))
1227             return err;
1228           if (node->parent == NULL)
1229             return REG_NOERROR;
1230           prev = node;
1231           node = node->parent;
1232         }
1233       /* Go up while we have a node that is reached from the right.  */
1234       while (node->right == prev || node->right == NULL);
1235       node = node->right;
1236     }
1237 }
1238
1239 static reg_errcode_t
1240 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1241           void *extra)
1242 {
1243   bin_tree_t *node;
1244
1245   for (node = root; ; )
1246     {
1247       reg_errcode_t err = fn (extra, node);
1248       if (BE (err != REG_NOERROR, 0))
1249         return err;
1250
1251       /* Go to the left node, or up and to the right.  */
1252       if (node->left)
1253         node = node->left;
1254       else
1255         {
1256           bin_tree_t *prev = NULL;
1257           while (node->right == prev || node->right == NULL)
1258             {
1259               prev = node;
1260               node = node->parent;
1261               if (!node)
1262                 return REG_NOERROR;
1263             }
1264           node = node->right;
1265         }
1266     }
1267 }
1268
1269 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1270    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1271    backreferences as well.  Requires a preorder visit.  */
1272 static reg_errcode_t
1273 optimize_subexps (void *extra, bin_tree_t *node)
1274 {
1275   re_dfa_t *dfa = (re_dfa_t *) extra;
1276
1277   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1278     {
1279       int idx = node->token.opr.idx;
1280       node->token.opr.idx = dfa->subexp_map[idx];
1281       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1282     }
1283
1284   else if (node->token.type == SUBEXP
1285            && node->left && node->left->token.type == SUBEXP)
1286     {
1287       int other_idx = node->left->token.opr.idx;
1288
1289       node->left = node->left->left;
1290       if (node->left)
1291         node->left->parent = node;
1292
1293       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1294       if (other_idx < BITSET_WORD_BITS)
1295           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1296     }
1297
1298   return REG_NOERROR;
1299 }
1300
1301 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1302    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1303 static reg_errcode_t
1304 lower_subexps (void *extra, bin_tree_t *node)
1305 {
1306   regex_t *preg = (regex_t *) extra;
1307   reg_errcode_t err = REG_NOERROR;
1308
1309   if (node->left && node->left->token.type == SUBEXP)
1310     {
1311       node->left = lower_subexp (&err, preg, node->left);
1312       if (node->left)
1313         node->left->parent = node;
1314     }
1315   if (node->right && node->right->token.type == SUBEXP)
1316     {
1317       node->right = lower_subexp (&err, preg, node->right);
1318       if (node->right)
1319         node->right->parent = node;
1320     }
1321
1322   return err;
1323 }
1324
1325 static bin_tree_t *
1326 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1327 {
1328   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1329   bin_tree_t *body = node->left;
1330   bin_tree_t *op, *cls, *tree1, *tree;
1331
1332   if (preg->no_sub
1333       /* We do not optimize empty subexpressions, because otherwise we may
1334          have bad CONCAT nodes with NULL children.  This is obviously not
1335          very common, so we do not lose much.  An example that triggers
1336          this case is the sed "script" /\(\)/x.  */
1337       && node->left != NULL
1338       && (node->token.opr.idx >= BITSET_WORD_BITS
1339           || !(dfa->used_bkref_map
1340                & ((bitset_word_t) 1 << node->token.opr.idx))))
1341     return node->left;
1342
1343   /* Convert the SUBEXP node to the concatenation of an
1344      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1345   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348   tree = create_tree (dfa, op, tree1, CONCAT);
1349   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1350     {
1351       *err = REG_ESPACE;
1352       return NULL;
1353     }
1354
1355   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357   return tree;
1358 }
1359
1360 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361    nodes.  Requires a postorder visit.  */
1362 static reg_errcode_t
1363 calc_first (void *extra, bin_tree_t *node)
1364 {
1365   re_dfa_t *dfa = (re_dfa_t *) extra;
1366   if (node->token.type == CONCAT)
1367     {
1368       node->first = node->left->first;
1369       node->node_idx = node->left->node_idx;
1370     }
1371   else
1372     {
1373       node->first = node;
1374       node->node_idx = re_dfa_add_node (dfa, node->token);
1375       if (BE (node->node_idx == -1, 0))
1376         return REG_ESPACE;
1377       if (node->token.type == ANCHOR)
1378         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1379     }
1380   return REG_NOERROR;
1381 }
1382
1383 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1384 static reg_errcode_t
1385 calc_next (void *extra, bin_tree_t *node)
1386 {
1387   switch (node->token.type)
1388     {
1389     case OP_DUP_ASTERISK:
1390       node->left->next = node;
1391       break;
1392     case CONCAT:
1393       node->left->next = node->right->first;
1394       node->right->next = node->next;
1395       break;
1396     default:
1397       if (node->left)
1398         node->left->next = node->next;
1399       if (node->right)
1400         node->right->next = node->next;
1401       break;
1402     }
1403   return REG_NOERROR;
1404 }
1405
1406 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1407 static reg_errcode_t
1408 link_nfa_nodes (void *extra, bin_tree_t *node)
1409 {
1410   re_dfa_t *dfa = (re_dfa_t *) extra;
1411   int idx = node->node_idx;
1412   reg_errcode_t err = REG_NOERROR;
1413
1414   switch (node->token.type)
1415     {
1416     case CONCAT:
1417       break;
1418
1419     case END_OF_RE:
1420       assert (node->next == NULL);
1421       break;
1422
1423     case OP_DUP_ASTERISK:
1424     case OP_ALT:
1425       {
1426         int left, right;
1427         dfa->has_plural_match = 1;
1428         if (node->left != NULL)
1429           left = node->left->first->node_idx;
1430         else
1431           left = node->next->node_idx;
1432         if (node->right != NULL)
1433           right = node->right->first->node_idx;
1434         else
1435           right = node->next->node_idx;
1436         assert (left > -1);
1437         assert (right > -1);
1438         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1439       }
1440       break;
1441
1442     case ANCHOR:
1443     case OP_OPEN_SUBEXP:
1444     case OP_CLOSE_SUBEXP:
1445       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1446       break;
1447
1448     case OP_BACK_REF:
1449       dfa->nexts[idx] = node->next->node_idx;
1450       if (node->token.type == OP_BACK_REF)
1451         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1452       break;
1453
1454     default:
1455       assert (!IS_EPSILON_NODE (node->token.type));
1456       dfa->nexts[idx] = node->next->node_idx;
1457       break;
1458     }
1459
1460   return err;
1461 }
1462
1463 /* Duplicate the epsilon closure of the node ROOT_NODE.
1464    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465    to their own constraint.  */
1466
1467 static reg_errcode_t
1468 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1469                         int root_node, unsigned int init_constraint)
1470 {
1471   int org_node, clone_node, ret;
1472   unsigned int constraint = init_constraint;
1473   for (org_node = top_org_node, clone_node = top_clone_node;;)
1474     {
1475       int org_dest, clone_dest;
1476       if (dfa->nodes[org_node].type == OP_BACK_REF)
1477         {
1478           /* If the back reference epsilon-transit, its destination must
1479              also have the constraint.  Then duplicate the epsilon closure
1480              of the destination of the back reference, and store it in
1481              edests of the back reference.  */
1482           org_dest = dfa->nexts[org_node];
1483           re_node_set_empty (dfa->edests + clone_node);
1484           clone_dest = duplicate_node (dfa, org_dest, constraint);
1485           if (BE (clone_dest == -1, 0))
1486             return REG_ESPACE;
1487           dfa->nexts[clone_node] = dfa->nexts[org_node];
1488           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489           if (BE (ret < 0, 0))
1490             return REG_ESPACE;
1491         }
1492       else if (dfa->edests[org_node].nelem == 0)
1493         {
1494           /* In case of the node can't epsilon-transit, don't duplicate the
1495              destination and store the original destination as the
1496              destination of the node.  */
1497           dfa->nexts[clone_node] = dfa->nexts[org_node];
1498           break;
1499         }
1500       else if (dfa->edests[org_node].nelem == 1)
1501         {
1502           /* In case of the node can epsilon-transit, and it has only one
1503              destination.  */
1504           org_dest = dfa->edests[org_node].elems[0];
1505           re_node_set_empty (dfa->edests + clone_node);
1506           /* If the node is root_node itself, it means the epsilon closure
1507              has a loop.   Then tie it to the destination of the root_node.  */
1508           if (org_node == root_node && clone_node != org_node)
1509             {
1510               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1511               if (BE (ret < 0, 0))
1512                 return REG_ESPACE;
1513               break;
1514             }
1515           /* In case the node has another constraint, append it.  */
1516           constraint |= dfa->nodes[org_node].constraint;
1517           clone_dest = duplicate_node (dfa, org_dest, constraint);
1518           if (BE (clone_dest == -1, 0))
1519             return REG_ESPACE;
1520           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521           if (BE (ret < 0, 0))
1522             return REG_ESPACE;
1523         }
1524       else /* dfa->edests[org_node].nelem == 2 */
1525         {
1526           /* In case of the node can epsilon-transit, and it has two
1527              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1528           org_dest = dfa->edests[org_node].elems[0];
1529           re_node_set_empty (dfa->edests + clone_node);
1530           /* Search for a duplicated node which satisfies the constraint.  */
1531           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1532           if (clone_dest == -1)
1533             {
1534               /* There is no such duplicated node, create a new one.  */
1535               reg_errcode_t err;
1536               clone_dest = duplicate_node (dfa, org_dest, constraint);
1537               if (BE (clone_dest == -1, 0))
1538                 return REG_ESPACE;
1539               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540               if (BE (ret < 0, 0))
1541                 return REG_ESPACE;
1542               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1543                                             root_node, constraint);
1544               if (BE (err != REG_NOERROR, 0))
1545                 return err;
1546             }
1547           else
1548             {
1549               /* There is a duplicated node which satisfies the constraint,
1550                  use it to avoid infinite loop.  */
1551               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552               if (BE (ret < 0, 0))
1553                 return REG_ESPACE;
1554             }
1555
1556           org_dest = dfa->edests[org_node].elems[1];
1557           clone_dest = duplicate_node (dfa, org_dest, constraint);
1558           if (BE (clone_dest == -1, 0))
1559             return REG_ESPACE;
1560           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561           if (BE (ret < 0, 0))
1562             return REG_ESPACE;
1563         }
1564       org_node = org_dest;
1565       clone_node = clone_dest;
1566     }
1567   return REG_NOERROR;
1568 }
1569
1570 /* Search for a node which is duplicated from the node ORG_NODE, and
1571    satisfies the constraint CONSTRAINT.  */
1572
1573 static int
1574 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1575                         unsigned int constraint)
1576 {
1577   int idx;
1578   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1579     {
1580       if (org_node == dfa->org_indices[idx]
1581           && constraint == dfa->nodes[idx].constraint)
1582         return idx; /* Found.  */
1583     }
1584   return -1; /* Not found.  */
1585 }
1586
1587 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1588    Return the index of the new node, or -1 if insufficient storage is
1589    available.  */
1590
1591 static int
1592 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1593 {
1594   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1595   if (BE (dup_idx != -1, 1))
1596     {
1597       dfa->nodes[dup_idx].constraint = constraint;
1598       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1599       dfa->nodes[dup_idx].duplicated = 1;
1600
1601       /* Store the index of the original node.  */
1602       dfa->org_indices[dup_idx] = org_idx;
1603     }
1604   return dup_idx;
1605 }
1606
1607 static reg_errcode_t
1608 calc_inveclosure (re_dfa_t *dfa)
1609 {
1610   int src, idx, ret;
1611   for (idx = 0; idx < dfa->nodes_len; ++idx)
1612     re_node_set_init_empty (dfa->inveclosures + idx);
1613
1614   for (src = 0; src < dfa->nodes_len; ++src)
1615     {
1616       int *elems = dfa->eclosures[src].elems;
1617       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1618         {
1619           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1620           if (BE (ret == -1, 0))
1621             return REG_ESPACE;
1622         }
1623     }
1624
1625   return REG_NOERROR;
1626 }
1627
1628 /* Calculate "eclosure" for all the node in DFA.  */
1629
1630 static reg_errcode_t
1631 calc_eclosure (re_dfa_t *dfa)
1632 {
1633   int node_idx, incomplete;
1634 #ifdef DEBUG
1635   assert (dfa->nodes_len > 0);
1636 #endif
1637   incomplete = 0;
1638   /* For each nodes, calculate epsilon closure.  */
1639   for (node_idx = 0; ; ++node_idx)
1640     {
1641       reg_errcode_t err;
1642       re_node_set eclosure_elem;
1643       if (node_idx == dfa->nodes_len)
1644         {
1645           if (!incomplete)
1646             break;
1647           incomplete = 0;
1648           node_idx = 0;
1649         }
1650
1651 #ifdef DEBUG
1652       assert (dfa->eclosures[node_idx].nelem != -1);
1653 #endif
1654
1655       /* If we have already calculated, skip it.  */
1656       if (dfa->eclosures[node_idx].nelem != 0)
1657         continue;
1658       /* Calculate epsilon closure of 'node_idx'.  */
1659       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1660       if (BE (err != REG_NOERROR, 0))
1661         return err;
1662
1663       if (dfa->eclosures[node_idx].nelem == 0)
1664         {
1665           incomplete = 1;
1666           re_node_set_free (&eclosure_elem);
1667         }
1668     }
1669   return REG_NOERROR;
1670 }
1671
1672 /* Calculate epsilon closure of NODE.  */
1673
1674 static reg_errcode_t
1675 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1676 {
1677   reg_errcode_t err;
1678   int i;
1679   re_node_set eclosure;
1680   int ret;
1681   int incomplete = 0;
1682   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1683   if (BE (err != REG_NOERROR, 0))
1684     return err;
1685
1686   /* This indicates that we are calculating this node now.
1687      We reference this value to avoid infinite loop.  */
1688   dfa->eclosures[node].nelem = -1;
1689
1690   /* If the current node has constraints, duplicate all nodes
1691      since they must inherit the constraints.  */
1692   if (dfa->nodes[node].constraint
1693       && dfa->edests[node].nelem
1694       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1695     {
1696       err = duplicate_node_closure (dfa, node, node, node,
1697                                     dfa->nodes[node].constraint);
1698       if (BE (err != REG_NOERROR, 0))
1699         return err;
1700     }
1701
1702   /* Expand each epsilon destination nodes.  */
1703   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1704     for (i = 0; i < dfa->edests[node].nelem; ++i)
1705       {
1706         re_node_set eclosure_elem;
1707         int edest = dfa->edests[node].elems[i];
1708         /* If calculating the epsilon closure of `edest' is in progress,
1709            return intermediate result.  */
1710         if (dfa->eclosures[edest].nelem == -1)
1711           {
1712             incomplete = 1;
1713             continue;
1714           }
1715         /* If we haven't calculated the epsilon closure of `edest' yet,
1716            calculate now. Otherwise use calculated epsilon closure.  */
1717         if (dfa->eclosures[edest].nelem == 0)
1718           {
1719             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1720             if (BE (err != REG_NOERROR, 0))
1721               return err;
1722           }
1723         else
1724           eclosure_elem = dfa->eclosures[edest];
1725         /* Merge the epsilon closure of 'edest'.  */
1726         err = re_node_set_merge (&eclosure, &eclosure_elem);
1727         if (BE (err != REG_NOERROR, 0))
1728           return err;
1729         /* If the epsilon closure of 'edest' is incomplete,
1730            the epsilon closure of this node is also incomplete.  */
1731         if (dfa->eclosures[edest].nelem == 0)
1732           {
1733             incomplete = 1;
1734             re_node_set_free (&eclosure_elem);
1735           }
1736       }
1737
1738   /* An epsilon closure includes itself.  */
1739   ret = re_node_set_insert (&eclosure, node);
1740   if (BE (ret < 0, 0))
1741     return REG_ESPACE;
1742   if (incomplete && !root)
1743     dfa->eclosures[node].nelem = 0;
1744   else
1745     dfa->eclosures[node] = eclosure;
1746   *new_set = eclosure;
1747   return REG_NOERROR;
1748 }
1749 \f
1750 /* Functions for token which are used in the parser.  */
1751
1752 /* Fetch a token from INPUT.
1753    We must not use this function inside bracket expressions.  */
1754
1755 static void
1756 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 {
1758   re_string_skip_bytes (input, peek_token (result, input, syntax));
1759 }
1760
1761 /* Peek a token from INPUT, and return the length of the token.
1762    We must not use this function inside bracket expressions.  */
1763
1764 static int
1765 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1766 {
1767   unsigned char c;
1768
1769   if (re_string_eoi (input))
1770     {
1771       token->type = END_OF_RE;
1772       return 0;
1773     }
1774
1775   c = re_string_peek_byte (input, 0);
1776   token->opr.c = c;
1777
1778   token->word_char = 0;
1779 #ifdef RE_ENABLE_I18N
1780   token->mb_partial = 0;
1781   if (input->mb_cur_max > 1 &&
1782       !re_string_first_byte (input, re_string_cur_idx (input)))
1783     {
1784       token->type = CHARACTER;
1785       token->mb_partial = 1;
1786       return 1;
1787     }
1788 #endif
1789   if (c == '\\')
1790     {
1791       unsigned char c2;
1792       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1793         {
1794           token->type = BACK_SLASH;
1795           return 1;
1796         }
1797
1798       c2 = re_string_peek_byte_case (input, 1);
1799       token->opr.c = c2;
1800       token->type = CHARACTER;
1801 #ifdef RE_ENABLE_I18N
1802       if (input->mb_cur_max > 1)
1803         {
1804           wint_t wc = re_string_wchar_at (input,
1805                                           re_string_cur_idx (input) + 1);
1806           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1807         }
1808       else
1809 #endif
1810         token->word_char = IS_WORD_CHAR (c2) != 0;
1811
1812       switch (c2)
1813         {
1814         case '|':
1815           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816             token->type = OP_ALT;
1817           break;
1818         case '1': case '2': case '3': case '4': case '5':
1819         case '6': case '7': case '8': case '9':
1820           if (!(syntax & RE_NO_BK_REFS))
1821             {
1822               token->type = OP_BACK_REF;
1823               token->opr.idx = c2 - '1';
1824             }
1825           break;
1826         case '<':
1827           if (!(syntax & RE_NO_GNU_OPS))
1828             {
1829               token->type = ANCHOR;
1830               token->opr.ctx_type = WORD_FIRST;
1831             }
1832           break;
1833         case '>':
1834           if (!(syntax & RE_NO_GNU_OPS))
1835             {
1836               token->type = ANCHOR;
1837               token->opr.ctx_type = WORD_LAST;
1838             }
1839           break;
1840         case 'b':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             {
1843               token->type = ANCHOR;
1844               token->opr.ctx_type = WORD_DELIM;
1845             }
1846           break;
1847         case 'B':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = NOT_WORD_DELIM;
1852             }
1853           break;
1854         case 'w':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             token->type = OP_WORD;
1857           break;
1858         case 'W':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             token->type = OP_NOTWORD;
1861           break;
1862         case 's':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_SPACE;
1865           break;
1866         case 'S':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_NOTSPACE;
1869           break;
1870         case '`':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             {
1873               token->type = ANCHOR;
1874               token->opr.ctx_type = BUF_FIRST;
1875             }
1876           break;
1877         case '\'':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = BUF_LAST;
1882             }
1883           break;
1884         case '(':
1885           if (!(syntax & RE_NO_BK_PARENS))
1886             token->type = OP_OPEN_SUBEXP;
1887           break;
1888         case ')':
1889           if (!(syntax & RE_NO_BK_PARENS))
1890             token->type = OP_CLOSE_SUBEXP;
1891           break;
1892         case '+':
1893           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894             token->type = OP_DUP_PLUS;
1895           break;
1896         case '?':
1897           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898             token->type = OP_DUP_QUESTION;
1899           break;
1900         case '{':
1901           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902             token->type = OP_OPEN_DUP_NUM;
1903           break;
1904         case '}':
1905           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906             token->type = OP_CLOSE_DUP_NUM;
1907           break;
1908         default:
1909           break;
1910         }
1911       return 2;
1912     }
1913
1914   token->type = CHARACTER;
1915 #ifdef RE_ENABLE_I18N
1916   if (input->mb_cur_max > 1)
1917     {
1918       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1920     }
1921   else
1922 #endif
1923     token->word_char = IS_WORD_CHAR (token->opr.c);
1924
1925   switch (c)
1926     {
1927     case '\n':
1928       if (syntax & RE_NEWLINE_ALT)
1929         token->type = OP_ALT;
1930       break;
1931     case '|':
1932       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1933         token->type = OP_ALT;
1934       break;
1935     case '*':
1936       token->type = OP_DUP_ASTERISK;
1937       break;
1938     case '+':
1939       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940         token->type = OP_DUP_PLUS;
1941       break;
1942     case '?':
1943       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944         token->type = OP_DUP_QUESTION;
1945       break;
1946     case '{':
1947       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948         token->type = OP_OPEN_DUP_NUM;
1949       break;
1950     case '}':
1951       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952         token->type = OP_CLOSE_DUP_NUM;
1953       break;
1954     case '(':
1955       if (syntax & RE_NO_BK_PARENS)
1956         token->type = OP_OPEN_SUBEXP;
1957       break;
1958     case ')':
1959       if (syntax & RE_NO_BK_PARENS)
1960         token->type = OP_CLOSE_SUBEXP;
1961       break;
1962     case '[':
1963       token->type = OP_OPEN_BRACKET;
1964       break;
1965     case '.':
1966       token->type = OP_PERIOD;
1967       break;
1968     case '^':
1969       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1970           re_string_cur_idx (input) != 0)
1971         {
1972           char prev = re_string_peek_byte (input, -1);
1973           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1974             break;
1975         }
1976       token->type = ANCHOR;
1977       token->opr.ctx_type = LINE_FIRST;
1978       break;
1979     case '$':
1980       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1981           re_string_cur_idx (input) + 1 != re_string_length (input))
1982         {
1983           re_token_t next;
1984           re_string_skip_bytes (input, 1);
1985           peek_token (&next, input, syntax);
1986           re_string_skip_bytes (input, -1);
1987           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988             break;
1989         }
1990       token->type = ANCHOR;
1991       token->opr.ctx_type = LINE_LAST;
1992       break;
1993     default:
1994       break;
1995     }
1996   return 1;
1997 }
1998
1999 /* Peek a token from INPUT, and return the length of the token.
2000    We must not use this function out of bracket expressions.  */
2001
2002 static int
2003 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2004 {
2005   unsigned char c;
2006   if (re_string_eoi (input))
2007     {
2008       token->type = END_OF_RE;
2009       return 0;
2010     }
2011   c = re_string_peek_byte (input, 0);
2012   token->opr.c = c;
2013
2014 #ifdef RE_ENABLE_I18N
2015   if (input->mb_cur_max > 1 &&
2016       !re_string_first_byte (input, re_string_cur_idx (input)))
2017     {
2018       token->type = CHARACTER;
2019       return 1;
2020     }
2021 #endif /* RE_ENABLE_I18N */
2022
2023   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2024       && re_string_cur_idx (input) + 1 < re_string_length (input))
2025     {
2026       /* In this case, '\' escape a character.  */
2027       unsigned char c2;
2028       re_string_skip_bytes (input, 1);
2029       c2 = re_string_peek_byte (input, 0);
2030       token->opr.c = c2;
2031       token->type = CHARACTER;
2032       return 1;
2033     }
2034   if (c == '[') /* '[' is a special char in a bracket exps.  */
2035     {
2036       unsigned char c2;
2037       int token_len;
2038       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2039         c2 = re_string_peek_byte (input, 1);
2040       else
2041         c2 = 0;
2042       token->opr.c = c2;
2043       token_len = 2;
2044       switch (c2)
2045         {
2046         case '.':
2047           token->type = OP_OPEN_COLL_ELEM;
2048           break;
2049         case '=':
2050           token->type = OP_OPEN_EQUIV_CLASS;
2051           break;
2052         case ':':
2053           if (syntax & RE_CHAR_CLASSES)
2054             {
2055               token->type = OP_OPEN_CHAR_CLASS;
2056               break;
2057             }
2058           /* else fall through.  */
2059         default:
2060           token->type = CHARACTER;
2061           token->opr.c = c;
2062           token_len = 1;
2063           break;
2064         }
2065       return token_len;
2066     }
2067   switch (c)
2068     {
2069     case '-':
2070       token->type = OP_CHARSET_RANGE;
2071       break;
2072     case ']':
2073       token->type = OP_CLOSE_BRACKET;
2074       break;
2075     case '^':
2076       token->type = OP_NON_MATCH_LIST;
2077       break;
2078     default:
2079       token->type = CHARACTER;
2080     }
2081   return 1;
2082 }
2083 \f
2084 /* Functions for parser.  */
2085
2086 /* Entry point of the parser.
2087    Parse the regular expression REGEXP and return the structure tree.
2088    If an error occurs, ERR is set by error code, and return NULL.
2089    This function build the following tree, from regular expression <reg_exp>:
2090            CAT
2091            / \
2092           /   \
2093    <reg_exp>  EOR
2094
2095    CAT means concatenation.
2096    EOR means end of regular expression.  */
2097
2098 static bin_tree_t *
2099 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2100        reg_errcode_t *err)
2101 {
2102   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103   bin_tree_t *tree, *eor, *root;
2104   re_token_t current_token;
2105   dfa->syntax = syntax;
2106   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2107   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2108   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2109     return NULL;
2110   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2111   if (tree != NULL)
2112     root = create_tree (dfa, tree, eor, CONCAT);
2113   else
2114     root = eor;
2115   if (BE (eor == NULL || root == NULL, 0))
2116     {
2117       *err = REG_ESPACE;
2118       return NULL;
2119     }
2120   return root;
2121 }
2122
2123 /* This function build the following tree, from regular expression
2124    <branch1>|<branch2>:
2125            ALT
2126            / \
2127           /   \
2128    <branch1> <branch2>
2129
2130    ALT means alternative, which represents the operator '|'.  */
2131
2132 static bin_tree_t *
2133 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2135 {
2136   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2137   bin_tree_t *tree, *branch = NULL;
2138   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2139   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2140     return NULL;
2141
2142   while (token->type == OP_ALT)
2143     {
2144       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2145       if (token->type != OP_ALT && token->type != END_OF_RE
2146           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2147         {
2148           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2149           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2150             {
2151               if (tree != NULL)
2152                 postorder (tree, free_tree, NULL);
2153               return NULL;
2154             }
2155         }
2156       else
2157         branch = NULL;
2158       tree = create_tree (dfa, tree, branch, OP_ALT);
2159       if (BE (tree == NULL, 0))
2160         {
2161           *err = REG_ESPACE;
2162           return NULL;
2163         }
2164     }
2165   return tree;
2166 }
2167
2168 /* This function build the following tree, from regular expression
2169    <exp1><exp2>:
2170         CAT
2171         / \
2172        /   \
2173    <exp1> <exp2>
2174
2175    CAT means concatenation.  */
2176
2177 static bin_tree_t *
2178 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2179               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2180 {
2181   bin_tree_t *tree, *exp;
2182   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2183   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2184   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2185     return NULL;
2186
2187   while (token->type != OP_ALT && token->type != END_OF_RE
2188          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2189     {
2190       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2191       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2192         {
2193           if (tree != NULL)
2194             postorder (tree, free_tree, NULL);
2195           return NULL;
2196         }
2197       if (tree != NULL && exp != NULL)
2198         {
2199           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2200           if (newtree == NULL)
2201             {
2202               postorder (exp, free_tree, NULL);
2203               postorder (tree, free_tree, NULL);
2204               *err = REG_ESPACE;
2205               return NULL;
2206             }
2207           tree = newtree;
2208         }
2209       else if (tree == NULL)
2210         tree = exp;
2211       /* Otherwise exp == NULL, we don't need to create new tree.  */
2212     }
2213   return tree;
2214 }
2215
2216 /* This function build the following tree, from regular expression a*:
2217          *
2218          |
2219          a
2220 */
2221
2222 static bin_tree_t *
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2225 {
2226   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227   bin_tree_t *tree;
2228   switch (token->type)
2229     {
2230     case CHARACTER:
2231       tree = create_token_tree (dfa, NULL, NULL, token);
2232       if (BE (tree == NULL, 0))
2233         {
2234           *err = REG_ESPACE;
2235           return NULL;
2236         }
2237 #ifdef RE_ENABLE_I18N
2238       if (dfa->mb_cur_max > 1)
2239         {
2240           while (!re_string_eoi (regexp)
2241                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242             {
2243               bin_tree_t *mbc_remain;
2244               fetch_token (token, regexp, syntax);
2245               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247               if (BE (mbc_remain == NULL || tree == NULL, 0))
2248                 {
2249                   *err = REG_ESPACE;
2250                   return NULL;
2251                 }
2252             }
2253         }
2254 #endif
2255       break;
2256     case OP_OPEN_SUBEXP:
2257       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259         return NULL;
2260       break;
2261     case OP_OPEN_BRACKET:
2262       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264         return NULL;
2265       break;
2266     case OP_BACK_REF:
2267       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268         {
2269           *err = REG_ESUBREG;
2270           return NULL;
2271         }
2272       dfa->used_bkref_map |= 1 << token->opr.idx;
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275         {
2276           *err = REG_ESPACE;
2277           return NULL;
2278         }
2279       ++dfa->nbackref;
2280       dfa->has_mb_node = 1;
2281       break;
2282     case OP_OPEN_DUP_NUM:
2283       if (syntax & RE_CONTEXT_INVALID_DUP)
2284         {
2285           *err = REG_BADRPT;
2286           return NULL;
2287         }
2288       /* FALLTHROUGH */
2289     case OP_DUP_ASTERISK:
2290     case OP_DUP_PLUS:
2291     case OP_DUP_QUESTION:
2292       if (syntax & RE_CONTEXT_INVALID_OPS)
2293         {
2294           *err = REG_BADRPT;
2295           return NULL;
2296         }
2297       else if (syntax & RE_CONTEXT_INDEP_OPS)
2298         {
2299           fetch_token (token, regexp, syntax);
2300           return parse_expression (regexp, preg, token, syntax, nest, err);
2301         }
2302       /* else fall through  */
2303     case OP_CLOSE_SUBEXP:
2304       if ((token->type == OP_CLOSE_SUBEXP) &&
2305           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306         {
2307           *err = REG_ERPAREN;
2308           return NULL;
2309         }
2310       /* else fall through  */
2311     case OP_CLOSE_DUP_NUM:
2312       /* We treat it as a normal character.  */
2313
2314       /* Then we can these characters as normal characters.  */
2315       token->type = CHARACTER;
2316       /* mb_partial and word_char bits should be initialized already
2317          by peek_token.  */
2318       tree = create_token_tree (dfa, NULL, NULL, token);
2319       if (BE (tree == NULL, 0))
2320         {
2321           *err = REG_ESPACE;
2322           return NULL;
2323         }
2324       break;
2325     case ANCHOR:
2326       if ((token->opr.ctx_type
2327            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328           && dfa->word_ops_used == 0)
2329         init_word_char (dfa);
2330       if (token->opr.ctx_type == WORD_DELIM
2331           || token->opr.ctx_type == NOT_WORD_DELIM)
2332         {
2333           bin_tree_t *tree_first, *tree_last;
2334           if (token->opr.ctx_type == WORD_DELIM)
2335             {
2336               token->opr.ctx_type = WORD_FIRST;
2337               tree_first = create_token_tree (dfa, NULL, NULL, token);
2338               token->opr.ctx_type = WORD_LAST;
2339             }
2340           else
2341             {
2342               token->opr.ctx_type = INSIDE_WORD;
2343               tree_first = create_token_tree (dfa, NULL, NULL, token);
2344               token->opr.ctx_type = INSIDE_NOTWORD;
2345             }
2346           tree_last = create_token_tree (dfa, NULL, NULL, token);
2347           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349             {
2350               *err = REG_ESPACE;
2351               return NULL;
2352             }
2353         }
2354       else
2355         {
2356           tree = create_token_tree (dfa, NULL, NULL, token);
2357           if (BE (tree == NULL, 0))
2358             {
2359               *err = REG_ESPACE;
2360               return NULL;
2361             }
2362         }
2363       /* We must return here, since ANCHORs can't be followed
2364          by repetition operators.
2365          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367       fetch_token (token, regexp, syntax);
2368       return tree;
2369     case OP_PERIOD:
2370       tree = create_token_tree (dfa, NULL, NULL, token);
2371       if (BE (tree == NULL, 0))
2372         {
2373           *err = REG_ESPACE;
2374           return NULL;
2375         }
2376       if (dfa->mb_cur_max > 1)
2377         dfa->has_mb_node = 1;
2378       break;
2379     case OP_WORD:
2380     case OP_NOTWORD:
2381       tree = build_charclass_op (dfa, regexp->trans,
2382                                  (const unsigned char *) "alnum",
2383                                  (const unsigned char *) "_",
2384                                  token->type == OP_NOTWORD, err);
2385       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386         return NULL;
2387       break;
2388     case OP_SPACE:
2389     case OP_NOTSPACE:
2390       tree = build_charclass_op (dfa, regexp->trans,
2391                                  (const unsigned char *) "space",
2392                                  (const unsigned char *) "",
2393                                  token->type == OP_NOTSPACE, err);
2394       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395         return NULL;
2396       break;
2397     case OP_ALT:
2398     case END_OF_RE:
2399       return NULL;
2400     case BACK_SLASH:
2401       *err = REG_EESCAPE;
2402       return NULL;
2403     default:
2404       /* Must not happen?  */
2405 #ifdef DEBUG
2406       assert (0);
2407 #endif
2408       return NULL;
2409     }
2410   fetch_token (token, regexp, syntax);
2411
2412   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414     {
2415       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2417         {
2418           if (tree != NULL)
2419             postorder (tree, free_tree, NULL);
2420           return NULL;
2421         }
2422       tree = dup_tree;
2423       /* In BRE consecutive duplications are not allowed.  */
2424       if ((syntax & RE_CONTEXT_INVALID_DUP)
2425           && (token->type == OP_DUP_ASTERISK
2426               || token->type == OP_OPEN_DUP_NUM))
2427         {
2428           if (tree != NULL)
2429             postorder (tree, free_tree, NULL);
2430           *err = REG_BADRPT;
2431           return NULL;
2432         }
2433     }
2434
2435   return tree;
2436 }
2437
2438 /* This function build the following tree, from regular expression
2439    (<reg_exp>):
2440          SUBEXP
2441             |
2442         <reg_exp>
2443 */
2444
2445 static bin_tree_t *
2446 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2447                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2448 {
2449   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2450   bin_tree_t *tree;
2451   size_t cur_nsub;
2452   cur_nsub = preg->re_nsub++;
2453
2454   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2455
2456   /* The subexpression may be a null string.  */
2457   if (token->type == OP_CLOSE_SUBEXP)
2458     tree = NULL;
2459   else
2460     {
2461       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2462       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2463         {
2464           if (tree != NULL)
2465             postorder (tree, free_tree, NULL);
2466           *err = REG_EPAREN;
2467         }
2468       if (BE (*err != REG_NOERROR, 0))
2469         return NULL;
2470     }
2471
2472   if (cur_nsub <= '9' - '1')
2473     dfa->completed_bkref_map |= 1 << cur_nsub;
2474
2475   tree = create_tree (dfa, tree, NULL, SUBEXP);
2476   if (BE (tree == NULL, 0))
2477     {
2478       *err = REG_ESPACE;
2479       return NULL;
2480     }
2481   tree->token.opr.idx = cur_nsub;
2482   return tree;
2483 }
2484
2485 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2486
2487 static bin_tree_t *
2488 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2489               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2490 {
2491   bin_tree_t *tree = NULL, *old_tree = NULL;
2492   int i, start, end, start_idx = re_string_cur_idx (regexp);
2493   re_token_t start_token = *token;
2494
2495   if (token->type == OP_OPEN_DUP_NUM)
2496     {
2497       end = 0;
2498       start = fetch_number (regexp, token, syntax);
2499       if (start == -1)
2500         {
2501           if (token->type == CHARACTER && token->opr.c == ',')
2502             start = 0; /* We treat "{,m}" as "{0,m}".  */
2503           else
2504             {
2505               *err = REG_BADBR; /* <re>{} is invalid.  */
2506               return NULL;
2507             }
2508         }
2509       if (BE (start != -2, 1))
2510         {
2511           /* We treat "{n}" as "{n,n}".  */
2512           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2513                  : ((token->type == CHARACTER && token->opr.c == ',')
2514                     ? fetch_number (regexp, token, syntax) : -2));
2515         }
2516       if (BE (start == -2 || end == -2, 0))
2517         {
2518           /* Invalid sequence.  */
2519           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2520             {
2521               if (token->type == END_OF_RE)
2522                 *err = REG_EBRACE;
2523               else
2524                 *err = REG_BADBR;
2525
2526               return NULL;
2527             }
2528
2529           /* If the syntax bit is set, rollback.  */
2530           re_string_set_index (regexp, start_idx);
2531           *token = start_token;
2532           token->type = CHARACTER;
2533           /* mb_partial and word_char bits should be already initialized by
2534              peek_token.  */
2535           return elem;
2536         }
2537
2538       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2539         {
2540           /* First number greater than second.  */
2541           *err = REG_BADBR;
2542           return NULL;
2543         }
2544     }
2545   else
2546     {
2547       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2548       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2549     }
2550
2551   fetch_token (token, regexp, syntax);
2552
2553   if (BE (elem == NULL, 0))
2554     return NULL;
2555   if (BE (start == 0 && end == 0, 0))
2556     {
2557       postorder (elem, free_tree, NULL);
2558       return NULL;
2559     }
2560
2561   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2562   if (BE (start > 0, 0))
2563     {
2564       tree = elem;
2565       for (i = 2; i <= start; ++i)
2566         {
2567           elem = duplicate_tree (elem, dfa);
2568           tree = create_tree (dfa, tree, elem, CONCAT);
2569           if (BE (elem == NULL || tree == NULL, 0))
2570             goto parse_dup_op_espace;
2571         }
2572
2573       if (start == end)
2574         return tree;
2575
2576       /* Duplicate ELEM before it is marked optional.  */
2577       elem = duplicate_tree (elem, dfa);
2578       if (BE (elem == NULL, 0))
2579         goto parse_dup_op_espace;
2580       old_tree = tree;
2581     }
2582   else
2583     old_tree = NULL;
2584
2585   if (elem->token.type == SUBEXP)
2586     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2587
2588   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2589   if (BE (tree == NULL, 0))
2590     goto parse_dup_op_espace;
2591
2592   /* This loop is actually executed only when end != -1,
2593      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2594      already created the start+1-th copy.  */
2595   for (i = start + 2; i <= end; ++i)
2596     {
2597       elem = duplicate_tree (elem, dfa);
2598       tree = create_tree (dfa, tree, elem, CONCAT);
2599       if (BE (elem == NULL || tree == NULL, 0))
2600         goto parse_dup_op_espace;
2601
2602       tree = create_tree (dfa, tree, NULL, OP_ALT);
2603       if (BE (tree == NULL, 0))
2604         goto parse_dup_op_espace;
2605     }
2606
2607   if (old_tree)
2608     tree = create_tree (dfa, old_tree, tree, CONCAT);
2609
2610   return tree;
2611
2612  parse_dup_op_espace:
2613   *err = REG_ESPACE;
2614   return NULL;
2615 }
2616
2617 /* Size of the names for collating symbol/equivalence_class/character_class.
2618    I'm not sure, but maybe enough.  */
2619 #define BRACKET_NAME_BUF_SIZE 32
2620
2621 #ifndef _LIBC
2622   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2623      Build the range expression which starts from START_ELEM, and ends
2624      at END_ELEM.  The result are written to MBCSET and SBCSET.
2625      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2626      mbcset->range_ends, is a pointer argument since we may
2627      update it.  */
2628
2629 static reg_errcode_t
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635                  bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2637 {
2638   unsigned int start_ch, end_ch;
2639   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2640   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642           0))
2643     return REG_ERANGE;
2644
2645   /* We can handle no multi character collating elements without libc
2646      support.  */
2647   if (BE ((start_elem->type == COLL_SYM
2648            && strlen ((char *) start_elem->opr.name) > 1)
2649           || (end_elem->type == COLL_SYM
2650               && strlen ((char *) end_elem->opr.name) > 1), 0))
2651     return REG_ECOLLATE;
2652
2653 # ifdef RE_ENABLE_I18N
2654   {
2655     wchar_t wc;
2656     wint_t start_wc;
2657     wint_t end_wc;
2658     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659
2660     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662                    : 0));
2663     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665                  : 0));
2666     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667                 ? __btowc (start_ch) : start_elem->opr.wch);
2668     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669               ? __btowc (end_ch) : end_elem->opr.wch);
2670     if (start_wc == WEOF || end_wc == WEOF)
2671       return REG_ECOLLATE;
2672     cmp_buf[0] = start_wc;
2673     cmp_buf[4] = end_wc;
2674     if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2675       return REG_ERANGE;
2676
2677     /* Got valid collation sequence values, add them as a new entry.
2678        However, for !_LIBC we have no collation elements: if the
2679        character set is single byte, the single byte character set
2680        that we build below suffices.  parse_bracket_exp passes
2681        no MBCSET if dfa->mb_cur_max == 1.  */
2682     if (mbcset)
2683       {
2684         /* Check the space of the arrays.  */
2685         if (BE (*range_alloc == mbcset->nranges, 0))
2686           {
2687             /* There is not enough space, need realloc.  */
2688             wchar_t *new_array_start, *new_array_end;
2689             int new_nranges;
2690
2691             /* +1 in case of mbcset->nranges is 0.  */
2692             new_nranges = 2 * mbcset->nranges + 1;
2693             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2694                are NULL if *range_alloc == 0.  */
2695             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2696                                           new_nranges);
2697             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2698                                         new_nranges);
2699
2700             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2701               return REG_ESPACE;
2702
2703             mbcset->range_starts = new_array_start;
2704             mbcset->range_ends = new_array_end;
2705             *range_alloc = new_nranges;
2706           }
2707
2708         mbcset->range_starts[mbcset->nranges] = start_wc;
2709         mbcset->range_ends[mbcset->nranges++] = end_wc;
2710       }
2711
2712     /* Build the table for single byte characters.  */
2713     for (wc = 0; wc < SBC_MAX; ++wc)
2714       {
2715         cmp_buf[2] = wc;
2716         if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2717             && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2718           bitset_set (sbcset, wc);
2719       }
2720   }
2721 # else /* not RE_ENABLE_I18N */
2722   {
2723     unsigned int ch;
2724     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2725                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726                    : 0));
2727     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2728               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729                  : 0));
2730     if (start_ch > end_ch)
2731       return REG_ERANGE;
2732     /* Build the table for single byte characters.  */
2733     for (ch = 0; ch < SBC_MAX; ++ch)
2734       if (start_ch <= ch  && ch <= end_ch)
2735         bitset_set (sbcset, ch);
2736   }
2737 # endif /* not RE_ENABLE_I18N */
2738   return REG_NOERROR;
2739 }
2740 #endif /* not _LIBC */
2741
2742 #ifndef _LIBC
2743 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2744    Build the collating element which is represented by NAME.
2745    The result are written to MBCSET and SBCSET.
2746    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2747    pointer argument since we may update it.  */
2748
2749 static reg_errcode_t
2750 # ifdef RE_ENABLE_I18N
2751 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2752                         int *coll_sym_alloc, const unsigned char *name)
2753 # else /* not RE_ENABLE_I18N */
2754 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2755 # endif /* not RE_ENABLE_I18N */
2756 {
2757   size_t name_len = strlen ((const char *) name);
2758   if (BE (name_len != 1, 0))
2759     return REG_ECOLLATE;
2760   else
2761     {
2762       bitset_set (sbcset, name[0]);
2763       return REG_NOERROR;
2764     }
2765 }
2766 #endif /* not _LIBC */
2767
2768 /* This function parse bracket expression like "[abc]", "[a-c]",
2769    "[[.a-a.]]" etc.  */
2770
2771 static bin_tree_t *
2772 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2773                    reg_syntax_t syntax, reg_errcode_t *err)
2774 {
2775 #ifdef _LIBC
2776   const unsigned char *collseqmb;
2777   const char *collseqwc;
2778   uint32_t nrules;
2779   int32_t table_size;
2780   const int32_t *symb_table;
2781   const unsigned char *extra;
2782
2783   /* Local function for parse_bracket_exp used in _LIBC environment.
2784      Seek the collating symbol entry corresponding to NAME.
2785      Return the index of the symbol in the SYMB_TABLE,
2786      or -1 if not found.  */
2787
2788   auto inline int32_t
2789   __attribute__ ((always_inline))
2790   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2791     {
2792       int32_t elem;
2793
2794       for (elem = 0; elem < table_size; elem++)
2795         if (symb_table[2 * elem] != 0)
2796           {
2797             int32_t idx = symb_table[2 * elem + 1];
2798             /* Skip the name of collating element name.  */
2799             idx += 1 + extra[idx];
2800             if (/* Compare the length of the name.  */
2801                 name_len == extra[idx]
2802                 /* Compare the name.  */
2803                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2804               /* Yep, this is the entry.  */
2805               return elem;
2806           }
2807       return -1;
2808     }
2809
2810   /* Local function for parse_bracket_exp used in _LIBC environment.
2811      Look up the collation sequence value of BR_ELEM.
2812      Return the value if succeeded, UINT_MAX otherwise.  */
2813
2814   auto inline unsigned int
2815   __attribute__ ((always_inline))
2816   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2817     {
2818       if (br_elem->type == SB_CHAR)
2819         {
2820           /*
2821           if (MB_CUR_MAX == 1)
2822           */
2823           if (nrules == 0)
2824             return collseqmb[br_elem->opr.ch];
2825           else
2826             {
2827               wint_t wc = __btowc (br_elem->opr.ch);
2828               return __collseq_table_lookup (collseqwc, wc);
2829             }
2830         }
2831       else if (br_elem->type == MB_CHAR)
2832         {
2833           if (nrules != 0)
2834             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2835         }
2836       else if (br_elem->type == COLL_SYM)
2837         {
2838           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2839           if (nrules != 0)
2840             {
2841               int32_t elem, idx;
2842               elem = seek_collating_symbol_entry (br_elem->opr.name,
2843                                                   sym_name_len);
2844               if (elem != -1)
2845                 {
2846                   /* We found the entry.  */
2847                   idx = symb_table[2 * elem + 1];
2848                   /* Skip the name of collating element name.  */
2849                   idx += 1 + extra[idx];
2850                   /* Skip the byte sequence of the collating element.  */
2851                   idx += 1 + extra[idx];
2852                   /* Adjust for the alignment.  */
2853                   idx = (idx + 3) & ~3;
2854                   /* Skip the multibyte collation sequence value.  */
2855                   idx += sizeof (unsigned int);
2856                   /* Skip the wide char sequence of the collating element.  */
2857                   idx += sizeof (unsigned int) *
2858                     (1 + *(unsigned int *) (extra + idx));
2859                   /* Return the collation sequence value.  */
2860                   return *(unsigned int *) (extra + idx);
2861                 }
2862               else if (sym_name_len == 1)
2863                 {
2864                   /* No valid character.  Match it as a single byte
2865                      character.  */
2866                   return collseqmb[br_elem->opr.name[0]];
2867                 }
2868             }
2869           else if (sym_name_len == 1)
2870             return collseqmb[br_elem->opr.name[0]];
2871         }
2872       return UINT_MAX;
2873     }
2874
2875   /* Local function for parse_bracket_exp used in _LIBC environment.
2876      Build the range expression which starts from START_ELEM, and ends
2877      at END_ELEM.  The result are written to MBCSET and SBCSET.
2878      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2879      mbcset->range_ends, is a pointer argument since we may
2880      update it.  */
2881
2882   auto inline reg_errcode_t
2883   __attribute__ ((always_inline))
2884   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2885                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2886     {
2887       unsigned int ch;
2888       uint32_t start_collseq;
2889       uint32_t end_collseq;
2890
2891       /* Equivalence Classes and Character Classes can't be a range
2892          start/end.  */
2893       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2894               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2895               0))
2896         return REG_ERANGE;
2897
2898       start_collseq = lookup_collation_sequence_value (start_elem);
2899       end_collseq = lookup_collation_sequence_value (end_elem);
2900       /* Check start/end collation sequence values.  */
2901       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2902         return REG_ECOLLATE;
2903       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2904         return REG_ERANGE;
2905
2906       /* Got valid collation sequence values, add them as a new entry.
2907          However, if we have no collation elements, and the character set
2908          is single byte, the single byte character set that we
2909          build below suffices. */
2910       if (nrules > 0 || dfa->mb_cur_max > 1)
2911         {
2912           /* Check the space of the arrays.  */
2913           if (BE (*range_alloc == mbcset->nranges, 0))
2914             {
2915               /* There is not enough space, need realloc.  */
2916               uint32_t *new_array_start;
2917               uint32_t *new_array_end;
2918               int new_nranges;
2919
2920               /* +1 in case of mbcset->nranges is 0.  */
2921               new_nranges = 2 * mbcset->nranges + 1;
2922               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2923                                             new_nranges);
2924               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2925                                           new_nranges);
2926
2927               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2928                 return REG_ESPACE;
2929
2930               mbcset->range_starts = new_array_start;
2931               mbcset->range_ends = new_array_end;
2932               *range_alloc = new_nranges;
2933             }
2934
2935           mbcset->range_starts[mbcset->nranges] = start_collseq;
2936           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2937         }
2938
2939       /* Build the table for single byte characters.  */
2940       for (ch = 0; ch < SBC_MAX; ch++)
2941         {
2942           uint32_t ch_collseq;
2943           /*
2944           if (MB_CUR_MAX == 1)
2945           */
2946           if (nrules == 0)
2947             ch_collseq = collseqmb[ch];
2948           else
2949             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2950           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2951             bitset_set (sbcset, ch);
2952         }
2953       return REG_NOERROR;
2954     }
2955
2956   /* Local function for parse_bracket_exp used in _LIBC environment.
2957      Build the collating element which is represented by NAME.
2958      The result are written to MBCSET and SBCSET.
2959      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2960      pointer argument since we may update it.  */
2961
2962   auto inline reg_errcode_t
2963   __attribute__ ((always_inline))
2964   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2965                           int *coll_sym_alloc, const unsigned char *name)
2966     {
2967       int32_t elem, idx;
2968       size_t name_len = strlen ((const char *) name);
2969       if (nrules != 0)
2970         {
2971           elem = seek_collating_symbol_entry (name, name_len);
2972           if (elem != -1)
2973             {
2974               /* We found the entry.  */
2975               idx = symb_table[2 * elem + 1];
2976               /* Skip the name of collating element name.  */
2977               idx += 1 + extra[idx];
2978             }
2979           else if (name_len == 1)
2980             {
2981               /* No valid character, treat it as a normal
2982                  character.  */
2983               bitset_set (sbcset, name[0]);
2984               return REG_NOERROR;
2985             }
2986           else
2987             return REG_ECOLLATE;
2988
2989           /* Got valid collation sequence, add it as a new entry.  */
2990           /* Check the space of the arrays.  */
2991           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2992             {
2993               /* Not enough, realloc it.  */
2994               /* +1 in case of mbcset->ncoll_syms is 0.  */
2995               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2996               /* Use realloc since mbcset->coll_syms is NULL
2997                  if *alloc == 0.  */
2998               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2999                                                    new_coll_sym_alloc);
3000               if (BE (new_coll_syms == NULL, 0))
3001                 return REG_ESPACE;
3002               mbcset->coll_syms = new_coll_syms;
3003               *coll_sym_alloc = new_coll_sym_alloc;
3004             }
3005           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3006           return REG_NOERROR;
3007         }
3008       else
3009         {
3010           if (BE (name_len != 1, 0))
3011             return REG_ECOLLATE;
3012           else
3013             {
3014               bitset_set (sbcset, name[0]);
3015               return REG_NOERROR;
3016             }
3017         }
3018     }
3019 #endif
3020
3021   re_token_t br_token;
3022   re_bitset_ptr_t sbcset;
3023 #ifdef RE_ENABLE_I18N
3024   re_charset_t *mbcset;
3025   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3026   int equiv_class_alloc = 0, char_class_alloc = 0;
3027 #endif /* not RE_ENABLE_I18N */
3028   int non_match = 0;
3029   bin_tree_t *work_tree;
3030   int token_len;
3031   int first_round = 1;
3032 #ifdef _LIBC
3033   collseqmb = (const unsigned char *)
3034     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036   if (nrules)
3037     {
3038       /*
3039       if (MB_CUR_MAX > 1)
3040       */
3041       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3044                                                   _NL_COLLATE_SYMB_TABLEMB);
3045       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3046                                                    _NL_COLLATE_SYMB_EXTRAMB);
3047     }
3048 #endif
3049   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3050 #ifdef RE_ENABLE_I18N
3051   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3052 #endif /* RE_ENABLE_I18N */
3053 #ifdef RE_ENABLE_I18N
3054   if (BE (sbcset == NULL || mbcset == NULL, 0))
3055 #else
3056   if (BE (sbcset == NULL, 0))
3057 #endif /* RE_ENABLE_I18N */
3058     {
3059       re_free (sbcset);
3060 #ifdef RE_ENABLE_I18N
3061       re_free (mbcset);
3062 #endif
3063       *err = REG_ESPACE;
3064       return NULL;
3065     }
3066
3067   token_len = peek_token_bracket (token, regexp, syntax);
3068   if (BE (token->type == END_OF_RE, 0))
3069     {
3070       *err = REG_BADPAT;
3071       goto parse_bracket_exp_free_return;
3072     }
3073   if (token->type == OP_NON_MATCH_LIST)
3074     {
3075 #ifdef RE_ENABLE_I18N
3076       mbcset->non_match = 1;
3077 #endif /* not RE_ENABLE_I18N */
3078       non_match = 1;
3079       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3080         bitset_set (sbcset, '\n');
3081       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3082       token_len = peek_token_bracket (token, regexp, syntax);
3083       if (BE (token->type == END_OF_RE, 0))
3084         {
3085           *err = REG_BADPAT;
3086           goto parse_bracket_exp_free_return;
3087         }
3088     }
3089
3090   /* We treat the first ']' as a normal character.  */
3091   if (token->type == OP_CLOSE_BRACKET)
3092     token->type = CHARACTER;
3093
3094   while (1)
3095     {
3096       bracket_elem_t start_elem, end_elem;
3097       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3098       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3099       reg_errcode_t ret;
3100       int token_len2 = 0, is_range_exp = 0;
3101       re_token_t token2;
3102
3103       start_elem.opr.name = start_name_buf;
3104       start_elem.type = COLL_SYM;
3105       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106                                    syntax, first_round);
3107       if (BE (ret != REG_NOERROR, 0))
3108         {
3109           *err = ret;
3110           goto parse_bracket_exp_free_return;
3111         }
3112       first_round = 0;
3113
3114       /* Get information about the next token.  We need it in any case.  */
3115       token_len = peek_token_bracket (token, regexp, syntax);
3116
3117       /* Do not check for ranges if we know they are not allowed.  */
3118       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119         {
3120           if (BE (token->type == END_OF_RE, 0))
3121             {
3122               *err = REG_EBRACK;
3123               goto parse_bracket_exp_free_return;
3124             }
3125           if (token->type == OP_CHARSET_RANGE)
3126             {
3127               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3128               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129               if (BE (token2.type == END_OF_RE, 0))
3130                 {
3131                   *err = REG_EBRACK;
3132                   goto parse_bracket_exp_free_return;
3133                 }
3134               if (token2.type == OP_CLOSE_BRACKET)
3135                 {
3136                   /* We treat the last '-' as a normal character.  */
3137                   re_string_skip_bytes (regexp, -token_len);
3138                   token->type = CHARACTER;
3139                 }
3140               else
3141                 is_range_exp = 1;
3142             }
3143         }
3144
3145       if (is_range_exp == 1)
3146         {
3147           end_elem.opr.name = end_name_buf;
3148           end_elem.type = COLL_SYM;
3149           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150                                        dfa, syntax, 1);
3151           if (BE (ret != REG_NOERROR, 0))
3152             {
3153               *err = ret;
3154               goto parse_bracket_exp_free_return;
3155             }
3156
3157           token_len = peek_token_bracket (token, regexp, syntax);
3158
3159 #ifdef _LIBC
3160           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3161                                   &start_elem, &end_elem);
3162 #else
3163 # ifdef RE_ENABLE_I18N
3164           *err = build_range_exp (sbcset,
3165                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3166                                   &range_alloc, &start_elem, &end_elem);
3167 # else
3168           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3169 # endif
3170 #endif /* RE_ENABLE_I18N */
3171           if (BE (*err != REG_NOERROR, 0))
3172             goto parse_bracket_exp_free_return;
3173         }
3174       else
3175         {
3176           switch (start_elem.type)
3177             {
3178             case SB_CHAR:
3179               bitset_set (sbcset, start_elem.opr.ch);
3180               break;
3181 #ifdef RE_ENABLE_I18N
3182             case MB_CHAR:
3183               /* Check whether the array has enough space.  */
3184               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3185                 {
3186                   wchar_t *new_mbchars;
3187                   /* Not enough, realloc it.  */
3188                   /* +1 in case of mbcset->nmbchars is 0.  */
3189                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3190                   /* Use realloc since array is NULL if *alloc == 0.  */
3191                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3192                                             mbchar_alloc);
3193                   if (BE (new_mbchars == NULL, 0))
3194                     goto parse_bracket_exp_espace;
3195                   mbcset->mbchars = new_mbchars;
3196                 }
3197               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3198               break;
3199 #endif /* RE_ENABLE_I18N */
3200             case EQUIV_CLASS:
3201               *err = build_equiv_class (sbcset,
3202 #ifdef RE_ENABLE_I18N
3203                                         mbcset, &equiv_class_alloc,
3204 #endif /* RE_ENABLE_I18N */
3205                                         start_elem.opr.name);
3206               if (BE (*err != REG_NOERROR, 0))
3207                 goto parse_bracket_exp_free_return;
3208               break;
3209             case COLL_SYM:
3210               *err = build_collating_symbol (sbcset,
3211 #ifdef RE_ENABLE_I18N
3212                                              mbcset, &coll_sym_alloc,
3213 #endif /* RE_ENABLE_I18N */
3214                                              start_elem.opr.name);
3215               if (BE (*err != REG_NOERROR, 0))
3216                 goto parse_bracket_exp_free_return;
3217               break;
3218             case CHAR_CLASS:
3219               *err = build_charclass (regexp->trans, sbcset,
3220 #ifdef RE_ENABLE_I18N
3221                                       mbcset, &char_class_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223                                       start_elem.opr.name, syntax);
3224               if (BE (*err != REG_NOERROR, 0))
3225                goto parse_bracket_exp_free_return;
3226               break;
3227             default:
3228               assert (0);
3229               break;
3230             }
3231         }
3232       if (BE (token->type == END_OF_RE, 0))
3233         {
3234           *err = REG_EBRACK;
3235           goto parse_bracket_exp_free_return;
3236         }
3237       if (token->type == OP_CLOSE_BRACKET)
3238         break;
3239     }
3240
3241   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3242
3243   /* If it is non-matching list.  */
3244   if (non_match)
3245     bitset_not (sbcset);
3246
3247 #ifdef RE_ENABLE_I18N
3248   /* Ensure only single byte characters are set.  */
3249   if (dfa->mb_cur_max > 1)
3250     bitset_mask (sbcset, dfa->sb_char);
3251
3252   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3253       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3254                                                      || mbcset->non_match)))
3255     {
3256       bin_tree_t *mbc_tree;
3257       int sbc_idx;
3258       /* Build a tree for complex bracket.  */
3259       dfa->has_mb_node = 1;
3260       br_token.type = COMPLEX_BRACKET;
3261       br_token.opr.mbcset = mbcset;
3262       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3263       if (BE (mbc_tree == NULL, 0))
3264         goto parse_bracket_exp_espace;
3265       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3266         if (sbcset[sbc_idx])
3267           break;
3268       /* If there are no bits set in sbcset, there is no point
3269          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3270       if (sbc_idx < BITSET_WORDS)
3271         {
3272           /* Build a tree for simple bracket.  */
3273           br_token.type = SIMPLE_BRACKET;
3274           br_token.opr.sbcset = sbcset;
3275           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3276           if (BE (work_tree == NULL, 0))
3277             goto parse_bracket_exp_espace;
3278
3279           /* Then join them by ALT node.  */
3280           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3281           if (BE (work_tree == NULL, 0))
3282             goto parse_bracket_exp_espace;
3283         }
3284       else
3285         {
3286           re_free (sbcset);
3287           work_tree = mbc_tree;
3288         }
3289     }
3290   else
3291 #endif /* not RE_ENABLE_I18N */
3292     {
3293 #ifdef RE_ENABLE_I18N
3294       free_charset (mbcset);
3295 #endif
3296       /* Build a tree for simple bracket.  */
3297       br_token.type = SIMPLE_BRACKET;
3298       br_token.opr.sbcset = sbcset;
3299       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3300       if (BE (work_tree == NULL, 0))
3301         goto parse_bracket_exp_espace;
3302     }
3303   return work_tree;
3304
3305  parse_bracket_exp_espace:
3306   *err = REG_ESPACE;
3307  parse_bracket_exp_free_return:
3308   re_free (sbcset);
3309 #ifdef RE_ENABLE_I18N
3310   free_charset (mbcset);
3311 #endif /* RE_ENABLE_I18N */
3312   return NULL;
3313 }
3314
3315 /* Parse an element in the bracket expression.  */
3316
3317 static reg_errcode_t
3318 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3319                        re_token_t *token, int token_len, re_dfa_t *dfa,
3320                        reg_syntax_t syntax, int accept_hyphen)
3321 {
3322 #ifdef RE_ENABLE_I18N
3323   int cur_char_size;
3324   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3325   if (cur_char_size > 1)
3326     {
3327       elem->type = MB_CHAR;
3328       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3329       re_string_skip_bytes (regexp, cur_char_size);
3330       return REG_NOERROR;
3331     }
3332 #endif /* RE_ENABLE_I18N */
3333   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3334   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3335       || token->type == OP_OPEN_EQUIV_CLASS)
3336     return parse_bracket_symbol (elem, regexp, token);
3337   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3338     {
3339       /* A '-' must only appear as anything but a range indicator before
3340          the closing bracket.  Everything else is an error.  */
3341       re_token_t token2;
3342       (void) peek_token_bracket (&token2, regexp, syntax);
3343       if (token2.type != OP_CLOSE_BRACKET)
3344         /* The actual error value is not standardized since this whole
3345            case is undefined.  But ERANGE makes good sense.  */
3346         return REG_ERANGE;
3347     }
3348   elem->type = SB_CHAR;
3349   elem->opr.ch = token->opr.c;
3350   return REG_NOERROR;
3351 }
3352
3353 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3354    such as [:<character_class>:], [.<collating_element>.], and
3355    [=<equivalent_class>=].  */
3356
3357 static reg_errcode_t
3358 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3359                       re_token_t *token)
3360 {
3361   unsigned char ch, delim = token->opr.c;
3362   int i = 0;
3363   if (re_string_eoi(regexp))
3364     return REG_EBRACK;
3365   for (;; ++i)
3366     {
3367       if (i >= BRACKET_NAME_BUF_SIZE)
3368         return REG_EBRACK;
3369       if (token->type == OP_OPEN_CHAR_CLASS)
3370         ch = re_string_fetch_byte_case (regexp);
3371       else
3372         ch = re_string_fetch_byte (regexp);
3373       if (re_string_eoi(regexp))
3374         return REG_EBRACK;
3375       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3376         break;
3377       elem->opr.name[i] = ch;
3378     }
3379   re_string_skip_bytes (regexp, 1);
3380   elem->opr.name[i] = '\0';
3381   switch (token->type)
3382     {
3383     case OP_OPEN_COLL_ELEM:
3384       elem->type = COLL_SYM;
3385       break;
3386     case OP_OPEN_EQUIV_CLASS:
3387       elem->type = EQUIV_CLASS;
3388       break;
3389     case OP_OPEN_CHAR_CLASS:
3390       elem->type = CHAR_CLASS;
3391       break;
3392     default:
3393       break;
3394     }
3395   return REG_NOERROR;
3396 }
3397
3398   /* Helper function for parse_bracket_exp.
3399      Build the equivalence class which is represented by NAME.
3400      The result are written to MBCSET and SBCSET.
3401      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3402      is a pointer argument since we may update it.  */
3403
3404 static reg_errcode_t
3405 #ifdef RE_ENABLE_I18N
3406 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3407                    int *equiv_class_alloc, const unsigned char *name)
3408 #else /* not RE_ENABLE_I18N */
3409 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3410 #endif /* not RE_ENABLE_I18N */
3411 {
3412 #ifdef _LIBC
3413   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3414   if (nrules != 0)
3415     {
3416       const int32_t *table, *indirect;
3417       const unsigned char *weights, *extra, *cp;
3418       unsigned char char_buf[2];
3419       int32_t idx1, idx2;
3420       unsigned int ch;
3421       size_t len;
3422       /* Calculate the index for equivalence class.  */
3423       cp = name;
3424       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3425       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3426                                                _NL_COLLATE_WEIGHTMB);
3427       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3428                                                    _NL_COLLATE_EXTRAMB);
3429       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3430                                                 _NL_COLLATE_INDIRECTMB);
3431       idx1 = findidx (table, indirect, extra, &cp, -1);
3432       if (BE (idx1 == 0 || *cp != '\0', 0))
3433         /* This isn't a valid character.  */
3434         return REG_ECOLLATE;
3435
3436       /* Build single byte matching table for this equivalence class.  */
3437       len = weights[idx1 & 0xffffff];
3438       for (ch = 0; ch < SBC_MAX; ++ch)
3439         {
3440           char_buf[0] = ch;
3441           cp = char_buf;
3442           idx2 = findidx (table, indirect, extra, &cp, 1);
3443 /*
3444           idx2 = table[ch];
3445 */
3446           if (idx2 == 0)
3447             /* This isn't a valid character.  */
3448             continue;
3449           /* Compare only if the length matches and the collation rule
3450              index is the same.  */
3451           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3452             {
3453               int cnt = 0;
3454
3455               while (cnt <= len &&
3456                      weights[(idx1 & 0xffffff) + 1 + cnt]
3457                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3458                 ++cnt;
3459
3460               if (cnt > len)
3461                 bitset_set (sbcset, ch);
3462             }
3463         }
3464       /* Check whether the array has enough space.  */
3465       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3466         {
3467           /* Not enough, realloc it.  */
3468           /* +1 in case of mbcset->nequiv_classes is 0.  */
3469           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3470           /* Use realloc since the array is NULL if *alloc == 0.  */
3471           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3472                                                    int32_t,
3473                                                    new_equiv_class_alloc);
3474           if (BE (new_equiv_classes == NULL, 0))
3475             return REG_ESPACE;
3476           mbcset->equiv_classes = new_equiv_classes;
3477           *equiv_class_alloc = new_equiv_class_alloc;
3478         }
3479       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3480     }
3481   else
3482 #endif /* _LIBC */
3483     {
3484       if (BE (strlen ((const char *) name) != 1, 0))
3485         return REG_ECOLLATE;
3486       bitset_set (sbcset, *name);
3487     }
3488   return REG_NOERROR;
3489 }
3490
3491   /* Helper function for parse_bracket_exp.
3492      Build the character class which is represented by NAME.
3493      The result are written to MBCSET and SBCSET.
3494      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3495      is a pointer argument since we may update it.  */
3496
3497 static reg_errcode_t
3498 #ifdef RE_ENABLE_I18N
3499 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3500                  re_charset_t *mbcset, int *char_class_alloc,
3501                  const unsigned char *class_name, reg_syntax_t syntax)
3502 #else /* not RE_ENABLE_I18N */
3503 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3504                  const unsigned char *class_name, reg_syntax_t syntax)
3505 #endif /* not RE_ENABLE_I18N */
3506 {
3507   int i;
3508   const char *name = (const char *) class_name;
3509
3510   /* In case of REG_ICASE "upper" and "lower" match the both of
3511      upper and lower cases.  */
3512   if ((syntax & RE_ICASE)
3513       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3514     name = "alpha";
3515
3516 #ifdef RE_ENABLE_I18N
3517   /* Check the space of the arrays.  */
3518   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3519     {
3520       /* Not enough, realloc it.  */
3521       /* +1 in case of mbcset->nchar_classes is 0.  */
3522       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3523       /* Use realloc since array is NULL if *alloc == 0.  */
3524       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3525                                                new_char_class_alloc);
3526       if (BE (new_char_classes == NULL, 0))
3527         return REG_ESPACE;
3528       mbcset->char_classes = new_char_classes;
3529       *char_class_alloc = new_char_class_alloc;
3530     }
3531   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3532 #endif /* RE_ENABLE_I18N */
3533
3534 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3535   do {                                          \
3536     if (BE (trans != NULL, 0))                  \
3537       {                                         \
3538         for (i = 0; i < SBC_MAX; ++i)           \
3539           if (ctype_func (i))                   \
3540             bitset_set (sbcset, trans[i]);      \
3541       }                                         \
3542     else                                        \
3543       {                                         \
3544         for (i = 0; i < SBC_MAX; ++i)           \
3545           if (ctype_func (i))                   \
3546             bitset_set (sbcset, i);             \
3547       }                                         \
3548   } while (0)
3549
3550   if (strcmp (name, "alnum") == 0)
3551     BUILD_CHARCLASS_LOOP (isalnum);
3552   else if (strcmp (name, "cntrl") == 0)
3553     BUILD_CHARCLASS_LOOP (iscntrl);
3554   else if (strcmp (name, "lower") == 0)
3555     BUILD_CHARCLASS_LOOP (islower);
3556   else if (strcmp (name, "space") == 0)
3557     BUILD_CHARCLASS_LOOP (isspace);
3558   else if (strcmp (name, "alpha") == 0)
3559     BUILD_CHARCLASS_LOOP (isalpha);
3560   else if (strcmp (name, "digit") == 0)
3561     BUILD_CHARCLASS_LOOP (isdigit);
3562   else if (strcmp (name, "print") == 0)
3563     BUILD_CHARCLASS_LOOP (isprint);
3564   else if (strcmp (name, "upper") == 0)
3565     BUILD_CHARCLASS_LOOP (isupper);
3566   else if (strcmp (name, "blank") == 0)
3567     BUILD_CHARCLASS_LOOP (isblank);
3568   else if (strcmp (name, "graph") == 0)
3569     BUILD_CHARCLASS_LOOP (isgraph);
3570   else if (strcmp (name, "punct") == 0)
3571     BUILD_CHARCLASS_LOOP (ispunct);
3572   else if (strcmp (name, "xdigit") == 0)
3573     BUILD_CHARCLASS_LOOP (isxdigit);
3574   else
3575     return REG_ECTYPE;
3576
3577   return REG_NOERROR;
3578 }
3579
3580 static bin_tree_t *
3581 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3582                     const unsigned char *class_name,
3583                     const unsigned char *extra, int non_match,
3584                     reg_errcode_t *err)
3585 {
3586   re_bitset_ptr_t sbcset;
3587 #ifdef RE_ENABLE_I18N
3588   re_charset_t *mbcset;
3589   int alloc = 0;
3590 #endif /* not RE_ENABLE_I18N */
3591   reg_errcode_t ret;
3592   re_token_t br_token;
3593   bin_tree_t *tree;
3594
3595   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3596 #ifdef RE_ENABLE_I18N
3597   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3598 #endif /* RE_ENABLE_I18N */
3599
3600 #ifdef RE_ENABLE_I18N
3601   if (BE (sbcset == NULL || mbcset == NULL, 0))
3602 #else /* not RE_ENABLE_I18N */
3603   if (BE (sbcset == NULL, 0))
3604 #endif /* not RE_ENABLE_I18N */
3605     {
3606       *err = REG_ESPACE;
3607       return NULL;
3608     }
3609
3610   if (non_match)
3611     {
3612 #ifdef RE_ENABLE_I18N
3613       mbcset->non_match = 1;
3614 #endif /* not RE_ENABLE_I18N */
3615     }
3616
3617   /* We don't care the syntax in this case.  */
3618   ret = build_charclass (trans, sbcset,
3619 #ifdef RE_ENABLE_I18N
3620                          mbcset, &alloc,
3621 #endif /* RE_ENABLE_I18N */
3622                          class_name, 0);
3623
3624   if (BE (ret != REG_NOERROR, 0))
3625     {
3626       re_free (sbcset);
3627 #ifdef RE_ENABLE_I18N
3628       free_charset (mbcset);
3629 #endif /* RE_ENABLE_I18N */
3630       *err = ret;
3631       return NULL;
3632     }
3633   /* \w match '_' also.  */
3634   for (; *extra; extra++)
3635     bitset_set (sbcset, *extra);
3636
3637   /* If it is non-matching list.  */
3638   if (non_match)
3639     bitset_not (sbcset);
3640
3641 #ifdef RE_ENABLE_I18N
3642   /* Ensure only single byte characters are set.  */
3643   if (dfa->mb_cur_max > 1)
3644     bitset_mask (sbcset, dfa->sb_char);
3645 #endif
3646
3647   /* Build a tree for simple bracket.  */
3648   br_token.type = SIMPLE_BRACKET;
3649   br_token.opr.sbcset = sbcset;
3650   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3651   if (BE (tree == NULL, 0))
3652     goto build_word_op_espace;
3653
3654 #ifdef RE_ENABLE_I18N
3655   if (dfa->mb_cur_max > 1)
3656     {
3657       bin_tree_t *mbc_tree;
3658       /* Build a tree for complex bracket.  */
3659       br_token.type = COMPLEX_BRACKET;
3660       br_token.opr.mbcset = mbcset;
3661       dfa->has_mb_node = 1;
3662       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3663       if (BE (mbc_tree == NULL, 0))
3664         goto build_word_op_espace;
3665       /* Then join them by ALT node.  */
3666       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3667       if (BE (mbc_tree != NULL, 1))
3668         return tree;
3669     }
3670   else
3671     {
3672       free_charset (mbcset);
3673       return tree;
3674     }
3675 #else /* not RE_ENABLE_I18N */
3676   return tree;
3677 #endif /* not RE_ENABLE_I18N */
3678
3679  build_word_op_espace:
3680   re_free (sbcset);
3681 #ifdef RE_ENABLE_I18N
3682   free_charset (mbcset);
3683 #endif /* RE_ENABLE_I18N */
3684   *err = REG_ESPACE;
3685   return NULL;
3686 }
3687
3688 /* This is intended for the expressions like "a{1,3}".
3689    Fetch a number from `input', and return the number.
3690    Return -1, if the number field is empty like "{,1}".
3691    Return -2, If an error is occured.  */
3692
3693 static int
3694 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3695 {
3696   int num = -1;
3697   unsigned char c;
3698   while (1)
3699     {
3700       fetch_token (token, input, syntax);
3701       c = token->opr.c;
3702       if (BE (token->type == END_OF_RE, 0))
3703         return -2;
3704       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3705         break;
3706       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3707              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3708       num = (num > RE_DUP_MAX) ? -2 : num;
3709     }
3710   return num;
3711 }
3712 \f
#ifdef RE_ENABLE_I18N
/* Free the arrays referenced by CSET and then CSET itself.  The
   collation-related arrays are members only when built with _LIBC.  */
static void
free_charset (re_charset_t *cset)
{
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->char_classes);
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3728 \f
3729 /* Functions for binary tree operation.  */
3730
3731 /* Create a tree node.  */
3732
3733 static bin_tree_t *
3734 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3735              re_token_type_t type)
3736 {
3737   re_token_t t;
3738   t.type = type;
3739   return create_token_tree (dfa, left, right, &t);
3740 }
3741
3742 static bin_tree_t *
3743 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3744                    const re_token_t *token)
3745 {
3746   bin_tree_t *tree;
3747   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3748     {
3749       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3750
3751       if (storage == NULL)
3752         return NULL;
3753       storage->next = dfa->str_tree_storage;
3754       dfa->str_tree_storage = storage;
3755       dfa->str_tree_storage_idx = 0;
3756     }
3757   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3758
3759   tree->parent = NULL;
3760   tree->left = left;
3761   tree->right = right;
3762   tree->token = *token;
3763   tree->token.duplicated = 0;
3764   tree->token.opt_subexp = 0;
3765   tree->first = NULL;
3766   tree->next = NULL;
3767   tree->node_idx = -1;
3768
3769   if (left != NULL)
3770     left->parent = tree;
3771   if (right != NULL)
3772     right->parent = tree;
3773   return tree;
3774 }
3775
3776 /* Mark the tree SRC as an optional subexpression.
3777    To be called from preorder or postorder.  */
3778
3779 static reg_errcode_t
3780 mark_opt_subexp (void *extra, bin_tree_t *node)
3781 {
3782   int idx = (int) (long) extra;
3783   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3784     node->token.opt_subexp = 1;
3785
3786   return REG_NOERROR;
3787 }
3788
3789 /* Free the allocated memory inside NODE. */
3790
3791 static void
3792 free_token (re_token_t *node)
3793 {
3794 #ifdef RE_ENABLE_I18N
3795   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3796     free_charset (node->opr.mbcset);
3797   else
3798 #endif /* RE_ENABLE_I18N */
3799     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3800       re_free (node->opr.sbcset);
3801 }
3802
3803 /* Worker function for tree walking.  Free the allocated memory inside NODE
3804    and its children. */
3805
3806 static reg_errcode_t
3807 free_tree (void *extra, bin_tree_t *node)
3808 {
3809   free_token (&node->token);
3810   return REG_NOERROR;
3811 }
3812
3813
3814 /* Duplicate the node SRC, and return new node.  This is a preorder
3815    visit similar to the one implemented by the generic visitor, but
3816    we need more infrastructure to maintain two parallel trees --- so,
3817    it's easier to duplicate.  */
3818
3819 static bin_tree_t *
3820 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3821 {
3822   const bin_tree_t *node;
3823   bin_tree_t *dup_root;
3824   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3825
3826   for (node = root; ; )
3827     {
3828       /* Create a new tree and link it back to the current parent.  */
3829       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3830       if (*p_new == NULL)
3831         return NULL;
3832       (*p_new)->parent = dup_node;
3833       (*p_new)->token.duplicated = 1;
3834       dup_node = *p_new;
3835
3836       /* Go to the left node, or up and to the right.  */
3837       if (node->left)
3838         {
3839           node = node->left;
3840           p_new = &dup_node->left;
3841         }
3842       else
3843         {
3844           const bin_tree_t *prev = NULL;
3845           while (node->right == prev || node->right == NULL)
3846             {
3847               prev = node;
3848               node = node->parent;
3849               dup_node = dup_node->parent;
3850               if (!node)
3851                 return dup_root;
3852             }
3853           node = node->right;
3854           p_new = &dup_node->right;
3855         }
3856     }
3857 }