posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 #ifdef _LIBC
  23 # include <locale/weight.h>
  24 #endif
  25
  26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  27                                           size_t length, reg_syntax_t syntax);
  28 static void re_compile_fastmap_iter (regex_t *bufp,
  29                                      const re_dfastate_t *init_state,
  30                                      char *fastmap);
  31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  32 #ifdef RE_ENABLE_I18N
  33 static void free_charset (re_charset_t *cset);
  34 #endif /* RE_ENABLE_I18N */
  35 static void free_workarea_compile (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 #ifdef RE_ENABLE_I18N
  38 static void optimize_utf8 (re_dfa_t *dfa);
  39 #endif
  40 static reg_errcode_t analyze (regex_t *preg);
  41 static reg_errcode_t preorder (bin_tree_t *root,
  42                                reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                void *extra);
  44 static reg_errcode_t postorder (bin_tree_t *root,
  45                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  46                                 void *extra);
  47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  50                                  bin_tree_t *node);
  51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static int peek_token (re_token_t *token, re_string_t *input,
  64                         reg_syntax_t syntax) internal_function;
  65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  66                           reg_syntax_t syntax, reg_errcode_t *err);
  67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  68                                   re_token_t *token, reg_syntax_t syntax,
  69                                   int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  71                                  re_token_t *token, reg_syntax_t syntax,
  72                                  int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  74                                      re_token_t *token, reg_syntax_t syntax,
  75                                      int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  77                                   re_token_t *token, reg_syntax_t syntax,
  78                                   int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  80                                  re_dfa_t *dfa, re_token_t *token,
  81                                  reg_syntax_t syntax, reg_errcode_t *err);
  82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  83                                       re_token_t *token, reg_syntax_t syntax,
  84                                       reg_errcode_t *err);
  85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  86                                             re_string_t *regexp,
  87                                             re_token_t *token, int token_len,
  88                                             re_dfa_t *dfa,
  89                                             reg_syntax_t syntax,
  90                                             int accept_hyphen);
  91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  92                                           re_string_t *regexp,
  93                                           re_token_t *token);
  94 #ifdef RE_ENABLE_I18N
  95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  96                                         re_charset_t *mbcset,
  97                                         int *equiv_class_alloc,
  98                                         const unsigned char *name);
  99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 100                                       bitset_t sbcset,
 101                                       re_charset_t *mbcset,
 102                                       int *char_class_alloc,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #else  /* not RE_ENABLE_I18N */
 106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 107                                         const unsigned char *name);
 108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 109                                       bitset_t sbcset,
 110                                       const unsigned char *class_name,
 111                                       reg_syntax_t syntax);
 112 #endif /* not RE_ENABLE_I18N */
 113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 114                                        RE_TRANSLATE_TYPE trans,
 115                                        const unsigned char *class_name,
 116                                        const unsigned char *extra,
 117                                        int non_match, reg_errcode_t *err);
 118 static bin_tree_t *create_tree (re_dfa_t *dfa,
 119                                 bin_tree_t *left, bin_tree_t *right,
 120                                 re_token_type_t type);
 121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 122                                       bin_tree_t *left, bin_tree_t *right,
 123                                       const re_token_t *token);
 124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 125 static void free_token (re_token_t *node);
 126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 128 \f
 129 /* This table gives an error message for each of the error codes listed
 130    in regex.h.  Obviously the order here has to be same as there.
 131    POSIX doesn't require that we do anything for REG_NOERROR,
 132    but why not be nice?  */
 133
 134 const char __re_error_msgid[] attribute_hidden =
 135   {
 136 #define REG_NOERROR_IDX 0
 137     gettext_noop ("Success")    /* REG_NOERROR */
 138     "\0"
 139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 140     gettext_noop ("No match")   /* REG_NOMATCH */
 141     "\0"
 142 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 143     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 144     "\0"
 145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 146     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 147     "\0"
 148 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 149     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 150     "\0"
 151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 152     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 153     "\0"
 154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 155     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 156     "\0"
 157 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 158     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 159     "\0"
 160 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 161     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 162     "\0"
 163 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 164     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 165     "\0"
 166 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 167     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 168     "\0"
 169 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 170     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 171     "\0"
 172 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 173     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 174     "\0"
 175 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 176     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 177     "\0"
 178 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 179     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 180     "\0"
 181 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 182     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 183     "\0"
 184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 185     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 186   };
 187
 188 const size_t __re_error_msgid_idx[] attribute_hidden =
 189   {
 190     REG_NOERROR_IDX,
 191     REG_NOMATCH_IDX,
 192     REG_BADPAT_IDX,
 193     REG_ECOLLATE_IDX,
 194     REG_ECTYPE_IDX,
 195     REG_EESCAPE_IDX,
 196     REG_ESUBREG_IDX,
 197     REG_EBRACK_IDX,
 198     REG_EPAREN_IDX,
 199     REG_EBRACE_IDX,
 200     REG_BADBR_IDX,
 201     REG_ERANGE_IDX,
 202     REG_ESPACE_IDX,
 203     REG_BADRPT_IDX,
 204     REG_EEND_IDX,
 205     REG_ESIZE_IDX,
 206     REG_ERPAREN_IDX
 207   };
 208 \f
 209 /* Entry points for GNU code.  */
 210
 211 /* re_compile_pattern is the GNU regular expression compiler: it
 212    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 213    Returns 0 if the pattern was valid, otherwise an error string.
 214
 215    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 216    are set in BUFP on entry.  */
 217
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 {
 222   reg_errcode_t ret;
 223
 224   /* And GNU code determines whether or not to get register information
 225      by passing null for the REGS argument to re_match, etc., not by
 226      setting no_sub, unless RE_NO_SUB is set.  */
 227   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 228
 229   /* Match anchors at newline.  */
 230   bufp->newline_anchor = 1;
 231
 232   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 233
 234   if (!ret)
 235     return NULL;
 236   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 237 }
 238 #ifdef _LIBC
 239 weak_alias (__re_compile_pattern, re_compile_pattern)
 240 #endif
 241
 242 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 243    also be assigned to arbitrarily: each pattern buffer stores its own
 244    syntax, so it can be changed between regex compilations.  */
 245 /* This has no initializer because initialized variables in Emacs
 246    become read-only after dumping.  */
 247 reg_syntax_t re_syntax_options;
 248
 249
 250 /* Specify the precise syntax of regexps for compilation.  This provides
 251    for compatibility for various utilities which historically have
 252    different, incompatible syntaxes.
 253
 254    The argument SYNTAX is a bit mask comprised of the various bits
 255    defined in regex.h.  We return the old syntax.  */
 256
 257 reg_syntax_t
 258 re_set_syntax (reg_syntax_t syntax)
 259 {
 260   reg_syntax_t ret = re_syntax_options;
 261
 262   re_syntax_options = syntax;
 263   return ret;
 264 }
 265 #ifdef _LIBC
 266 weak_alias (__re_set_syntax, re_set_syntax)
 267 #endif
 268
 269 int
 270 re_compile_fastmap (struct re_pattern_buffer *bufp)
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 __attribute__ ((always_inline))
 292 re_set_fastmap (char *fastmap, bool icase, int ch)
 293 {
 294   fastmap[ch] = 1;
 295   if (icase)
 296     fastmap[tolower (ch)] = 1;
 297 }
 298
 299 /* Helper function for re_compile_fastmap.
 300    Compile fastmap for the initial_state INIT_STATE.  */
 301
 302 static void
 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 304                          char *fastmap)
 305 {
 306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 307   int node_cnt;
 308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 310     {
 311       int node = init_state->nodes.elems[node_cnt];
 312       re_token_type_t type = dfa->nodes[node].type;
 313
 314       if (type == CHARACTER)
 315         {
 316           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317 #ifdef RE_ENABLE_I18N
 318           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 319             {
 320               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 321               wchar_t wc;
 322               mbstate_t state;
 323
 324               p = buf;
 325               *p++ = dfa->nodes[node].opr.c;
 326               while (++node < dfa->nodes_len
 327                      && dfa->nodes[node].type == CHARACTER
 328                      && dfa->nodes[node].mb_partial)
 329                 *p++ = dfa->nodes[node].opr.c;
 330               memset (&state, '\0', sizeof (state));
 331               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 332                              &state) == p - buf
 333                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 334                       != (size_t) -1))
 335                 re_set_fastmap (fastmap, 0, buf[0]);
 336             }
 337 #endif
 338         }
 339       else if (type == SIMPLE_BRACKET)
 340         {
 341           int i, ch;
 342           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 343             {
 344               int j;
 345               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 346               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 347                 if (w & ((bitset_word_t) 1 << j))
 348                   re_set_fastmap (fastmap, icase, ch);
 349             }
 350         }
 351 #ifdef RE_ENABLE_I18N
 352       else if (type == COMPLEX_BRACKET)
 353         {
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           int i;
 356
 357 # ifdef _LIBC
 358           /* See if we have to try all bytes which start multiple collation
 359              elements.
 360              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 361                   collation element, and don't catch 'b' since 'b' is
 362                   the only collation element which starts from 'b' (and
 363                   it is caught by SIMPLE_BRACKET).  */
 364               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 365                   && (cset->ncoll_syms || cset->nranges))
 366                 {
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # endif /* _LIBC */
 374
 375           /* See if we have to start the match at all multibyte characters,
 376              i.e. where we would not find an invalid sequence.  This only
 377              applies to multibyte character sets; for single byte character
 378              sets, the SIMPLE_BRACKET again suffices.  */
 379           if (dfa->mb_cur_max > 1
 380               && (cset->nchar_classes || cset->non_match || cset->nranges
 381 # ifdef _LIBC
 382                   || cset->nequiv_classes
 383 # endif /* _LIBC */
 384                  ))
 385             {
 386               unsigned char c = 0;
 387               do
 388                 {
 389                   mbstate_t mbs;
 390                   memset (&mbs, 0, sizeof (mbs));
 391                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 392                     re_set_fastmap (fastmap, false, (int) c);
 393                 }
 394               while (++c != 0);
 395             }
 396
 397           else
 398             {
 399               /* ... Else catch all bytes which can start the mbchars.  */
 400               for (i = 0; i < cset->nmbchars; ++i)
 401                 {
 402                   char buf[256];
 403                   mbstate_t state;
 404                   memset (&state, '\0', sizeof (state));
 405                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                     {
 409                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 410                           != (size_t) -1)
 411                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 412                     }
 413                 }
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == OP_PERIOD
 418 #ifdef RE_ENABLE_I18N
 419                || type == OP_UTF8_PERIOD
 420 #endif /* RE_ENABLE_I18N */
 421                || type == END_OF_RE)
 422         {
 423           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 424           if (type == END_OF_RE)
 425             bufp->can_be_null = 1;
 426           return;
 427         }
 428     }
 429 }
 430 \f
 431 /* Entry point for POSIX code.  */
 432 /* regcomp takes a regular expression as a string and compiles it.
 433
 434    PREG is a regex_t *.  We do not expect any fields to be initialized,
 435    since POSIX says we shouldn't.  Thus, we set
 436
 437      'buffer' to the compiled pattern;
 438      'used' to the length of the compiled pattern;
 439      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 441        RE_SYNTAX_POSIX_BASIC;
 442      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 443      'fastmap' to an allocated space for the fastmap;
 444      'fastmap_accurate' to zero;
 445      're_nsub' to the number of subexpressions in PATTERN.
 446
 447    PATTERN is the address of the pattern string.
 448
 449    CFLAGS is a series of bits which affect compilation.
 450
 451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 452      use POSIX basic syntax.
 453
 454      If REG_NEWLINE is set, then . and [^...] don't match newline.
 455      Also, regexec will try a match beginning after every newline.
 456
 457      If REG_ICASE is set, then we considers upper- and lowercase
 458      versions of letters to be equivalent when matching.
 459
 460      If REG_NOSUB is set, then when PREG is passed to regexec, that
 461      routine will report only success or failure, and nothing about the
 462      registers.
 463
 464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 465    the return codes and their meanings.)  */
 466
 467 int
 468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 469 {
 470   reg_errcode_t ret;
 471   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 472                          : RE_SYNTAX_POSIX_BASIC);
 473
 474   preg->buffer = NULL;
 475   preg->allocated = 0;
 476   preg->used = 0;
 477
 478   /* Try to allocate space for the fastmap.  */
 479   preg->fastmap = re_malloc (char, SBC_MAX);
 480   if (BE (preg->fastmap == NULL, 0))
 481     return REG_ESPACE;
 482
 483   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 484
 485   /* If REG_NEWLINE is set, newlines are treated differently.  */
 486   if (cflags & REG_NEWLINE)
 487     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 488       syntax &= ~RE_DOT_NEWLINE;
 489       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 490       /* It also changes the matching behavior.  */
 491       preg->newline_anchor = 1;
 492     }
 493   else
 494     preg->newline_anchor = 0;
 495   preg->no_sub = !!(cflags & REG_NOSUB);
 496   preg->translate = NULL;
 497
 498   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 499
 500   /* POSIX doesn't distinguish between an unmatched open-group and an
 501      unmatched close-group: both are REG_EPAREN.  */
 502   if (ret == REG_ERPAREN)
 503     ret = REG_EPAREN;
 504
 505   /* We have already checked preg->fastmap != NULL.  */
 506   if (BE (ret == REG_NOERROR, 1))
 507     /* Compute the fastmap now, since regexec cannot modify the pattern
 508        buffer.  This function never fails in this implementation.  */
 509     (void) re_compile_fastmap (preg);
 510   else
 511     {
 512       /* Some error occurred while compiling the expression.  */
 513       re_free (preg->fastmap);
 514       preg->fastmap = NULL;
 515     }
 516
 517   return (int) ret;
 518 }
 519 #ifdef _LIBC
 520 weak_alias (__regcomp, regcomp)
 521 #endif
 522
 523 /* Returns a message corresponding to an error code, ERRCODE, returned
 524    from either regcomp or regexec.   We don't use PREG here.  */
 525
 526 size_t
 527 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 528           size_t errbuf_size)
 529 {
 530   const char *msg;
 531   size_t msg_size;
 532
 533   if (BE (errcode < 0
 534           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 535                                / sizeof (__re_error_msgid_idx[0])), 0))
 536     /* Only error codes returned by the rest of the code should be passed
 537        to this routine.  If we are given anything else, or if other regex
 538        code generates an invalid error code, then the program has a bug.
 539        Dump core so we can fix it.  */
 540     abort ();
 541
 542   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 543
 544   msg_size = strlen (msg) + 1; /* Includes the null.  */
 545
 546   if (BE (errbuf_size != 0, 1))
 547     {
 548       if (BE (msg_size > errbuf_size, 0))
 549         {
 550 #if defined HAVE_MEMPCPY || defined _LIBC
 551           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 552 #else
 553           memcpy (errbuf, msg, errbuf_size - 1);
 554           errbuf[errbuf_size - 1] = 0;
 555 #endif
 556         }
 557       else
 558         memcpy (errbuf, msg, msg_size);
 559     }
 560
 561   return msg_size;
 562 }
 563 #ifdef _LIBC
 564 weak_alias (__regerror, regerror)
 565 #endif
 566
 567
 568 #ifdef RE_ENABLE_I18N
 569 /* This static array is used for the map to single-byte characters when
 570    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 571    it the same all the time.  UTF-8 is the preferred encoding so this is
 572    a worthwhile optimization.  */
 573 static const bitset_t utf8_sb_map =
 574 {
 575   /* Set the first 128 bits.  */
 576   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 577 };
 578 #endif
 579
 580
 581 static void
 582 free_dfa_content (re_dfa_t *dfa)
 583 {
 584   int i, j;
 585
 586   if (dfa->nodes)
 587     for (i = 0; i < dfa->nodes_len; ++i)
 588       free_token (dfa->nodes + i);
 589   re_free (dfa->nexts);
 590   for (i = 0; i < dfa->nodes_len; ++i)
 591     {
 592       if (dfa->eclosures != NULL)
 593         re_node_set_free (dfa->eclosures + i);
 594       if (dfa->inveclosures != NULL)
 595         re_node_set_free (dfa->inveclosures + i);
 596       if (dfa->edests != NULL)
 597         re_node_set_free (dfa->edests + i);
 598     }
 599   re_free (dfa->edests);
 600   re_free (dfa->eclosures);
 601   re_free (dfa->inveclosures);
 602   re_free (dfa->nodes);
 603
 604   if (dfa->state_table)
 605     for (i = 0; i <= dfa->state_hash_mask; ++i)
 606       {
 607         struct re_state_table_entry *entry = dfa->state_table + i;
 608         for (j = 0; j < entry->num; ++j)
 609           {
 610             re_dfastate_t *state = entry->array[j];
 611             free_state (state);
 612           }
 613         re_free (entry->array);
 614       }
 615   re_free (dfa->state_table);
 616 #ifdef RE_ENABLE_I18N
 617   if (dfa->sb_char != utf8_sb_map)
 618     re_free (dfa->sb_char);
 619 #endif
 620   re_free (dfa->subexp_map);
 621 #ifdef DEBUG
 622   re_free (dfa->re_str);
 623 #endif
 624
 625   re_free (dfa);
 626 }
 627
 628
 629 /* Free dynamically allocated space used by PREG.  */
 630
 631 void
 632 regfree (regex_t *preg)
 633 {
 634   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 635   if (BE (dfa != NULL, 1))
 636     free_dfa_content (dfa);
 637   preg->buffer = NULL;
 638   preg->allocated = 0;
 639
 640   re_free (preg->fastmap);
 641   preg->fastmap = NULL;
 642
 643   re_free (preg->translate);
 644   preg->translate = NULL;
 645 }
 646 #ifdef _LIBC
 647 weak_alias (__regfree, regfree)
 648 #endif
 649 \f
 650 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 651    them unless specifically requested.  */
 652
 653 #if defined _REGEX_RE_COMP || defined _LIBC
 654
 655 /* BSD has one and only one pattern buffer.  */
 656 static struct re_pattern_buffer re_comp_buf;
 657
 658 char *
 659 # ifdef _LIBC
 660 /* Make these definitions weak in libc, so POSIX programs can redefine
 661    these names if they don't use our functions, and still use
 662    regcomp/regexec above without link errors.  */
 663 weak_function
 664 # endif
 665 re_comp (const char *s)
 666 {
 667   reg_errcode_t ret;
 668   char *fastmap;
 669
 670   if (!s)
 671     {
 672       if (!re_comp_buf.buffer)
 673         return gettext ("No previous regular expression");
 674       return 0;
 675     }
 676
 677   if (re_comp_buf.buffer)
 678     {
 679       fastmap = re_comp_buf.fastmap;
 680       re_comp_buf.fastmap = NULL;
 681       __regfree (&re_comp_buf);
 682       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 683       re_comp_buf.fastmap = fastmap;
 684     }
 685
 686   if (re_comp_buf.fastmap == NULL)
 687     {
 688       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 689       if (re_comp_buf.fastmap == NULL)
 690         return (char *) gettext (__re_error_msgid
 691                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 692     }
 693
 694   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 695      don't need to initialize the pattern buffer fields which affect it.  */
 696
 697   /* Match anchors at newlines.  */
 698   re_comp_buf.newline_anchor = 1;
 699
 700   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 701
 702   if (!ret)
 703     return NULL;
 704
 705   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 706   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 707 }
 708
 709 #ifdef _LIBC
 710 libc_freeres_fn (free_mem)
 711 {
 712   __regfree (&re_comp_buf);
 713 }
 714 #endif
 715
 716 #endif /* _REGEX_RE_COMP */
 717 \f
 718 /* Internal entry point.
 719    Compile the regular expression PATTERN, whose length is LENGTH.
 720    SYNTAX indicate regular expression's syntax.  */
 721
 722 static reg_errcode_t
 723 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 724                      reg_syntax_t syntax)
 725 {
 726   reg_errcode_t err = REG_NOERROR;
 727   re_dfa_t *dfa;
 728   re_string_t regexp;
 729
 730   /* Initialize the pattern buffer.  */
 731   preg->fastmap_accurate = 0;
 732   preg->syntax = syntax;
 733   preg->not_bol = preg->not_eol = 0;
 734   preg->used = 0;
 735   preg->re_nsub = 0;
 736   preg->can_be_null = 0;
 737   preg->regs_allocated = REGS_UNALLOCATED;
 738
 739   /* Initialize the dfa.  */
 740   dfa = (re_dfa_t *) preg->buffer;
 741   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 742     {
 743       /* If zero allocated, but buffer is non-null, try to realloc
 744          enough space.  This loses if buffer's address is bogus, but
 745          that is the user's responsibility.  If ->buffer is NULL this
 746          is a simple allocation.  */
 747       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 748       if (dfa == NULL)
 749         return REG_ESPACE;
 750       preg->allocated = sizeof (re_dfa_t);
 751       preg->buffer = (unsigned char *) dfa;
 752     }
 753   preg->used = sizeof (re_dfa_t);
 754
 755   err = init_dfa (dfa, length);
 756   if (BE (err != REG_NOERROR, 0))
 757     {
 758       free_dfa_content (dfa);
 759       preg->buffer = NULL;
 760       preg->allocated = 0;
 761       return err;
 762     }
 763 #ifdef DEBUG
 764   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 765   dfa->re_str = re_malloc (char, length + 1);
 766   strncpy (dfa->re_str, pattern, length + 1);
 767 #endif
 768
 769   __libc_lock_init (dfa->lock);
 770
 771   err = re_string_construct (&regexp, pattern, length, preg->translate,
 772                              syntax & RE_ICASE, dfa);
 773   if (BE (err != REG_NOERROR, 0))
 774     {
 775     re_compile_internal_free_return:
 776       free_workarea_compile (preg);
 777       re_string_destruct (&regexp);
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783
 784   /* Parse the regular expression, and build a structure tree.  */
 785   preg->re_nsub = 0;
 786   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 787   if (BE (dfa->str_tree == NULL, 0))
 788     goto re_compile_internal_free_return;
 789
 790   /* Analyze the tree and create the nfa.  */
 791   err = analyze (preg);
 792   if (BE (err != REG_NOERROR, 0))
 793     goto re_compile_internal_free_return;
 794
 795 #ifdef RE_ENABLE_I18N
 796   /* If possible, do searching in single byte encoding to speed things up.  */
 797   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 798     optimize_utf8 (dfa);
 799 #endif
 800
 801   /* Then create the initial state of the dfa.  */
 802   err = create_initial_state (dfa);
 803
 804   /* Release work areas.  */
 805   free_workarea_compile (preg);
 806   re_string_destruct (&regexp);
 807
 808   if (BE (err != REG_NOERROR, 0))
 809     {
 810       free_dfa_content (dfa);
 811       preg->buffer = NULL;
 812       preg->allocated = 0;
 813     }
 814
 815   return err;
 816 }
 817
 818 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 819    as the initial length of some arrays.  */
 820
 821 static reg_errcode_t
 822 init_dfa (re_dfa_t *dfa, size_t pat_len)
 823 {
 824   unsigned int table_size;
 825 #ifndef _LIBC
 826   char *codeset_name;
 827 #endif
 828
 829   memset (dfa, '\0', sizeof (re_dfa_t));
 830
 831   /* Force allocation of str_tree_storage the first time.  */
 832   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 833
 834   /* Avoid overflows.  */
 835   if (pat_len == SIZE_MAX)
 836     return REG_ESPACE;
 837
 838   dfa->nodes_alloc = pat_len + 1;
 839   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 840
 841   /*  table_size = 2 ^ ceil(log pat_len) */
 842   for (table_size = 1; ; table_size <<= 1)
 843     if (table_size > pat_len)
 844       break;
 845
 846   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 847   dfa->state_hash_mask = table_size - 1;
 848
 849   dfa->mb_cur_max = MB_CUR_MAX;
 850 #ifdef _LIBC
 851   if (dfa->mb_cur_max == 6
 852       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 853     dfa->is_utf8 = 1;
 854   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 855                        != 0);
 856 #else
 857 # ifdef HAVE_LANGINFO_CODESET
 858   codeset_name = nl_langinfo (CODESET);
 859 # else
 860   codeset_name = getenv ("LC_ALL");
 861   if (codeset_name == NULL || codeset_name[0] == '\0')
 862     codeset_name = getenv ("LC_CTYPE");
 863   if (codeset_name == NULL || codeset_name[0] == '\0')
 864     codeset_name = getenv ("LANG");
 865   if (codeset_name == NULL)
 866     codeset_name = "";
 867   else if (strchr (codeset_name, '.') !=  NULL)
 868     codeset_name = strchr (codeset_name, '.') + 1;
 869 # endif
 870
 871   if (strcasecmp (codeset_name, "UTF-8") == 0
 872       || strcasecmp (codeset_name, "UTF8") == 0)
 873     dfa->is_utf8 = 1;
 874
 875   /* We check exhaustively in the loop below if this charset is a
 876      superset of ASCII.  */
 877   dfa->map_notascii = 0;
 878 #endif
 879
 880 #ifdef RE_ENABLE_I18N
 881   if (dfa->mb_cur_max > 1)
 882     {
 883       if (dfa->is_utf8)
 884         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 885       else
 886         {
 887           int i, j, ch;
 888
 889           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 890           if (BE (dfa->sb_char == NULL, 0))
 891             return REG_ESPACE;
 892
 893           /* Set the bits corresponding to single byte chars.  */
 894           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 895             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 896               {
 897                 wint_t wch = __btowc (ch);
 898                 if (wch != WEOF)
 899                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 900 # ifndef _LIBC
 901                 if (isascii (ch) && wch != ch)
 902                   dfa->map_notascii = 1;
 903 # endif
 904               }
 905         }
 906     }
 907 #endif
 908
 909   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 910     return REG_ESPACE;
 911   return REG_NOERROR;
 912 }
 913
 914 /* Initialize WORD_CHAR table, which indicate which character is
 915    "word".  In this case "word" means that it is the word construction
 916    character used by some operators like "\<", "\>", etc.  */
 917
 918 static void
 919 internal_function
 920 init_word_char (re_dfa_t *dfa)
 921 {
 922   dfa->word_ops_used = 1;
 923   int i = 0;
 924   int ch = 0;
 925   if (BE (dfa->map_notascii == 0, 1))
 926     {
 927       if (sizeof (dfa->word_char[0]) == 8)
 928         {
 929           /* The extra temporaries here avoid "implicitly truncated"
 930              warnings in the case when this is dead code, i.e. 32-bit.  */
 931           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
 932           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
 933           dfa->word_char[0] = wc0;
 934           dfa->word_char[1] = wc1;
 935           i = 2;
 936         }
 937       else if (sizeof (dfa->word_char[0]) == 4)
 938         {
 939           dfa->word_char[0] = UINT32_C (0x00000000);
 940           dfa->word_char[1] = UINT32_C (0x03ff0000);
 941           dfa->word_char[2] = UINT32_C (0x87fffffe);
 942           dfa->word_char[3] = UINT32_C (0x07fffffe);
 943           i = 4;
 944         }
 945       else
 946         abort ();
 947       ch = 128;
 948
 949       if (BE (dfa->is_utf8, 1))
 950         {
 951           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 952           return;
 953         }
 954     }
 955
 956   for (; i < BITSET_WORDS; ++i)
 957     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 958       if (isalnum (ch) || ch == '_')
 959         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 960 }
 961
 962 /* Free the work area which are only used while compiling.  */
 963
 964 static void
 965 free_workarea_compile (regex_t *preg)
 966 {
 967   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 968   bin_tree_storage_t *storage, *next;
 969   for (storage = dfa->str_tree_storage; storage; storage = next)
 970     {
 971       next = storage->next;
 972       re_free (storage);
 973     }
 974   dfa->str_tree_storage = NULL;
 975   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 976   dfa->str_tree = NULL;
 977   re_free (dfa->org_indices);
 978   dfa->org_indices = NULL;
 979 }
 980
 981 /* Create initial states for all contexts.  */
 982
 983 static reg_errcode_t
 984 create_initial_state (re_dfa_t *dfa)
 985 {
 986   int first, i;
 987   reg_errcode_t err;
 988   re_node_set init_nodes;
 989
 990   /* Initial states have the epsilon closure of the node which is
 991      the first node of the regular expression.  */
 992   first = dfa->str_tree->first->node_idx;
 993   dfa->init_node = first;
 994   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 995   if (BE (err != REG_NOERROR, 0))
 996     return err;
 997
 998   /* The back-references which are in initial states can epsilon transit,
 999      since in this case all of the subexpressions can be null.
1000      Then we add epsilon closures of the nodes which are the next nodes of
1001      the back-references.  */
1002   if (dfa->nbackref > 0)
1003     for (i = 0; i < init_nodes.nelem; ++i)
1004       {
1005         int node_idx = init_nodes.elems[i];
1006         re_token_type_t type = dfa->nodes[node_idx].type;
1007
1008         int clexp_idx;
1009         if (type != OP_BACK_REF)
1010           continue;
1011         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1012           {
1013             re_token_t *clexp_node;
1014             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1015             if (clexp_node->type == OP_CLOSE_SUBEXP
1016                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1017               break;
1018           }
1019         if (clexp_idx == init_nodes.nelem)
1020           continue;
1021
1022         if (type == OP_BACK_REF)
1023           {
1024             int dest_idx = dfa->edests[node_idx].elems[0];
1025             if (!re_node_set_contains (&init_nodes, dest_idx))
1026               {
1027                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1028                                                        dfa->eclosures
1029                                                        + dest_idx);
1030                 if (err != REG_NOERROR)
1031                   return err;
1032                 i = 0;
1033               }
1034           }
1035       }
1036
1037   /* It must be the first time to invoke acquire_state.  */
1038   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1039   /* We don't check ERR here, since the initial state must not be NULL.  */
1040   if (BE (dfa->init_state == NULL, 0))
1041     return err;
1042   if (dfa->init_state->has_constraint)
1043     {
1044       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1045                                                        CONTEXT_WORD);
1046       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1047                                                      CONTEXT_NEWLINE);
1048       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1049                                                          &init_nodes,
1050                                                          CONTEXT_NEWLINE
1051                                                          | CONTEXT_BEGBUF);
1052       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1053               || dfa->init_state_begbuf == NULL, 0))
1054         return err;
1055     }
1056   else
1057     dfa->init_state_word = dfa->init_state_nl
1058       = dfa->init_state_begbuf = dfa->init_state;
1059
1060   re_node_set_free (&init_nodes);
1061   return REG_NOERROR;
1062 }
1063 \f
1064 #ifdef RE_ENABLE_I18N
1065 /* If it is possible to do searching in single byte encoding instead of UTF-8
1066    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1067    DFA nodes where needed.  */
1068
1069 static void
1070 optimize_utf8 (re_dfa_t *dfa)
1071 {
1072   int node, i, mb_chars = 0, has_period = 0;
1073
1074   for (node = 0; node < dfa->nodes_len; ++node)
1075     switch (dfa->nodes[node].type)
1076       {
1077       case CHARACTER:
1078         if (dfa->nodes[node].opr.c >= 0x80)
1079           mb_chars = 1;
1080         break;
1081       case ANCHOR:
1082         switch (dfa->nodes[node].opr.ctx_type)
1083           {
1084           case LINE_FIRST:
1085           case LINE_LAST:
1086           case BUF_FIRST:
1087           case BUF_LAST:
1088             break;
1089           default:
1090             /* Word anchors etc. cannot be handled.  It's okay to test
1091                opr.ctx_type since constraints (for all DFA nodes) are
1092                created by ORing one or more opr.ctx_type values.  */
1093             return;
1094           }
1095         break;
1096       case OP_PERIOD:
1097         has_period = 1;
1098         break;
1099       case OP_BACK_REF:
1100       case OP_ALT:
1101       case END_OF_RE:
1102       case OP_DUP_ASTERISK:
1103       case OP_OPEN_SUBEXP:
1104       case OP_CLOSE_SUBEXP:
1105         break;
1106       case COMPLEX_BRACKET:
1107         return;
1108       case SIMPLE_BRACKET:
1109         /* Just double check.  The non-ASCII range starts at 0x80.  */
1110         assert (0x80 % BITSET_WORD_BITS == 0);
1111         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1112           if (dfa->nodes[node].opr.sbcset[i])
1113             return;
1114         break;
1115       default:
1116         abort ();
1117       }
1118
1119   if (mb_chars || has_period)
1120     for (node = 0; node < dfa->nodes_len; ++node)
1121       {
1122         if (dfa->nodes[node].type == CHARACTER
1123             && dfa->nodes[node].opr.c >= 0x80)
1124           dfa->nodes[node].mb_partial = 0;
1125         else if (dfa->nodes[node].type == OP_PERIOD)
1126           dfa->nodes[node].type = OP_UTF8_PERIOD;
1127       }
1128
1129   /* The search can be in single byte locale.  */
1130   dfa->mb_cur_max = 1;
1131   dfa->is_utf8 = 0;
1132   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1133 }
1134 #endif
1135 \f
1136 /* Analyze the structure tree, and calculate "first", "next", "edest",
1137    "eclosure", and "inveclosure".  */
1138
1139 static reg_errcode_t
1140 analyze (regex_t *preg)
1141 {
1142   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1143   reg_errcode_t ret;
1144
1145   /* Allocate arrays.  */
1146   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1147   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1148   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1149   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1150   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1151           || dfa->eclosures == NULL, 0))
1152     return REG_ESPACE;
1153
1154   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1155   if (dfa->subexp_map != NULL)
1156     {
1157       int i;
1158       for (i = 0; i < preg->re_nsub; i++)
1159         dfa->subexp_map[i] = i;
1160       preorder (dfa->str_tree, optimize_subexps, dfa);
1161       for (i = 0; i < preg->re_nsub; i++)
1162         if (dfa->subexp_map[i] != i)
1163           break;
1164       if (i == preg->re_nsub)
1165         {
1166           free (dfa->subexp_map);
1167           dfa->subexp_map = NULL;
1168         }
1169     }
1170
1171   ret = postorder (dfa->str_tree, lower_subexps, preg);
1172   if (BE (ret != REG_NOERROR, 0))
1173     return ret;
1174   ret = postorder (dfa->str_tree, calc_first, dfa);
1175   if (BE (ret != REG_NOERROR, 0))
1176     return ret;
1177   preorder (dfa->str_tree, calc_next, dfa);
1178   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181   ret = calc_eclosure (dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184
1185   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1186      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1187   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188       || dfa->nbackref)
1189     {
1190       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1191       if (BE (dfa->inveclosures == NULL, 0))
1192         return REG_ESPACE;
1193       ret = calc_inveclosure (dfa);
1194     }
1195
1196   return ret;
1197 }
1198
1199 /* Our parse trees are very unbalanced, so we cannot use a stack to
1200    implement parse tree visits.  Instead, we use parent pointers and
1201    some hairy code in these two functions.  */
1202 static reg_errcode_t
1203 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1204            void *extra)
1205 {
1206   bin_tree_t *node, *prev;
1207
1208   for (node = root; ; )
1209     {
1210       /* Descend down the tree, preferably to the left (or to the right
1211          if that's the only child).  */
1212       while (node->left || node->right)
1213         if (node->left)
1214           node = node->left;
1215         else
1216           node = node->right;
1217
1218       do
1219         {
1220           reg_errcode_t err = fn (extra, node);
1221           if (BE (err != REG_NOERROR, 0))
1222             return err;
1223           if (node->parent == NULL)
1224             return REG_NOERROR;
1225           prev = node;
1226           node = node->parent;
1227         }
1228       /* Go up while we have a node that is reached from the right.  */
1229       while (node->right == prev || node->right == NULL);
1230       node = node->right;
1231     }
1232 }
1233
1234 static reg_errcode_t
1235 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1236           void *extra)
1237 {
1238   bin_tree_t *node;
1239
1240   for (node = root; ; )
1241     {
1242       reg_errcode_t err = fn (extra, node);
1243       if (BE (err != REG_NOERROR, 0))
1244         return err;
1245
1246       /* Go to the left node, or up and to the right.  */
1247       if (node->left)
1248         node = node->left;
1249       else
1250         {
1251           bin_tree_t *prev = NULL;
1252           while (node->right == prev || node->right == NULL)
1253             {
1254               prev = node;
1255               node = node->parent;
1256               if (!node)
1257                 return REG_NOERROR;
1258             }
1259           node = node->right;
1260         }
1261     }
1262 }
1263
1264 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1266    backreferences as well.  Requires a preorder visit.  */
1267 static reg_errcode_t
1268 optimize_subexps (void *extra, bin_tree_t *node)
1269 {
1270   re_dfa_t *dfa = (re_dfa_t *) extra;
1271
1272   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1273     {
1274       int idx = node->token.opr.idx;
1275       node->token.opr.idx = dfa->subexp_map[idx];
1276       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1277     }
1278
1279   else if (node->token.type == SUBEXP
1280            && node->left && node->left->token.type == SUBEXP)
1281     {
1282       int other_idx = node->left->token.opr.idx;
1283
1284       node->left = node->left->left;
1285       if (node->left)
1286         node->left->parent = node;
1287
1288       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1289       if (other_idx < BITSET_WORD_BITS)
1290           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1291     }
1292
1293   return REG_NOERROR;
1294 }
1295
1296 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1297    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1298 static reg_errcode_t
1299 lower_subexps (void *extra, bin_tree_t *node)
1300 {
1301   regex_t *preg = (regex_t *) extra;
1302   reg_errcode_t err = REG_NOERROR;
1303
1304   if (node->left && node->left->token.type == SUBEXP)
1305     {
1306       node->left = lower_subexp (&err, preg, node->left);
1307       if (node->left)
1308         node->left->parent = node;
1309     }
1310   if (node->right && node->right->token.type == SUBEXP)
1311     {
1312       node->right = lower_subexp (&err, preg, node->right);
1313       if (node->right)
1314         node->right->parent = node;
1315     }
1316
1317   return err;
1318 }
1319
1320 static bin_tree_t *
1321 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1322 {
1323   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1324   bin_tree_t *body = node->left;
1325   bin_tree_t *op, *cls, *tree1, *tree;
1326
1327   if (preg->no_sub
1328       /* We do not optimize empty subexpressions, because otherwise we may
1329          have bad CONCAT nodes with NULL children.  This is obviously not
1330          very common, so we do not lose much.  An example that triggers
1331          this case is the sed "script" /\(\)/x.  */
1332       && node->left != NULL
1333       && (node->token.opr.idx >= BITSET_WORD_BITS
1334           || !(dfa->used_bkref_map
1335                & ((bitset_word_t) 1 << node->token.opr.idx))))
1336     return node->left;
1337
1338   /* Convert the SUBEXP node to the concatenation of an
1339      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1340   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1341   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1342   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1343   tree = create_tree (dfa, op, tree1, CONCAT);
1344   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345     {
1346       *err = REG_ESPACE;
1347       return NULL;
1348     }
1349
1350   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1351   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1352   return tree;
1353 }
1354
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356    nodes.  Requires a postorder visit.  */
1357 static reg_errcode_t
1358 calc_first (void *extra, bin_tree_t *node)
1359 {
1360   re_dfa_t *dfa = (re_dfa_t *) extra;
1361   if (node->token.type == CONCAT)
1362     {
1363       node->first = node->left->first;
1364       node->node_idx = node->left->node_idx;
1365     }
1366   else
1367     {
1368       node->first = node;
1369       node->node_idx = re_dfa_add_node (dfa, node->token);
1370       if (BE (node->node_idx == -1, 0))
1371         return REG_ESPACE;
1372       if (node->token.type == ANCHOR)
1373         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1374     }
1375   return REG_NOERROR;
1376 }
1377
1378 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1379 static reg_errcode_t
1380 calc_next (void *extra, bin_tree_t *node)
1381 {
1382   switch (node->token.type)
1383     {
1384     case OP_DUP_ASTERISK:
1385       node->left->next = node;
1386       break;
1387     case CONCAT:
1388       node->left->next = node->right->first;
1389       node->right->next = node->next;
1390       break;
1391     default:
1392       if (node->left)
1393         node->left->next = node->next;
1394       if (node->right)
1395         node->right->next = node->next;
1396       break;
1397     }
1398   return REG_NOERROR;
1399 }
1400
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1402 static reg_errcode_t
1403 link_nfa_nodes (void *extra, bin_tree_t *node)
1404 {
1405   re_dfa_t *dfa = (re_dfa_t *) extra;
1406   int idx = node->node_idx;
1407   reg_errcode_t err = REG_NOERROR;
1408
1409   switch (node->token.type)
1410     {
1411     case CONCAT:
1412       break;
1413
1414     case END_OF_RE:
1415       assert (node->next == NULL);
1416       break;
1417
1418     case OP_DUP_ASTERISK:
1419     case OP_ALT:
1420       {
1421         int left, right;
1422         dfa->has_plural_match = 1;
1423         if (node->left != NULL)
1424           left = node->left->first->node_idx;
1425         else
1426           left = node->next->node_idx;
1427         if (node->right != NULL)
1428           right = node->right->first->node_idx;
1429         else
1430           right = node->next->node_idx;
1431         assert (left > -1);
1432         assert (right > -1);
1433         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1434       }
1435       break;
1436
1437     case ANCHOR:
1438     case OP_OPEN_SUBEXP:
1439     case OP_CLOSE_SUBEXP:
1440       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1441       break;
1442
1443     case OP_BACK_REF:
1444       dfa->nexts[idx] = node->next->node_idx;
1445       if (node->token.type == OP_BACK_REF)
1446         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1447       break;
1448
1449     default:
1450       assert (!IS_EPSILON_NODE (node->token.type));
1451       dfa->nexts[idx] = node->next->node_idx;
1452       break;
1453     }
1454
1455   return err;
1456 }
1457
1458 /* Duplicate the epsilon closure of the node ROOT_NODE.
1459    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1460    to their own constraint.  */
1461
1462 static reg_errcode_t
1463 internal_function
1464 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1465                         int root_node, unsigned int init_constraint)
1466 {
1467   int org_node, clone_node, ret;
1468   unsigned int constraint = init_constraint;
1469   for (org_node = top_org_node, clone_node = top_clone_node;;)
1470     {
1471       int org_dest, clone_dest;
1472       if (dfa->nodes[org_node].type == OP_BACK_REF)
1473         {
1474           /* If the back reference epsilon-transit, its destination must
1475              also have the constraint.  Then duplicate the epsilon closure
1476              of the destination of the back reference, and store it in
1477              edests of the back reference.  */
1478           org_dest = dfa->nexts[org_node];
1479           re_node_set_empty (dfa->edests + clone_node);
1480           clone_dest = duplicate_node (dfa, org_dest, constraint);
1481           if (BE (clone_dest == -1, 0))
1482             return REG_ESPACE;
1483           dfa->nexts[clone_node] = dfa->nexts[org_node];
1484           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485           if (BE (ret < 0, 0))
1486             return REG_ESPACE;
1487         }
1488       else if (dfa->edests[org_node].nelem == 0)
1489         {
1490           /* In case of the node can't epsilon-transit, don't duplicate the
1491              destination and store the original destination as the
1492              destination of the node.  */
1493           dfa->nexts[clone_node] = dfa->nexts[org_node];
1494           break;
1495         }
1496       else if (dfa->edests[org_node].nelem == 1)
1497         {
1498           /* In case of the node can epsilon-transit, and it has only one
1499              destination.  */
1500           org_dest = dfa->edests[org_node].elems[0];
1501           re_node_set_empty (dfa->edests + clone_node);
1502           /* If the node is root_node itself, it means the epsilon closure
1503              has a loop.   Then tie it to the destination of the root_node.  */
1504           if (org_node == root_node && clone_node != org_node)
1505             {
1506               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507               if (BE (ret < 0, 0))
1508                 return REG_ESPACE;
1509               break;
1510             }
1511           /* In case the node has another constraint, append it.  */
1512           constraint |= dfa->nodes[org_node].constraint;
1513           clone_dest = duplicate_node (dfa, org_dest, constraint);
1514           if (BE (clone_dest == -1, 0))
1515             return REG_ESPACE;
1516           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517           if (BE (ret < 0, 0))
1518             return REG_ESPACE;
1519         }
1520       else /* dfa->edests[org_node].nelem == 2 */
1521         {
1522           /* In case of the node can epsilon-transit, and it has two
1523              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1524           org_dest = dfa->edests[org_node].elems[0];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           /* Search for a duplicated node which satisfies the constraint.  */
1527           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528           if (clone_dest == -1)
1529             {
1530               /* There is no such duplicated node, create a new one.  */
1531               reg_errcode_t err;
1532               clone_dest = duplicate_node (dfa, org_dest, constraint);
1533               if (BE (clone_dest == -1, 0))
1534                 return REG_ESPACE;
1535               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536               if (BE (ret < 0, 0))
1537                 return REG_ESPACE;
1538               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539                                             root_node, constraint);
1540               if (BE (err != REG_NOERROR, 0))
1541                 return err;
1542             }
1543           else
1544             {
1545               /* There is a duplicated node which satisfies the constraint,
1546                  use it to avoid infinite loop.  */
1547               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548               if (BE (ret < 0, 0))
1549                 return REG_ESPACE;
1550             }
1551
1552           org_dest = dfa->edests[org_node].elems[1];
1553           clone_dest = duplicate_node (dfa, org_dest, constraint);
1554           if (BE (clone_dest == -1, 0))
1555             return REG_ESPACE;
1556           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557           if (BE (ret < 0, 0))
1558             return REG_ESPACE;
1559         }
1560       org_node = org_dest;
1561       clone_node = clone_dest;
1562     }
1563   return REG_NOERROR;
1564 }
1565
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567    satisfies the constraint CONSTRAINT.  */
1568
1569 static int
1570 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1571                         unsigned int constraint)
1572 {
1573   int idx;
1574   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1575     {
1576       if (org_node == dfa->org_indices[idx]
1577           && constraint == dfa->nodes[idx].constraint)
1578         return idx; /* Found.  */
1579     }
1580   return -1; /* Not found.  */
1581 }
1582
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584    Return the index of the new node, or -1 if insufficient storage is
1585    available.  */
1586
1587 static int
1588 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1589 {
1590   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591   if (BE (dup_idx != -1, 1))
1592     {
1593       dfa->nodes[dup_idx].constraint = constraint;
1594       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595       dfa->nodes[dup_idx].duplicated = 1;
1596
1597       /* Store the index of the original node.  */
1598       dfa->org_indices[dup_idx] = org_idx;
1599     }
1600   return dup_idx;
1601 }
1602
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t *dfa)
1605 {
1606   int src, idx, ret;
1607   for (idx = 0; idx < dfa->nodes_len; ++idx)
1608     re_node_set_init_empty (dfa->inveclosures + idx);
1609
1610   for (src = 0; src < dfa->nodes_len; ++src)
1611     {
1612       int *elems = dfa->eclosures[src].elems;
1613       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1614         {
1615           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1616           if (BE (ret == -1, 0))
1617             return REG_ESPACE;
1618         }
1619     }
1620
1621   return REG_NOERROR;
1622 }
1623
1624 /* Calculate "eclosure" for all the node in DFA.  */
1625
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t *dfa)
1628 {
1629   int node_idx, incomplete;
1630 #ifdef DEBUG
1631   assert (dfa->nodes_len > 0);
1632 #endif
1633   incomplete = 0;
1634   /* For each nodes, calculate epsilon closure.  */
1635   for (node_idx = 0; ; ++node_idx)
1636     {
1637       reg_errcode_t err;
1638       re_node_set eclosure_elem;
1639       if (node_idx == dfa->nodes_len)
1640         {
1641           if (!incomplete)
1642             break;
1643           incomplete = 0;
1644           node_idx = 0;
1645         }
1646
1647 #ifdef DEBUG
1648       assert (dfa->eclosures[node_idx].nelem != -1);
1649 #endif
1650
1651       /* If we have already calculated, skip it.  */
1652       if (dfa->eclosures[node_idx].nelem != 0)
1653         continue;
1654       /* Calculate epsilon closure of 'node_idx'.  */
1655       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1656       if (BE (err != REG_NOERROR, 0))
1657         return err;
1658
1659       if (dfa->eclosures[node_idx].nelem == 0)
1660         {
1661           incomplete = 1;
1662           re_node_set_free (&eclosure_elem);
1663         }
1664     }
1665   return REG_NOERROR;
1666 }
1667
1668 /* Calculate epsilon closure of NODE.  */
1669
1670 static reg_errcode_t
1671 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1672 {
1673   reg_errcode_t err;
1674   int i;
1675   re_node_set eclosure;
1676   int ret;
1677   int incomplete = 0;
1678   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1679   if (BE (err != REG_NOERROR, 0))
1680     return err;
1681
1682   /* This indicates that we are calculating this node now.
1683      We reference this value to avoid infinite loop.  */
1684   dfa->eclosures[node].nelem = -1;
1685
1686   /* If the current node has constraints, duplicate all nodes
1687      since they must inherit the constraints.  */
1688   if (dfa->nodes[node].constraint
1689       && dfa->edests[node].nelem
1690       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1691     {
1692       err = duplicate_node_closure (dfa, node, node, node,
1693                                     dfa->nodes[node].constraint);
1694       if (BE (err != REG_NOERROR, 0))
1695         return err;
1696     }
1697
1698   /* Expand each epsilon destination nodes.  */
1699   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1700     for (i = 0; i < dfa->edests[node].nelem; ++i)
1701       {
1702         re_node_set eclosure_elem;
1703         int edest = dfa->edests[node].elems[i];
1704         /* If calculating the epsilon closure of `edest' is in progress,
1705            return intermediate result.  */
1706         if (dfa->eclosures[edest].nelem == -1)
1707           {
1708             incomplete = 1;
1709             continue;
1710           }
1711         /* If we haven't calculated the epsilon closure of `edest' yet,
1712            calculate now. Otherwise use calculated epsilon closure.  */
1713         if (dfa->eclosures[edest].nelem == 0)
1714           {
1715             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1716             if (BE (err != REG_NOERROR, 0))
1717               return err;
1718           }
1719         else
1720           eclosure_elem = dfa->eclosures[edest];
1721         /* Merge the epsilon closure of 'edest'.  */
1722         err = re_node_set_merge (&eclosure, &eclosure_elem);
1723         if (BE (err != REG_NOERROR, 0))
1724           return err;
1725         /* If the epsilon closure of 'edest' is incomplete,
1726            the epsilon closure of this node is also incomplete.  */
1727         if (dfa->eclosures[edest].nelem == 0)
1728           {
1729             incomplete = 1;
1730             re_node_set_free (&eclosure_elem);
1731           }
1732       }
1733
1734   /* An epsilon closure includes itself.  */
1735   ret = re_node_set_insert (&eclosure, node);
1736   if (BE (ret < 0, 0))
1737     return REG_ESPACE;
1738   if (incomplete && !root)
1739     dfa->eclosures[node].nelem = 0;
1740   else
1741     dfa->eclosures[node] = eclosure;
1742   *new_set = eclosure;
1743   return REG_NOERROR;
1744 }
1745 \f
1746 /* Functions for token which are used in the parser.  */
1747
1748 /* Fetch a token from INPUT.
1749    We must not use this function inside bracket expressions.  */
1750
1751 static void
1752 internal_function
1753 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1754 {
1755   re_string_skip_bytes (input, peek_token (result, input, syntax));
1756 }
1757
1758 /* Peek a token from INPUT, and return the length of the token.
1759    We must not use this function inside bracket expressions.  */
1760
1761 static int
1762 internal_function
1763 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1764 {
1765   unsigned char c;
1766
1767   if (re_string_eoi (input))
1768     {
1769       token->type = END_OF_RE;
1770       return 0;
1771     }
1772
1773   c = re_string_peek_byte (input, 0);
1774   token->opr.c = c;
1775
1776   token->word_char = 0;
1777 #ifdef RE_ENABLE_I18N
1778   token->mb_partial = 0;
1779   if (input->mb_cur_max > 1 &&
1780       !re_string_first_byte (input, re_string_cur_idx (input)))
1781     {
1782       token->type = CHARACTER;
1783       token->mb_partial = 1;
1784       return 1;
1785     }
1786 #endif
1787   if (c == '\\')
1788     {
1789       unsigned char c2;
1790       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1791         {
1792           token->type = BACK_SLASH;
1793           return 1;
1794         }
1795
1796       c2 = re_string_peek_byte_case (input, 1);
1797       token->opr.c = c2;
1798       token->type = CHARACTER;
1799 #ifdef RE_ENABLE_I18N
1800       if (input->mb_cur_max > 1)
1801         {
1802           wint_t wc = re_string_wchar_at (input,
1803                                           re_string_cur_idx (input) + 1);
1804           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1805         }
1806       else
1807 #endif
1808         token->word_char = IS_WORD_CHAR (c2) != 0;
1809
1810       switch (c2)
1811         {
1812         case '|':
1813           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1814             token->type = OP_ALT;
1815           break;
1816         case '1': case '2': case '3': case '4': case '5':
1817         case '6': case '7': case '8': case '9':
1818           if (!(syntax & RE_NO_BK_REFS))
1819             {
1820               token->type = OP_BACK_REF;
1821               token->opr.idx = c2 - '1';
1822             }
1823           break;
1824         case '<':
1825           if (!(syntax & RE_NO_GNU_OPS))
1826             {
1827               token->type = ANCHOR;
1828               token->opr.ctx_type = WORD_FIRST;
1829             }
1830           break;
1831         case '>':
1832           if (!(syntax & RE_NO_GNU_OPS))
1833             {
1834               token->type = ANCHOR;
1835               token->opr.ctx_type = WORD_LAST;
1836             }
1837           break;
1838         case 'b':
1839           if (!(syntax & RE_NO_GNU_OPS))
1840             {
1841               token->type = ANCHOR;
1842               token->opr.ctx_type = WORD_DELIM;
1843             }
1844           break;
1845         case 'B':
1846           if (!(syntax & RE_NO_GNU_OPS))
1847             {
1848               token->type = ANCHOR;
1849               token->opr.ctx_type = NOT_WORD_DELIM;
1850             }
1851           break;
1852         case 'w':
1853           if (!(syntax & RE_NO_GNU_OPS))
1854             token->type = OP_WORD;
1855           break;
1856         case 'W':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             token->type = OP_NOTWORD;
1859           break;
1860         case 's':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             token->type = OP_SPACE;
1863           break;
1864         case 'S':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             token->type = OP_NOTSPACE;
1867           break;
1868         case '`':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             {
1871               token->type = ANCHOR;
1872               token->opr.ctx_type = BUF_FIRST;
1873             }
1874           break;
1875         case '\'':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             {
1878               token->type = ANCHOR;
1879               token->opr.ctx_type = BUF_LAST;
1880             }
1881           break;
1882         case '(':
1883           if (!(syntax & RE_NO_BK_PARENS))
1884             token->type = OP_OPEN_SUBEXP;
1885           break;
1886         case ')':
1887           if (!(syntax & RE_NO_BK_PARENS))
1888             token->type = OP_CLOSE_SUBEXP;
1889           break;
1890         case '+':
1891           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892             token->type = OP_DUP_PLUS;
1893           break;
1894         case '?':
1895           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896             token->type = OP_DUP_QUESTION;
1897           break;
1898         case '{':
1899           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900             token->type = OP_OPEN_DUP_NUM;
1901           break;
1902         case '}':
1903           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904             token->type = OP_CLOSE_DUP_NUM;
1905           break;
1906         default:
1907           break;
1908         }
1909       return 2;
1910     }
1911
1912   token->type = CHARACTER;
1913 #ifdef RE_ENABLE_I18N
1914   if (input->mb_cur_max > 1)
1915     {
1916       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1917       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1918     }
1919   else
1920 #endif
1921     token->word_char = IS_WORD_CHAR (token->opr.c);
1922
1923   switch (c)
1924     {
1925     case '\n':
1926       if (syntax & RE_NEWLINE_ALT)
1927         token->type = OP_ALT;
1928       break;
1929     case '|':
1930       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1931         token->type = OP_ALT;
1932       break;
1933     case '*':
1934       token->type = OP_DUP_ASTERISK;
1935       break;
1936     case '+':
1937       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1938         token->type = OP_DUP_PLUS;
1939       break;
1940     case '?':
1941       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1942         token->type = OP_DUP_QUESTION;
1943       break;
1944     case '{':
1945       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1946         token->type = OP_OPEN_DUP_NUM;
1947       break;
1948     case '}':
1949       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1950         token->type = OP_CLOSE_DUP_NUM;
1951       break;
1952     case '(':
1953       if (syntax & RE_NO_BK_PARENS)
1954         token->type = OP_OPEN_SUBEXP;
1955       break;
1956     case ')':
1957       if (syntax & RE_NO_BK_PARENS)
1958         token->type = OP_CLOSE_SUBEXP;
1959       break;
1960     case '[':
1961       token->type = OP_OPEN_BRACKET;
1962       break;
1963     case '.':
1964       token->type = OP_PERIOD;
1965       break;
1966     case '^':
1967       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1968           re_string_cur_idx (input) != 0)
1969         {
1970           char prev = re_string_peek_byte (input, -1);
1971           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1972             break;
1973         }
1974       token->type = ANCHOR;
1975       token->opr.ctx_type = LINE_FIRST;
1976       break;
1977     case '$':
1978       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1979           re_string_cur_idx (input) + 1 != re_string_length (input))
1980         {
1981           re_token_t next;
1982           re_string_skip_bytes (input, 1);
1983           peek_token (&next, input, syntax);
1984           re_string_skip_bytes (input, -1);
1985           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1986             break;
1987         }
1988       token->type = ANCHOR;
1989       token->opr.ctx_type = LINE_LAST;
1990       break;
1991     default:
1992       break;
1993     }
1994   return 1;
1995 }
1996
1997 /* Peek a token from INPUT, and return the length of the token.
1998    We must not use this function out of bracket expressions.  */
1999
2000 static int
2001 internal_function
2002 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2003 {
2004   unsigned char c;
2005   if (re_string_eoi (input))
2006     {
2007       token->type = END_OF_RE;
2008       return 0;
2009     }
2010   c = re_string_peek_byte (input, 0);
2011   token->opr.c = c;
2012
2013 #ifdef RE_ENABLE_I18N
2014   if (input->mb_cur_max > 1 &&
2015       !re_string_first_byte (input, re_string_cur_idx (input)))
2016     {
2017       token->type = CHARACTER;
2018       return 1;
2019     }
2020 #endif /* RE_ENABLE_I18N */
2021
2022   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2023       && re_string_cur_idx (input) + 1 < re_string_length (input))
2024     {
2025       /* In this case, '\' escape a character.  */
2026       unsigned char c2;
2027       re_string_skip_bytes (input, 1);
2028       c2 = re_string_peek_byte (input, 0);
2029       token->opr.c = c2;
2030       token->type = CHARACTER;
2031       return 1;
2032     }
2033   if (c == '[') /* '[' is a special char in a bracket exps.  */
2034     {
2035       unsigned char c2;
2036       int token_len;
2037       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2038         c2 = re_string_peek_byte (input, 1);
2039       else
2040         c2 = 0;
2041       token->opr.c = c2;
2042       token_len = 2;
2043       switch (c2)
2044         {
2045         case '.':
2046           token->type = OP_OPEN_COLL_ELEM;
2047           break;
2048         case '=':
2049           token->type = OP_OPEN_EQUIV_CLASS;
2050           break;
2051         case ':':
2052           if (syntax & RE_CHAR_CLASSES)
2053             {
2054               token->type = OP_OPEN_CHAR_CLASS;
2055               break;
2056             }
2057           /* else fall through.  */
2058         default:
2059           token->type = CHARACTER;
2060           token->opr.c = c;
2061           token_len = 1;
2062           break;
2063         }
2064       return token_len;
2065     }
2066   switch (c)
2067     {
2068     case '-':
2069       token->type = OP_CHARSET_RANGE;
2070       break;
2071     case ']':
2072       token->type = OP_CLOSE_BRACKET;
2073       break;
2074     case '^':
2075       token->type = OP_NON_MATCH_LIST;
2076       break;
2077     default:
2078       token->type = CHARACTER;
2079     }
2080   return 1;
2081 }
2082 \f
2083 /* Functions for parser.  */
2084
2085 /* Entry point of the parser.
2086    Parse the regular expression REGEXP and return the structure tree.
2087    If an error occurs, ERR is set by error code, and return NULL.
2088    This function build the following tree, from regular expression <reg_exp>:
2089            CAT
2090            / \
2091           /   \
2092    <reg_exp>  EOR
2093
2094    CAT means concatenation.
2095    EOR means end of regular expression.  */
2096
2097 static bin_tree_t *
2098 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2099        reg_errcode_t *err)
2100 {
2101   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2102   bin_tree_t *tree, *eor, *root;
2103   re_token_t current_token;
2104   dfa->syntax = syntax;
2105   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2106   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2107   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108     return NULL;
2109   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2110   if (tree != NULL)
2111     root = create_tree (dfa, tree, eor, CONCAT);
2112   else
2113     root = eor;
2114   if (BE (eor == NULL || root == NULL, 0))
2115     {
2116       *err = REG_ESPACE;
2117       return NULL;
2118     }
2119   return root;
2120 }
2121
2122 /* This function build the following tree, from regular expression
2123    <branch1>|<branch2>:
2124            ALT
2125            / \
2126           /   \
2127    <branch1> <branch2>
2128
2129    ALT means alternative, which represents the operator '|'.  */
2130
2131 static bin_tree_t *
2132 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2134 {
2135   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2136   bin_tree_t *tree, *branch = NULL;
2137   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2138   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2139     return NULL;
2140
2141   while (token->type == OP_ALT)
2142     {
2143       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2144       if (token->type != OP_ALT && token->type != END_OF_RE
2145           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2146         {
2147           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2148           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2149             {
2150               if (tree != NULL)
2151                 postorder (tree, free_tree, NULL);
2152               return NULL;
2153             }
2154         }
2155       else
2156         branch = NULL;
2157       tree = create_tree (dfa, tree, branch, OP_ALT);
2158       if (BE (tree == NULL, 0))
2159         {
2160           *err = REG_ESPACE;
2161           return NULL;
2162         }
2163     }
2164   return tree;
2165 }
2166
2167 /* This function build the following tree, from regular expression
2168    <exp1><exp2>:
2169         CAT
2170         / \
2171        /   \
2172    <exp1> <exp2>
2173
2174    CAT means concatenation.  */
2175
2176 static bin_tree_t *
2177 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2179 {
2180   bin_tree_t *tree, *exp;
2181   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2182   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2183   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184     return NULL;
2185
2186   while (token->type != OP_ALT && token->type != END_OF_RE
2187          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2188     {
2189       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2190       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2191         {
2192           if (tree != NULL)
2193             postorder (tree, free_tree, NULL);
2194           return NULL;
2195         }
2196       if (tree != NULL && exp != NULL)
2197         {
2198           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2199           if (newtree == NULL)
2200             {
2201               postorder (exp, free_tree, NULL);
2202               postorder (tree, free_tree, NULL);
2203               *err = REG_ESPACE;
2204               return NULL;
2205             }
2206           tree = newtree;
2207         }
2208       else if (tree == NULL)
2209         tree = exp;
2210       /* Otherwise exp == NULL, we don't need to create new tree.  */
2211     }
2212   return tree;
2213 }
2214
2215 /* This function build the following tree, from regular expression a*:
2216          *
2217          |
2218          a
2219 */
2220
2221 static bin_tree_t *
2222 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2224 {
2225   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2226   bin_tree_t *tree;
2227   switch (token->type)
2228     {
2229     case CHARACTER:
2230       tree = create_token_tree (dfa, NULL, NULL, token);
2231       if (BE (tree == NULL, 0))
2232         {
2233           *err = REG_ESPACE;
2234           return NULL;
2235         }
2236 #ifdef RE_ENABLE_I18N
2237       if (dfa->mb_cur_max > 1)
2238         {
2239           while (!re_string_eoi (regexp)
2240                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2241             {
2242               bin_tree_t *mbc_remain;
2243               fetch_token (token, regexp, syntax);
2244               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2245               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2246               if (BE (mbc_remain == NULL || tree == NULL, 0))
2247                 {
2248                   *err = REG_ESPACE;
2249                   return NULL;
2250                 }
2251             }
2252         }
2253 #endif
2254       break;
2255     case OP_OPEN_SUBEXP:
2256       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2257       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258         return NULL;
2259       break;
2260     case OP_OPEN_BRACKET:
2261       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2262       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2263         return NULL;
2264       break;
2265     case OP_BACK_REF:
2266       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2267         {
2268           *err = REG_ESUBREG;
2269           return NULL;
2270         }
2271       dfa->used_bkref_map |= 1 << token->opr.idx;
2272       tree = create_token_tree (dfa, NULL, NULL, token);
2273       if (BE (tree == NULL, 0))
2274         {
2275           *err = REG_ESPACE;
2276           return NULL;
2277         }
2278       ++dfa->nbackref;
2279       dfa->has_mb_node = 1;
2280       break;
2281     case OP_OPEN_DUP_NUM:
2282       if (syntax & RE_CONTEXT_INVALID_DUP)
2283         {
2284           *err = REG_BADRPT;
2285           return NULL;
2286         }
2287       /* FALLTHROUGH */
2288     case OP_DUP_ASTERISK:
2289     case OP_DUP_PLUS:
2290     case OP_DUP_QUESTION:
2291       if (syntax & RE_CONTEXT_INVALID_OPS)
2292         {
2293           *err = REG_BADRPT;
2294           return NULL;
2295         }
2296       else if (syntax & RE_CONTEXT_INDEP_OPS)
2297         {
2298           fetch_token (token, regexp, syntax);
2299           return parse_expression (regexp, preg, token, syntax, nest, err);
2300         }
2301       /* else fall through  */
2302     case OP_CLOSE_SUBEXP:
2303       if ((token->type == OP_CLOSE_SUBEXP) &&
2304           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2305         {
2306           *err = REG_ERPAREN;
2307           return NULL;
2308         }
2309       /* else fall through  */
2310     case OP_CLOSE_DUP_NUM:
2311       /* We treat it as a normal character.  */
2312
2313       /* Then we can these characters as normal characters.  */
2314       token->type = CHARACTER;
2315       /* mb_partial and word_char bits should be initialized already
2316          by peek_token.  */
2317       tree = create_token_tree (dfa, NULL, NULL, token);
2318       if (BE (tree == NULL, 0))
2319         {
2320           *err = REG_ESPACE;
2321           return NULL;
2322         }
2323       break;
2324     case ANCHOR:
2325       if ((token->opr.ctx_type
2326            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2327           && dfa->word_ops_used == 0)
2328         init_word_char (dfa);
2329       if (token->opr.ctx_type == WORD_DELIM
2330           || token->opr.ctx_type == NOT_WORD_DELIM)
2331         {
2332           bin_tree_t *tree_first, *tree_last;
2333           if (token->opr.ctx_type == WORD_DELIM)
2334             {
2335               token->opr.ctx_type = WORD_FIRST;
2336               tree_first = create_token_tree (dfa, NULL, NULL, token);
2337               token->opr.ctx_type = WORD_LAST;
2338             }
2339           else
2340             {
2341               token->opr.ctx_type = INSIDE_WORD;
2342               tree_first = create_token_tree (dfa, NULL, NULL, token);
2343               token->opr.ctx_type = INSIDE_NOTWORD;
2344             }
2345           tree_last = create_token_tree (dfa, NULL, NULL, token);
2346           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2347           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2348             {
2349               *err = REG_ESPACE;
2350               return NULL;
2351             }
2352         }
2353       else
2354         {
2355           tree = create_token_tree (dfa, NULL, NULL, token);
2356           if (BE (tree == NULL, 0))
2357             {
2358               *err = REG_ESPACE;
2359               return NULL;
2360             }
2361         }
2362       /* We must return here, since ANCHORs can't be followed
2363          by repetition operators.
2364          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2366       fetch_token (token, regexp, syntax);
2367       return tree;
2368     case OP_PERIOD:
2369       tree = create_token_tree (dfa, NULL, NULL, token);
2370       if (BE (tree == NULL, 0))
2371         {
2372           *err = REG_ESPACE;
2373           return NULL;
2374         }
2375       if (dfa->mb_cur_max > 1)
2376         dfa->has_mb_node = 1;
2377       break;
2378     case OP_WORD:
2379     case OP_NOTWORD:
2380       tree = build_charclass_op (dfa, regexp->trans,
2381                                  (const unsigned char *) "alnum",
2382                                  (const unsigned char *) "_",
2383                                  token->type == OP_NOTWORD, err);
2384       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385         return NULL;
2386       break;
2387     case OP_SPACE:
2388     case OP_NOTSPACE:
2389       tree = build_charclass_op (dfa, regexp->trans,
2390                                  (const unsigned char *) "space",
2391                                  (const unsigned char *) "",
2392                                  token->type == OP_NOTSPACE, err);
2393       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394         return NULL;
2395       break;
2396     case OP_ALT:
2397     case END_OF_RE:
2398       return NULL;
2399     case BACK_SLASH:
2400       *err = REG_EESCAPE;
2401       return NULL;
2402     default:
2403       /* Must not happen?  */
2404 #ifdef DEBUG
2405       assert (0);
2406 #endif
2407       return NULL;
2408     }
2409   fetch_token (token, regexp, syntax);
2410
2411   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2412          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2413     {
2414       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2415       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2416         {
2417           if (tree != NULL)
2418             postorder (tree, free_tree, NULL);
2419           return NULL;
2420         }
2421       tree = dup_tree;
2422       /* In BRE consecutive duplications are not allowed.  */
2423       if ((syntax & RE_CONTEXT_INVALID_DUP)
2424           && (token->type == OP_DUP_ASTERISK
2425               || token->type == OP_OPEN_DUP_NUM))
2426         {
2427           if (tree != NULL)
2428             postorder (tree, free_tree, NULL);
2429           *err = REG_BADRPT;
2430           return NULL;
2431         }
2432     }
2433
2434   return tree;
2435 }
2436
2437 /* This function build the following tree, from regular expression
2438    (<reg_exp>):
2439          SUBEXP
2440             |
2441         <reg_exp>
2442 */
2443
2444 static bin_tree_t *
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2447 {
2448   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449   bin_tree_t *tree;
2450   size_t cur_nsub;
2451   cur_nsub = preg->re_nsub++;
2452
2453   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2454
2455   /* The subexpression may be a null string.  */
2456   if (token->type == OP_CLOSE_SUBEXP)
2457     tree = NULL;
2458   else
2459     {
2460       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2462         {
2463           if (tree != NULL)
2464             postorder (tree, free_tree, NULL);
2465           *err = REG_EPAREN;
2466         }
2467       if (BE (*err != REG_NOERROR, 0))
2468         return NULL;
2469     }
2470
2471   if (cur_nsub <= '9' - '1')
2472     dfa->completed_bkref_map |= 1 << cur_nsub;
2473
2474   tree = create_tree (dfa, tree, NULL, SUBEXP);
2475   if (BE (tree == NULL, 0))
2476     {
2477       *err = REG_ESPACE;
2478       return NULL;
2479     }
2480   tree->token.opr.idx = cur_nsub;
2481   return tree;
2482 }
2483
2484 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2485
2486 static bin_tree_t *
2487 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2488               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2489 {
2490   bin_tree_t *tree = NULL, *old_tree = NULL;
2491   int i, start, end, start_idx = re_string_cur_idx (regexp);
2492   re_token_t start_token = *token;
2493
2494   if (token->type == OP_OPEN_DUP_NUM)
2495     {
2496       end = 0;
2497       start = fetch_number (regexp, token, syntax);
2498       if (start == -1)
2499         {
2500           if (token->type == CHARACTER && token->opr.c == ',')
2501             start = 0; /* We treat "{,m}" as "{0,m}".  */
2502           else
2503             {
2504               *err = REG_BADBR; /* <re>{} is invalid.  */
2505               return NULL;
2506             }
2507         }
2508       if (BE (start != -2, 1))
2509         {
2510           /* We treat "{n}" as "{n,n}".  */
2511           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2512                  : ((token->type == CHARACTER && token->opr.c == ',')
2513                     ? fetch_number (regexp, token, syntax) : -2));
2514         }
2515       if (BE (start == -2 || end == -2, 0))
2516         {
2517           /* Invalid sequence.  */
2518           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2519             {
2520               if (token->type == END_OF_RE)
2521                 *err = REG_EBRACE;
2522               else
2523                 *err = REG_BADBR;
2524
2525               return NULL;
2526             }
2527
2528           /* If the syntax bit is set, rollback.  */
2529           re_string_set_index (regexp, start_idx);
2530           *token = start_token;
2531           token->type = CHARACTER;
2532           /* mb_partial and word_char bits should be already initialized by
2533              peek_token.  */
2534           return elem;
2535         }
2536
2537       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2538         {
2539           /* First number greater than second.  */
2540           *err = REG_BADBR;
2541           return NULL;
2542         }
2543     }
2544   else
2545     {
2546       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2547       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2548     }
2549
2550   fetch_token (token, regexp, syntax);
2551
2552   if (BE (elem == NULL, 0))
2553     return NULL;
2554   if (BE (start == 0 && end == 0, 0))
2555     {
2556       postorder (elem, free_tree, NULL);
2557       return NULL;
2558     }
2559
2560   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2561   if (BE (start > 0, 0))
2562     {
2563       tree = elem;
2564       for (i = 2; i <= start; ++i)
2565         {
2566           elem = duplicate_tree (elem, dfa);
2567           tree = create_tree (dfa, tree, elem, CONCAT);
2568           if (BE (elem == NULL || tree == NULL, 0))
2569             goto parse_dup_op_espace;
2570         }
2571
2572       if (start == end)
2573         return tree;
2574
2575       /* Duplicate ELEM before it is marked optional.  */
2576       elem = duplicate_tree (elem, dfa);
2577       if (BE (elem == NULL, 0))
2578         goto parse_dup_op_espace;
2579       old_tree = tree;
2580     }
2581   else
2582     old_tree = NULL;
2583
2584   if (elem->token.type == SUBEXP)
2585     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2586
2587   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588   if (BE (tree == NULL, 0))
2589     goto parse_dup_op_espace;
2590
2591   /* This loop is actually executed only when end != -1,
2592      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2593      already created the start+1-th copy.  */
2594   for (i = start + 2; i <= end; ++i)
2595     {
2596       elem = duplicate_tree (elem, dfa);
2597       tree = create_tree (dfa, tree, elem, CONCAT);
2598       if (BE (elem == NULL || tree == NULL, 0))
2599         goto parse_dup_op_espace;
2600
2601       tree = create_tree (dfa, tree, NULL, OP_ALT);
2602       if (BE (tree == NULL, 0))
2603         goto parse_dup_op_espace;
2604     }
2605
2606   if (old_tree)
2607     tree = create_tree (dfa, old_tree, tree, CONCAT);
2608
2609   return tree;
2610
2611  parse_dup_op_espace:
2612   *err = REG_ESPACE;
2613   return NULL;
2614 }
2615
2616 /* Size of the names for collating symbol/equivalence_class/character_class.
2617    I'm not sure, but maybe enough.  */
2618 #define BRACKET_NAME_BUF_SIZE 32
2619
2620 #ifndef _LIBC
2621   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622      Build the range expression which starts from START_ELEM, and ends
2623      at END_ELEM.  The result are written to MBCSET and SBCSET.
2624      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625      mbcset->range_ends, is a pointer argument since we may
2626      update it.  */
2627
2628 static reg_errcode_t
2629 internal_function
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635                  bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2637 {
2638   unsigned int start_ch, end_ch;
2639   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2640   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2642           0))
2643     return REG_ERANGE;
2644
2645   /* We can handle no multi character collating elements without libc
2646      support.  */
2647   if (BE ((start_elem->type == COLL_SYM
2648            && strlen ((char *) start_elem->opr.name) > 1)
2649           || (end_elem->type == COLL_SYM
2650               && strlen ((char *) end_elem->opr.name) > 1), 0))
2651     return REG_ECOLLATE;
2652
2653 # ifdef RE_ENABLE_I18N
2654   {
2655     wchar_t wc;
2656     wint_t start_wc;
2657     wint_t end_wc;
2658     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2659
2660     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662                    : 0));
2663     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665                  : 0));
2666     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667                 ? __btowc (start_ch) : start_elem->opr.wch);
2668     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669               ? __btowc (end_ch) : end_elem->opr.wch);
2670     if (start_wc == WEOF || end_wc == WEOF)
2671       return REG_ECOLLATE;
2672     cmp_buf[0] = start_wc;
2673     cmp_buf[4] = end_wc;
2674     if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2675       return REG_ERANGE;
2676
2677     /* Got valid collation sequence values, add them as a new entry.
2678        However, for !_LIBC we have no collation elements: if the
2679        character set is single byte, the single byte character set
2680        that we build below suffices.  parse_bracket_exp passes
2681        no MBCSET if dfa->mb_cur_max == 1.  */
2682     if (mbcset)
2683       {
2684         /* Check the space of the arrays.  */
2685         if (BE (*range_alloc == mbcset->nranges, 0))
2686           {
2687             /* There is not enough space, need realloc.  */
2688             wchar_t *new_array_start, *new_array_end;
2689             int new_nranges;
2690
2691             /* +1 in case of mbcset->nranges is 0.  */
2692             new_nranges = 2 * mbcset->nranges + 1;
2693             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2694                are NULL if *range_alloc == 0.  */
2695             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2696                                           new_nranges);
2697             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2698                                         new_nranges);
2699
2700             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2701               return REG_ESPACE;
2702
2703             mbcset->range_starts = new_array_start;
2704             mbcset->range_ends = new_array_end;
2705             *range_alloc = new_nranges;
2706           }
2707
2708         mbcset->range_starts[mbcset->nranges] = start_wc;
2709         mbcset->range_ends[mbcset->nranges++] = end_wc;
2710       }
2711
2712     /* Build the table for single byte characters.  */
2713     for (wc = 0; wc < SBC_MAX; ++wc)
2714       {
2715         cmp_buf[2] = wc;
2716         if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2717             && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2718           bitset_set (sbcset, wc);
2719       }
2720   }
2721 # else /* not RE_ENABLE_I18N */
2722   {
2723     unsigned int ch;
2724     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2725                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726                    : 0));
2727     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2728               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729                  : 0));
2730     if (start_ch > end_ch)
2731       return REG_ERANGE;
2732     /* Build the table for single byte characters.  */
2733     for (ch = 0; ch < SBC_MAX; ++ch)
2734       if (start_ch <= ch  && ch <= end_ch)
2735         bitset_set (sbcset, ch);
2736   }
2737 # endif /* not RE_ENABLE_I18N */
2738   return REG_NOERROR;
2739 }
2740 #endif /* not _LIBC */
2741
2742 #ifndef _LIBC
2743 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2744    Build the collating element which is represented by NAME.
2745    The result are written to MBCSET and SBCSET.
2746    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2747    pointer argument since we may update it.  */
2748
2749 static reg_errcode_t
2750 internal_function
2751 # ifdef RE_ENABLE_I18N
2752 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2753                         int *coll_sym_alloc, const unsigned char *name)
2754 # else /* not RE_ENABLE_I18N */
2755 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2756 # endif /* not RE_ENABLE_I18N */
2757 {
2758   size_t name_len = strlen ((const char *) name);
2759   if (BE (name_len != 1, 0))
2760     return REG_ECOLLATE;
2761   else
2762     {
2763       bitset_set (sbcset, name[0]);
2764       return REG_NOERROR;
2765     }
2766 }
2767 #endif /* not _LIBC */
2768
2769 /* This function parse bracket expression like "[abc]", "[a-c]",
2770    "[[.a-a.]]" etc.  */
2771
2772 static bin_tree_t *
2773 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2774                    reg_syntax_t syntax, reg_errcode_t *err)
2775 {
2776 #ifdef _LIBC
2777   const unsigned char *collseqmb;
2778   const char *collseqwc;
2779   uint32_t nrules;
2780   int32_t table_size;
2781   const int32_t *symb_table;
2782   const unsigned char *extra;
2783
2784   /* Local function for parse_bracket_exp used in _LIBC environment.
2785      Seek the collating symbol entry corresponding to NAME.
2786      Return the index of the symbol in the SYMB_TABLE,
2787      or -1 if not found.  */
2788
2789   auto inline int32_t
2790   __attribute__ ((always_inline))
2791   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2792     {
2793       int32_t elem;
2794
2795       for (elem = 0; elem < table_size; elem++)
2796         if (symb_table[2 * elem] != 0)
2797           {
2798             int32_t idx = symb_table[2 * elem + 1];
2799             /* Skip the name of collating element name.  */
2800             idx += 1 + extra[idx];
2801             if (/* Compare the length of the name.  */
2802                 name_len == extra[idx]
2803                 /* Compare the name.  */
2804                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2805               /* Yep, this is the entry.  */
2806               return elem;
2807           }
2808       return -1;
2809     }
2810
2811   /* Local function for parse_bracket_exp used in _LIBC environment.
2812      Look up the collation sequence value of BR_ELEM.
2813      Return the value if succeeded, UINT_MAX otherwise.  */
2814
2815   auto inline unsigned int
2816   __attribute__ ((always_inline))
2817   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2818     {
2819       if (br_elem->type == SB_CHAR)
2820         {
2821           /*
2822           if (MB_CUR_MAX == 1)
2823           */
2824           if (nrules == 0)
2825             return collseqmb[br_elem->opr.ch];
2826           else
2827             {
2828               wint_t wc = __btowc (br_elem->opr.ch);
2829               return __collseq_table_lookup (collseqwc, wc);
2830             }
2831         }
2832       else if (br_elem->type == MB_CHAR)
2833         {
2834           if (nrules != 0)
2835             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2836         }
2837       else if (br_elem->type == COLL_SYM)
2838         {
2839           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2840           if (nrules != 0)
2841             {
2842               int32_t elem, idx;
2843               elem = seek_collating_symbol_entry (br_elem->opr.name,
2844                                                   sym_name_len);
2845               if (elem != -1)
2846                 {
2847                   /* We found the entry.  */
2848                   idx = symb_table[2 * elem + 1];
2849                   /* Skip the name of collating element name.  */
2850                   idx += 1 + extra[idx];
2851                   /* Skip the byte sequence of the collating element.  */
2852                   idx += 1 + extra[idx];
2853                   /* Adjust for the alignment.  */
2854                   idx = (idx + 3) & ~3;
2855                   /* Skip the multibyte collation sequence value.  */
2856                   idx += sizeof (unsigned int);
2857                   /* Skip the wide char sequence of the collating element.  */
2858                   idx += sizeof (unsigned int) *
2859                     (1 + *(unsigned int *) (extra + idx));
2860                   /* Return the collation sequence value.  */
2861                   return *(unsigned int *) (extra + idx);
2862                 }
2863               else if (sym_name_len == 1)
2864                 {
2865                   /* No valid character.  Match it as a single byte
2866                      character.  */
2867                   return collseqmb[br_elem->opr.name[0]];
2868                 }
2869             }
2870           else if (sym_name_len == 1)
2871             return collseqmb[br_elem->opr.name[0]];
2872         }
2873       return UINT_MAX;
2874     }
2875
2876   /* Local function for parse_bracket_exp used in _LIBC environment.
2877      Build the range expression which starts from START_ELEM, and ends
2878      at END_ELEM.  The result are written to MBCSET and SBCSET.
2879      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2880      mbcset->range_ends, is a pointer argument since we may
2881      update it.  */
2882
2883   auto inline reg_errcode_t
2884   __attribute__ ((always_inline))
2885   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2886                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2887     {
2888       unsigned int ch;
2889       uint32_t start_collseq;
2890       uint32_t end_collseq;
2891
2892       /* Equivalence Classes and Character Classes can't be a range
2893          start/end.  */
2894       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2895               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2896               0))
2897         return REG_ERANGE;
2898
2899       start_collseq = lookup_collation_sequence_value (start_elem);
2900       end_collseq = lookup_collation_sequence_value (end_elem);
2901       /* Check start/end collation sequence values.  */
2902       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2903         return REG_ECOLLATE;
2904       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2905         return REG_ERANGE;
2906
2907       /* Got valid collation sequence values, add them as a new entry.
2908          However, if we have no collation elements, and the character set
2909          is single byte, the single byte character set that we
2910          build below suffices. */
2911       if (nrules > 0 || dfa->mb_cur_max > 1)
2912         {
2913           /* Check the space of the arrays.  */
2914           if (BE (*range_alloc == mbcset->nranges, 0))
2915             {
2916               /* There is not enough space, need realloc.  */
2917               uint32_t *new_array_start;
2918               uint32_t *new_array_end;
2919               int new_nranges;
2920
2921               /* +1 in case of mbcset->nranges is 0.  */
2922               new_nranges = 2 * mbcset->nranges + 1;
2923               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2924                                             new_nranges);
2925               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2926                                           new_nranges);
2927
2928               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2929                 return REG_ESPACE;
2930
2931               mbcset->range_starts = new_array_start;
2932               mbcset->range_ends = new_array_end;
2933               *range_alloc = new_nranges;
2934             }
2935
2936           mbcset->range_starts[mbcset->nranges] = start_collseq;
2937           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2938         }
2939
2940       /* Build the table for single byte characters.  */
2941       for (ch = 0; ch < SBC_MAX; ch++)
2942         {
2943           uint32_t ch_collseq;
2944           /*
2945           if (MB_CUR_MAX == 1)
2946           */
2947           if (nrules == 0)
2948             ch_collseq = collseqmb[ch];
2949           else
2950             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2951           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2952             bitset_set (sbcset, ch);
2953         }
2954       return REG_NOERROR;
2955     }
2956
2957   /* Local function for parse_bracket_exp used in _LIBC environment.
2958      Build the collating element which is represented by NAME.
2959      The result are written to MBCSET and SBCSET.
2960      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2961      pointer argument since we may update it.  */
2962
2963   auto inline reg_errcode_t
2964   __attribute__ ((always_inline))
2965   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2966                           int *coll_sym_alloc, const unsigned char *name)
2967     {
2968       int32_t elem, idx;
2969       size_t name_len = strlen ((const char *) name);
2970       if (nrules != 0)
2971         {
2972           elem = seek_collating_symbol_entry (name, name_len);
2973           if (elem != -1)
2974             {
2975               /* We found the entry.  */
2976               idx = symb_table[2 * elem + 1];
2977               /* Skip the name of collating element name.  */
2978               idx += 1 + extra[idx];
2979             }
2980           else if (name_len == 1)
2981             {
2982               /* No valid character, treat it as a normal
2983                  character.  */
2984               bitset_set (sbcset, name[0]);
2985               return REG_NOERROR;
2986             }
2987           else
2988             return REG_ECOLLATE;
2989
2990           /* Got valid collation sequence, add it as a new entry.  */
2991           /* Check the space of the arrays.  */
2992           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2993             {
2994               /* Not enough, realloc it.  */
2995               /* +1 in case of mbcset->ncoll_syms is 0.  */
2996               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2997               /* Use realloc since mbcset->coll_syms is NULL
2998                  if *alloc == 0.  */
2999               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3000                                                    new_coll_sym_alloc);
3001               if (BE (new_coll_syms == NULL, 0))
3002                 return REG_ESPACE;
3003               mbcset->coll_syms = new_coll_syms;
3004               *coll_sym_alloc = new_coll_sym_alloc;
3005             }
3006           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3007           return REG_NOERROR;
3008         }
3009       else
3010         {
3011           if (BE (name_len != 1, 0))
3012             return REG_ECOLLATE;
3013           else
3014             {
3015               bitset_set (sbcset, name[0]);
3016               return REG_NOERROR;
3017             }
3018         }
3019     }
3020 #endif
3021
3022   re_token_t br_token;
3023   re_bitset_ptr_t sbcset;
3024 #ifdef RE_ENABLE_I18N
3025   re_charset_t *mbcset;
3026   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027   int equiv_class_alloc = 0, char_class_alloc = 0;
3028 #endif /* not RE_ENABLE_I18N */
3029   int non_match = 0;
3030   bin_tree_t *work_tree;
3031   int token_len;
3032   int first_round = 1;
3033 #ifdef _LIBC
3034   collseqmb = (const unsigned char *)
3035     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3036   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3037   if (nrules)
3038     {
3039       /*
3040       if (MB_CUR_MAX > 1)
3041       */
3042       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3043       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3044       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3045                                                   _NL_COLLATE_SYMB_TABLEMB);
3046       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3047                                                    _NL_COLLATE_SYMB_EXTRAMB);
3048     }
3049 #endif
3050   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3051 #ifdef RE_ENABLE_I18N
3052   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3053 #endif /* RE_ENABLE_I18N */
3054 #ifdef RE_ENABLE_I18N
3055   if (BE (sbcset == NULL || mbcset == NULL, 0))
3056 #else
3057   if (BE (sbcset == NULL, 0))
3058 #endif /* RE_ENABLE_I18N */
3059     {
3060       re_free (sbcset);
3061 #ifdef RE_ENABLE_I18N
3062       re_free (mbcset);
3063 #endif
3064       *err = REG_ESPACE;
3065       return NULL;
3066     }
3067
3068   token_len = peek_token_bracket (token, regexp, syntax);
3069   if (BE (token->type == END_OF_RE, 0))
3070     {
3071       *err = REG_BADPAT;
3072       goto parse_bracket_exp_free_return;
3073     }
3074   if (token->type == OP_NON_MATCH_LIST)
3075     {
3076 #ifdef RE_ENABLE_I18N
3077       mbcset->non_match = 1;
3078 #endif /* not RE_ENABLE_I18N */
3079       non_match = 1;
3080       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3081         bitset_set (sbcset, '\n');
3082       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3083       token_len = peek_token_bracket (token, regexp, syntax);
3084       if (BE (token->type == END_OF_RE, 0))
3085         {
3086           *err = REG_BADPAT;
3087           goto parse_bracket_exp_free_return;
3088         }
3089     }
3090
3091   /* We treat the first ']' as a normal character.  */
3092   if (token->type == OP_CLOSE_BRACKET)
3093     token->type = CHARACTER;
3094
3095   while (1)
3096     {
3097       bracket_elem_t start_elem, end_elem;
3098       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3099       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3100       reg_errcode_t ret;
3101       int token_len2 = 0, is_range_exp = 0;
3102       re_token_t token2;
3103
3104       start_elem.opr.name = start_name_buf;
3105       start_elem.type = COLL_SYM;
3106       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3107                                    syntax, first_round);
3108       if (BE (ret != REG_NOERROR, 0))
3109         {
3110           *err = ret;
3111           goto parse_bracket_exp_free_return;
3112         }
3113       first_round = 0;
3114
3115       /* Get information about the next token.  We need it in any case.  */
3116       token_len = peek_token_bracket (token, regexp, syntax);
3117
3118       /* Do not check for ranges if we know they are not allowed.  */
3119       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3120         {
3121           if (BE (token->type == END_OF_RE, 0))
3122             {
3123               *err = REG_EBRACK;
3124               goto parse_bracket_exp_free_return;
3125             }
3126           if (token->type == OP_CHARSET_RANGE)
3127             {
3128               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3129               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3130               if (BE (token2.type == END_OF_RE, 0))
3131                 {
3132                   *err = REG_EBRACK;
3133                   goto parse_bracket_exp_free_return;
3134                 }
3135               if (token2.type == OP_CLOSE_BRACKET)
3136                 {
3137                   /* We treat the last '-' as a normal character.  */
3138                   re_string_skip_bytes (regexp, -token_len);
3139                   token->type = CHARACTER;
3140                 }
3141               else
3142                 is_range_exp = 1;
3143             }
3144         }
3145
3146       if (is_range_exp == 1)
3147         {
3148           end_elem.opr.name = end_name_buf;
3149           end_elem.type = COLL_SYM;
3150           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3151                                        dfa, syntax, 1);
3152           if (BE (ret != REG_NOERROR, 0))
3153             {
3154               *err = ret;
3155               goto parse_bracket_exp_free_return;
3156             }
3157
3158           token_len = peek_token_bracket (token, regexp, syntax);
3159
3160 #ifdef _LIBC
3161           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3162                                   &start_elem, &end_elem);
3163 #else
3164 # ifdef RE_ENABLE_I18N
3165           *err = build_range_exp (sbcset,
3166                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3167                                   &range_alloc, &start_elem, &end_elem);
3168 # else
3169           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3170 # endif
3171 #endif /* RE_ENABLE_I18N */
3172           if (BE (*err != REG_NOERROR, 0))
3173             goto parse_bracket_exp_free_return;
3174         }
3175       else
3176         {
3177           switch (start_elem.type)
3178             {
3179             case SB_CHAR:
3180               bitset_set (sbcset, start_elem.opr.ch);
3181               break;
3182 #ifdef RE_ENABLE_I18N
3183             case MB_CHAR:
3184               /* Check whether the array has enough space.  */
3185               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3186                 {
3187                   wchar_t *new_mbchars;
3188                   /* Not enough, realloc it.  */
3189                   /* +1 in case of mbcset->nmbchars is 0.  */
3190                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3191                   /* Use realloc since array is NULL if *alloc == 0.  */
3192                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3193                                             mbchar_alloc);
3194                   if (BE (new_mbchars == NULL, 0))
3195                     goto parse_bracket_exp_espace;
3196                   mbcset->mbchars = new_mbchars;
3197                 }
3198               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3199               break;
3200 #endif /* RE_ENABLE_I18N */
3201             case EQUIV_CLASS:
3202               *err = build_equiv_class (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204                                         mbcset, &equiv_class_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206                                         start_elem.opr.name);
3207               if (BE (*err != REG_NOERROR, 0))
3208                 goto parse_bracket_exp_free_return;
3209               break;
3210             case COLL_SYM:
3211               *err = build_collating_symbol (sbcset,
3212 #ifdef RE_ENABLE_I18N
3213                                              mbcset, &coll_sym_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215                                              start_elem.opr.name);
3216               if (BE (*err != REG_NOERROR, 0))
3217                 goto parse_bracket_exp_free_return;
3218               break;
3219             case CHAR_CLASS:
3220               *err = build_charclass (regexp->trans, sbcset,
3221 #ifdef RE_ENABLE_I18N
3222                                       mbcset, &char_class_alloc,
3223 #endif /* RE_ENABLE_I18N */
3224                                       start_elem.opr.name, syntax);
3225               if (BE (*err != REG_NOERROR, 0))
3226                goto parse_bracket_exp_free_return;
3227               break;
3228             default:
3229               assert (0);
3230               break;
3231             }
3232         }
3233       if (BE (token->type == END_OF_RE, 0))
3234         {
3235           *err = REG_EBRACK;
3236           goto parse_bracket_exp_free_return;
3237         }
3238       if (token->type == OP_CLOSE_BRACKET)
3239         break;
3240     }
3241
3242   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3243
3244   /* If it is non-matching list.  */
3245   if (non_match)
3246     bitset_not (sbcset);
3247
3248 #ifdef RE_ENABLE_I18N
3249   /* Ensure only single byte characters are set.  */
3250   if (dfa->mb_cur_max > 1)
3251     bitset_mask (sbcset, dfa->sb_char);
3252
3253   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3254       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3255                                                      || mbcset->non_match)))
3256     {
3257       bin_tree_t *mbc_tree;
3258       int sbc_idx;
3259       /* Build a tree for complex bracket.  */
3260       dfa->has_mb_node = 1;
3261       br_token.type = COMPLEX_BRACKET;
3262       br_token.opr.mbcset = mbcset;
3263       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3264       if (BE (mbc_tree == NULL, 0))
3265         goto parse_bracket_exp_espace;
3266       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3267         if (sbcset[sbc_idx])
3268           break;
3269       /* If there are no bits set in sbcset, there is no point
3270          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3271       if (sbc_idx < BITSET_WORDS)
3272         {
3273           /* Build a tree for simple bracket.  */
3274           br_token.type = SIMPLE_BRACKET;
3275           br_token.opr.sbcset = sbcset;
3276           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277           if (BE (work_tree == NULL, 0))
3278             goto parse_bracket_exp_espace;
3279
3280           /* Then join them by ALT node.  */
3281           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3282           if (BE (work_tree == NULL, 0))
3283             goto parse_bracket_exp_espace;
3284         }
3285       else
3286         {
3287           re_free (sbcset);
3288           work_tree = mbc_tree;
3289         }
3290     }
3291   else
3292 #endif /* not RE_ENABLE_I18N */
3293     {
3294 #ifdef RE_ENABLE_I18N
3295       free_charset (mbcset);
3296 #endif
3297       /* Build a tree for simple bracket.  */
3298       br_token.type = SIMPLE_BRACKET;
3299       br_token.opr.sbcset = sbcset;
3300       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301       if (BE (work_tree == NULL, 0))
3302         goto parse_bracket_exp_espace;
3303     }
3304   return work_tree;
3305
3306  parse_bracket_exp_espace:
3307   *err = REG_ESPACE;
3308  parse_bracket_exp_free_return:
3309   re_free (sbcset);
3310 #ifdef RE_ENABLE_I18N
3311   free_charset (mbcset);
3312 #endif /* RE_ENABLE_I18N */
3313   return NULL;
3314 }
3315
3316 /* Parse an element in the bracket expression.  */
3317
3318 static reg_errcode_t
3319 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3320                        re_token_t *token, int token_len, re_dfa_t *dfa,
3321                        reg_syntax_t syntax, int accept_hyphen)
3322 {
3323 #ifdef RE_ENABLE_I18N
3324   int cur_char_size;
3325   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3326   if (cur_char_size > 1)
3327     {
3328       elem->type = MB_CHAR;
3329       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3330       re_string_skip_bytes (regexp, cur_char_size);
3331       return REG_NOERROR;
3332     }
3333 #endif /* RE_ENABLE_I18N */
3334   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3335   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3336       || token->type == OP_OPEN_EQUIV_CLASS)
3337     return parse_bracket_symbol (elem, regexp, token);
3338   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3339     {
3340       /* A '-' must only appear as anything but a range indicator before
3341          the closing bracket.  Everything else is an error.  */
3342       re_token_t token2;
3343       (void) peek_token_bracket (&token2, regexp, syntax);
3344       if (token2.type != OP_CLOSE_BRACKET)
3345         /* The actual error value is not standardized since this whole
3346            case is undefined.  But ERANGE makes good sense.  */
3347         return REG_ERANGE;
3348     }
3349   elem->type = SB_CHAR;
3350   elem->opr.ch = token->opr.c;
3351   return REG_NOERROR;
3352 }
3353
3354 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3355    such as [:<character_class>:], [.<collating_element>.], and
3356    [=<equivalent_class>=].  */
3357
3358 static reg_errcode_t
3359 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3360                       re_token_t *token)
3361 {
3362   unsigned char ch, delim = token->opr.c;
3363   int i = 0;
3364   if (re_string_eoi(regexp))
3365     return REG_EBRACK;
3366   for (;; ++i)
3367     {
3368       if (i >= BRACKET_NAME_BUF_SIZE)
3369         return REG_EBRACK;
3370       if (token->type == OP_OPEN_CHAR_CLASS)
3371         ch = re_string_fetch_byte_case (regexp);
3372       else
3373         ch = re_string_fetch_byte (regexp);
3374       if (re_string_eoi(regexp))
3375         return REG_EBRACK;
3376       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3377         break;
3378       elem->opr.name[i] = ch;
3379     }
3380   re_string_skip_bytes (regexp, 1);
3381   elem->opr.name[i] = '\0';
3382   switch (token->type)
3383     {
3384     case OP_OPEN_COLL_ELEM:
3385       elem->type = COLL_SYM;
3386       break;
3387     case OP_OPEN_EQUIV_CLASS:
3388       elem->type = EQUIV_CLASS;
3389       break;
3390     case OP_OPEN_CHAR_CLASS:
3391       elem->type = CHAR_CLASS;
3392       break;
3393     default:
3394       break;
3395     }
3396   return REG_NOERROR;
3397 }
3398
3399   /* Helper function for parse_bracket_exp.
3400      Build the equivalence class which is represented by NAME.
3401      The result are written to MBCSET and SBCSET.
3402      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3403      is a pointer argument since we may update it.  */
3404
3405 static reg_errcode_t
3406 #ifdef RE_ENABLE_I18N
3407 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3408                    int *equiv_class_alloc, const unsigned char *name)
3409 #else /* not RE_ENABLE_I18N */
3410 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3411 #endif /* not RE_ENABLE_I18N */
3412 {
3413 #ifdef _LIBC
3414   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3415   if (nrules != 0)
3416     {
3417       const int32_t *table, *indirect;
3418       const unsigned char *weights, *extra, *cp;
3419       unsigned char char_buf[2];
3420       int32_t idx1, idx2;
3421       unsigned int ch;
3422       size_t len;
3423       /* Calculate the index for equivalence class.  */
3424       cp = name;
3425       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3426       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3427                                                _NL_COLLATE_WEIGHTMB);
3428       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3429                                                    _NL_COLLATE_EXTRAMB);
3430       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3431                                                 _NL_COLLATE_INDIRECTMB);
3432       idx1 = findidx (table, indirect, extra, &cp, -1);
3433       if (BE (idx1 == 0 || *cp != '\0', 0))
3434         /* This isn't a valid character.  */
3435         return REG_ECOLLATE;
3436
3437       /* Build single byte matcing table for this equivalence class.  */
3438       len = weights[idx1 & 0xffffff];
3439       for (ch = 0; ch < SBC_MAX; ++ch)
3440         {
3441           char_buf[0] = ch;
3442           cp = char_buf;
3443           idx2 = findidx (table, indirect, extra, &cp, 1);
3444 /*
3445           idx2 = table[ch];
3446 */
3447           if (idx2 == 0)
3448             /* This isn't a valid character.  */
3449             continue;
3450           /* Compare only if the length matches and the collation rule
3451              index is the same.  */
3452           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3453             {
3454               int cnt = 0;
3455
3456               while (cnt <= len &&
3457                      weights[(idx1 & 0xffffff) + 1 + cnt]
3458                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3459                 ++cnt;
3460
3461               if (cnt > len)
3462                 bitset_set (sbcset, ch);
3463             }
3464         }
3465       /* Check whether the array has enough space.  */
3466       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3467         {
3468           /* Not enough, realloc it.  */
3469           /* +1 in case of mbcset->nequiv_classes is 0.  */
3470           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3471           /* Use realloc since the array is NULL if *alloc == 0.  */
3472           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3473                                                    int32_t,
3474                                                    new_equiv_class_alloc);
3475           if (BE (new_equiv_classes == NULL, 0))
3476             return REG_ESPACE;
3477           mbcset->equiv_classes = new_equiv_classes;
3478           *equiv_class_alloc = new_equiv_class_alloc;
3479         }
3480       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3481     }
3482   else
3483 #endif /* _LIBC */
3484     {
3485       if (BE (strlen ((const char *) name) != 1, 0))
3486         return REG_ECOLLATE;
3487       bitset_set (sbcset, *name);
3488     }
3489   return REG_NOERROR;
3490 }
3491
3492   /* Helper function for parse_bracket_exp.
3493      Build the character class which is represented by NAME.
3494      The result are written to MBCSET and SBCSET.
3495      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3496      is a pointer argument since we may update it.  */
3497
3498 static reg_errcode_t
3499 #ifdef RE_ENABLE_I18N
3500 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3501                  re_charset_t *mbcset, int *char_class_alloc,
3502                  const unsigned char *class_name, reg_syntax_t syntax)
3503 #else /* not RE_ENABLE_I18N */
3504 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3505                  const unsigned char *class_name, reg_syntax_t syntax)
3506 #endif /* not RE_ENABLE_I18N */
3507 {
3508   int i;
3509   const char *name = (const char *) class_name;
3510
3511   /* In case of REG_ICASE "upper" and "lower" match the both of
3512      upper and lower cases.  */
3513   if ((syntax & RE_ICASE)
3514       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3515     name = "alpha";
3516
3517 #ifdef RE_ENABLE_I18N
3518   /* Check the space of the arrays.  */
3519   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3520     {
3521       /* Not enough, realloc it.  */
3522       /* +1 in case of mbcset->nchar_classes is 0.  */
3523       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3524       /* Use realloc since array is NULL if *alloc == 0.  */
3525       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3526                                                new_char_class_alloc);
3527       if (BE (new_char_classes == NULL, 0))
3528         return REG_ESPACE;
3529       mbcset->char_classes = new_char_classes;
3530       *char_class_alloc = new_char_class_alloc;
3531     }
3532   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3533 #endif /* RE_ENABLE_I18N */
3534
3535 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3536   do {                                          \
3537     if (BE (trans != NULL, 0))                  \
3538       {                                         \
3539         for (i = 0; i < SBC_MAX; ++i)           \
3540           if (ctype_func (i))                   \
3541             bitset_set (sbcset, trans[i]);      \
3542       }                                         \
3543     else                                        \
3544       {                                         \
3545         for (i = 0; i < SBC_MAX; ++i)           \
3546           if (ctype_func (i))                   \
3547             bitset_set (sbcset, i);             \
3548       }                                         \
3549   } while (0)
3550
3551   if (strcmp (name, "alnum") == 0)
3552     BUILD_CHARCLASS_LOOP (isalnum);
3553   else if (strcmp (name, "cntrl") == 0)
3554     BUILD_CHARCLASS_LOOP (iscntrl);
3555   else if (strcmp (name, "lower") == 0)
3556     BUILD_CHARCLASS_LOOP (islower);
3557   else if (strcmp (name, "space") == 0)
3558     BUILD_CHARCLASS_LOOP (isspace);
3559   else if (strcmp (name, "alpha") == 0)
3560     BUILD_CHARCLASS_LOOP (isalpha);
3561   else if (strcmp (name, "digit") == 0)
3562     BUILD_CHARCLASS_LOOP (isdigit);
3563   else if (strcmp (name, "print") == 0)
3564     BUILD_CHARCLASS_LOOP (isprint);
3565   else if (strcmp (name, "upper") == 0)
3566     BUILD_CHARCLASS_LOOP (isupper);
3567   else if (strcmp (name, "blank") == 0)
3568     BUILD_CHARCLASS_LOOP (isblank);
3569   else if (strcmp (name, "graph") == 0)
3570     BUILD_CHARCLASS_LOOP (isgraph);
3571   else if (strcmp (name, "punct") == 0)
3572     BUILD_CHARCLASS_LOOP (ispunct);
3573   else if (strcmp (name, "xdigit") == 0)
3574     BUILD_CHARCLASS_LOOP (isxdigit);
3575   else
3576     return REG_ECTYPE;
3577
3578   return REG_NOERROR;
3579 }
3580
3581 static bin_tree_t *
3582 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3583                     const unsigned char *class_name,
3584                     const unsigned char *extra, int non_match,
3585                     reg_errcode_t *err)
3586 {
3587   re_bitset_ptr_t sbcset;
3588 #ifdef RE_ENABLE_I18N
3589   re_charset_t *mbcset;
3590   int alloc = 0;
3591 #endif /* not RE_ENABLE_I18N */
3592   reg_errcode_t ret;
3593   re_token_t br_token;
3594   bin_tree_t *tree;
3595
3596   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3597 #ifdef RE_ENABLE_I18N
3598   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3599 #endif /* RE_ENABLE_I18N */
3600
3601 #ifdef RE_ENABLE_I18N
3602   if (BE (sbcset == NULL || mbcset == NULL, 0))
3603 #else /* not RE_ENABLE_I18N */
3604   if (BE (sbcset == NULL, 0))
3605 #endif /* not RE_ENABLE_I18N */
3606     {
3607       *err = REG_ESPACE;
3608       return NULL;
3609     }
3610
3611   if (non_match)
3612     {
3613 #ifdef RE_ENABLE_I18N
3614       mbcset->non_match = 1;
3615 #endif /* not RE_ENABLE_I18N */
3616     }
3617
3618   /* We don't care the syntax in this case.  */
3619   ret = build_charclass (trans, sbcset,
3620 #ifdef RE_ENABLE_I18N
3621                          mbcset, &alloc,
3622 #endif /* RE_ENABLE_I18N */
3623                          class_name, 0);
3624
3625   if (BE (ret != REG_NOERROR, 0))
3626     {
3627       re_free (sbcset);
3628 #ifdef RE_ENABLE_I18N
3629       free_charset (mbcset);
3630 #endif /* RE_ENABLE_I18N */
3631       *err = ret;
3632       return NULL;
3633     }
3634   /* \w match '_' also.  */
3635   for (; *extra; extra++)
3636     bitset_set (sbcset, *extra);
3637
3638   /* If it is non-matching list.  */
3639   if (non_match)
3640     bitset_not (sbcset);
3641
3642 #ifdef RE_ENABLE_I18N
3643   /* Ensure only single byte characters are set.  */
3644   if (dfa->mb_cur_max > 1)
3645     bitset_mask (sbcset, dfa->sb_char);
3646 #endif
3647
3648   /* Build a tree for simple bracket.  */
3649   br_token.type = SIMPLE_BRACKET;
3650   br_token.opr.sbcset = sbcset;
3651   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3652   if (BE (tree == NULL, 0))
3653     goto build_word_op_espace;
3654
3655 #ifdef RE_ENABLE_I18N
3656   if (dfa->mb_cur_max > 1)
3657     {
3658       bin_tree_t *mbc_tree;
3659       /* Build a tree for complex bracket.  */
3660       br_token.type = COMPLEX_BRACKET;
3661       br_token.opr.mbcset = mbcset;
3662       dfa->has_mb_node = 1;
3663       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3664       if (BE (mbc_tree == NULL, 0))
3665         goto build_word_op_espace;
3666       /* Then join them by ALT node.  */
3667       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3668       if (BE (mbc_tree != NULL, 1))
3669         return tree;
3670     }
3671   else
3672     {
3673       free_charset (mbcset);
3674       return tree;
3675     }
3676 #else /* not RE_ENABLE_I18N */
3677   return tree;
3678 #endif /* not RE_ENABLE_I18N */
3679
3680  build_word_op_espace:
3681   re_free (sbcset);
3682 #ifdef RE_ENABLE_I18N
3683   free_charset (mbcset);
3684 #endif /* RE_ENABLE_I18N */
3685   *err = REG_ESPACE;
3686   return NULL;
3687 }
3688
3689 /* This is intended for the expressions like "a{1,3}".
3690    Fetch a number from `input', and return the number.
3691    Return -1, if the number field is empty like "{,1}".
3692    Return -2, If an error is occured.  */
3693
3694 static int
3695 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3696 {
3697   int num = -1;
3698   unsigned char c;
3699   while (1)
3700     {
3701       fetch_token (token, input, syntax);
3702       c = token->opr.c;
3703       if (BE (token->type == END_OF_RE, 0))
3704         return -2;
3705       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3706         break;
3707       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3708              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3709       num = (num > RE_DUP_MAX) ? -2 : num;
3710     }
3711   return num;
3712 }
3713 \f
3714 #ifdef RE_ENABLE_I18N
3715 static void
3716 free_charset (re_charset_t *cset)
3717 {
3718   re_free (cset->mbchars);
3719 # ifdef _LIBC
3720   re_free (cset->coll_syms);
3721   re_free (cset->equiv_classes);
3722   re_free (cset->range_starts);
3723   re_free (cset->range_ends);
3724 # endif
3725   re_free (cset->char_classes);
3726   re_free (cset);
3727 }
3728 #endif /* RE_ENABLE_I18N */
3729 \f
3730 /* Functions for binary tree operation.  */
3731
3732 /* Create a tree node.  */
3733
3734 static bin_tree_t *
3735 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3736              re_token_type_t type)
3737 {
3738   re_token_t t;
3739   t.type = type;
3740   return create_token_tree (dfa, left, right, &t);
3741 }
3742
3743 static bin_tree_t *
3744 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3745                    const re_token_t *token)
3746 {
3747   bin_tree_t *tree;
3748   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3749     {
3750       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3751
3752       if (storage == NULL)
3753         return NULL;
3754       storage->next = dfa->str_tree_storage;
3755       dfa->str_tree_storage = storage;
3756       dfa->str_tree_storage_idx = 0;
3757     }
3758   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3759
3760   tree->parent = NULL;
3761   tree->left = left;
3762   tree->right = right;
3763   tree->token = *token;
3764   tree->token.duplicated = 0;
3765   tree->token.opt_subexp = 0;
3766   tree->first = NULL;
3767   tree->next = NULL;
3768   tree->node_idx = -1;
3769
3770   if (left != NULL)
3771     left->parent = tree;
3772   if (right != NULL)
3773     right->parent = tree;
3774   return tree;
3775 }
3776
3777 /* Mark the tree SRC as an optional subexpression.
3778    To be called from preorder or postorder.  */
3779
3780 static reg_errcode_t
3781 mark_opt_subexp (void *extra, bin_tree_t *node)
3782 {
3783   int idx = (int) (long) extra;
3784   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3785     node->token.opt_subexp = 1;
3786
3787   return REG_NOERROR;
3788 }
3789
3790 /* Free the allocated memory inside NODE. */
3791
3792 static void
3793 free_token (re_token_t *node)
3794 {
3795 #ifdef RE_ENABLE_I18N
3796   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3797     free_charset (node->opr.mbcset);
3798   else
3799 #endif /* RE_ENABLE_I18N */
3800     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3801       re_free (node->opr.sbcset);
3802 }
3803
3804 /* Worker function for tree walking.  Free the allocated memory inside NODE
3805    and its children. */
3806
3807 static reg_errcode_t
3808 free_tree (void *extra, bin_tree_t *node)
3809 {
3810   free_token (&node->token);
3811   return REG_NOERROR;
3812 }
3813
3814
3815 /* Duplicate the node SRC, and return new node.  This is a preorder
3816    visit similar to the one implemented by the generic visitor, but
3817    we need more infrastructure to maintain two parallel trees --- so,
3818    it's easier to duplicate.  */
3819
3820 static bin_tree_t *
3821 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3822 {
3823   const bin_tree_t *node;
3824   bin_tree_t *dup_root;
3825   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3826
3827   for (node = root; ; )
3828     {
3829       /* Create a new tree and link it back to the current parent.  */
3830       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831       if (*p_new == NULL)
3832         return NULL;
3833       (*p_new)->parent = dup_node;
3834       (*p_new)->token.duplicated = 1;
3835       dup_node = *p_new;
3836
3837       /* Go to the left node, or up and to the right.  */
3838       if (node->left)
3839         {
3840           node = node->left;
3841           p_new = &dup_node->left;
3842         }
3843       else
3844         {
3845           const bin_tree_t *prev = NULL;
3846           while (node->right == prev || node->right == NULL)
3847             {
3848               prev = node;
3849               node = node->parent;
3850               dup_node = dup_node->parent;
3851               if (!node)
3852                 return dup_root;
3853             }
3854           node = node->right;
3855           p_new = &dup_node->right;
3856         }
3857     }
3858 }