posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 #ifdef _LIBC
  23 # include <locale/weight.h>
  24 #endif
  25
  26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  27                                           size_t length, reg_syntax_t syntax);
  28 static void re_compile_fastmap_iter (regex_t *bufp,
  29                                      const re_dfastate_t *init_state,
  30                                      char *fastmap);
  31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  32 #ifdef RE_ENABLE_I18N
  33 static void free_charset (re_charset_t *cset);
  34 #endif /* RE_ENABLE_I18N */
  35 static void free_workarea_compile (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 #ifdef RE_ENABLE_I18N
  38 static void optimize_utf8 (re_dfa_t *dfa);
  39 #endif
  40 static reg_errcode_t analyze (regex_t *preg);
  41 static reg_errcode_t preorder (bin_tree_t *root,
  42                                reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                void *extra);
  44 static reg_errcode_t postorder (bin_tree_t *root,
  45                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  46                                 void *extra);
  47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  50                                  bin_tree_t *node);
  51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static int peek_token (re_token_t *token, re_string_t *input,
  64                         reg_syntax_t syntax);
  65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  66                           reg_syntax_t syntax, reg_errcode_t *err);
  67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  68                                   re_token_t *token, reg_syntax_t syntax,
  69                                   int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  71                                  re_token_t *token, reg_syntax_t syntax,
  72                                  int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  74                                      re_token_t *token, reg_syntax_t syntax,
  75                                      int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  77                                   re_token_t *token, reg_syntax_t syntax,
  78                                   int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  80                                  re_dfa_t *dfa, re_token_t *token,
  81                                  reg_syntax_t syntax, reg_errcode_t *err);
  82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  83                                       re_token_t *token, reg_syntax_t syntax,
  84                                       reg_errcode_t *err);
  85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  86                                             re_string_t *regexp,
  87                                             re_token_t *token, int token_len,
  88                                             re_dfa_t *dfa,
  89                                             reg_syntax_t syntax,
  90                                             int accept_hyphen);
  91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  92                                           re_string_t *regexp,
  93                                           re_token_t *token);
  94 #ifdef RE_ENABLE_I18N
  95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  96                                         re_charset_t *mbcset,
  97                                         int *equiv_class_alloc,
  98                                         const unsigned char *name);
  99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 100                                       bitset_t sbcset,
 101                                       re_charset_t *mbcset,
 102                                       int *char_class_alloc,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #else  /* not RE_ENABLE_I18N */
 106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 107                                         const unsigned char *name);
 108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 109                                       bitset_t sbcset,
 110                                       const unsigned char *class_name,
 111                                       reg_syntax_t syntax);
 112 #endif /* not RE_ENABLE_I18N */
 113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 114                                        RE_TRANSLATE_TYPE trans,
 115                                        const unsigned char *class_name,
 116                                        const unsigned char *extra,
 117                                        int non_match, reg_errcode_t *err);
 118 static bin_tree_t *create_tree (re_dfa_t *dfa,
 119                                 bin_tree_t *left, bin_tree_t *right,
 120                                 re_token_type_t type);
 121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 122                                       bin_tree_t *left, bin_tree_t *right,
 123                                       const re_token_t *token);
 124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 125 static void free_token (re_token_t *node);
 126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 128 \f
 129 /* This table gives an error message for each of the error codes listed
 130    in regex.h.  Obviously the order here has to be same as there.
 131    POSIX doesn't require that we do anything for REG_NOERROR,
 132    but why not be nice?  */
 133
 134 const char __re_error_msgid[] attribute_hidden =
 135   {
 136 #define REG_NOERROR_IDX 0
 137     gettext_noop ("Success")    /* REG_NOERROR */
 138     "\0"
 139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 140     gettext_noop ("No match")   /* REG_NOMATCH */
 141     "\0"
 142 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 143     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 144     "\0"
 145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 146     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 147     "\0"
 148 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 149     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 150     "\0"
 151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 152     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 153     "\0"
 154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 155     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 156     "\0"
 157 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 158     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 159     "\0"
 160 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 161     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 162     "\0"
 163 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 164     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 165     "\0"
 166 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 167     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 168     "\0"
 169 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 170     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 171     "\0"
 172 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 173     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 174     "\0"
 175 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 176     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 177     "\0"
 178 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 179     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 180     "\0"
 181 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 182     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 183     "\0"
 184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 185     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 186   };
 187
 188 const size_t __re_error_msgid_idx[] attribute_hidden =
 189   {
 190     REG_NOERROR_IDX,
 191     REG_NOMATCH_IDX,
 192     REG_BADPAT_IDX,
 193     REG_ECOLLATE_IDX,
 194     REG_ECTYPE_IDX,
 195     REG_EESCAPE_IDX,
 196     REG_ESUBREG_IDX,
 197     REG_EBRACK_IDX,
 198     REG_EPAREN_IDX,
 199     REG_EBRACE_IDX,
 200     REG_BADBR_IDX,
 201     REG_ERANGE_IDX,
 202     REG_ESPACE_IDX,
 203     REG_BADRPT_IDX,
 204     REG_EEND_IDX,
 205     REG_ESIZE_IDX,
 206     REG_ERPAREN_IDX
 207   };
 208 \f
 209 /* Entry points for GNU code.  */
 210
 211 /* re_compile_pattern is the GNU regular expression compiler: it
 212    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 213    Returns 0 if the pattern was valid, otherwise an error string.
 214
 215    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 216    are set in BUFP on entry.  */
 217
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 {
 222   reg_errcode_t ret;
 223
 224   /* And GNU code determines whether or not to get register information
 225      by passing null for the REGS argument to re_match, etc., not by
 226      setting no_sub, unless RE_NO_SUB is set.  */
 227   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 228
 229   /* Match anchors at newline.  */
 230   bufp->newline_anchor = 1;
 231
 232   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 233
 234   if (!ret)
 235     return NULL;
 236   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 237 }
 238 #ifdef _LIBC
 239 weak_alias (__re_compile_pattern, re_compile_pattern)
 240 #endif
 241
 242 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 243    also be assigned to arbitrarily: each pattern buffer stores its own
 244    syntax, so it can be changed between regex compilations.  */
 245 /* This has no initializer because initialized variables in Emacs
 246    become read-only after dumping.  */
 247 reg_syntax_t re_syntax_options;
 248
 249
 250 /* Specify the precise syntax of regexps for compilation.  This provides
 251    for compatibility for various utilities which historically have
 252    different, incompatible syntaxes.
 253
 254    The argument SYNTAX is a bit mask comprised of the various bits
 255    defined in regex.h.  We return the old syntax.  */
 256
 257 reg_syntax_t
 258 re_set_syntax (reg_syntax_t syntax)
 259 {
 260   reg_syntax_t ret = re_syntax_options;
 261
 262   re_syntax_options = syntax;
 263   return ret;
 264 }
 265 #ifdef _LIBC
 266 weak_alias (__re_set_syntax, re_set_syntax)
 267 #endif
 268
 269 int
 270 re_compile_fastmap (struct re_pattern_buffer *bufp)
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 __attribute__ ((always_inline))
 292 re_set_fastmap (char *fastmap, bool icase, int ch)
 293 {
 294   fastmap[ch] = 1;
 295   if (icase)
 296     fastmap[tolower (ch)] = 1;
 297 }
 298
 299 /* Helper function for re_compile_fastmap.
 300    Compile fastmap for the initial_state INIT_STATE.  */
 301
 302 static void
 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 304                          char *fastmap)
 305 {
 306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 307   int node_cnt;
 308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 310     {
 311       int node = init_state->nodes.elems[node_cnt];
 312       re_token_type_t type = dfa->nodes[node].type;
 313
 314       if (type == CHARACTER)
 315         {
 316           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317 #ifdef RE_ENABLE_I18N
 318           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 319             {
 320               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 321               wchar_t wc;
 322               mbstate_t state;
 323
 324               p = buf;
 325               *p++ = dfa->nodes[node].opr.c;
 326               while (++node < dfa->nodes_len
 327                      && dfa->nodes[node].type == CHARACTER
 328                      && dfa->nodes[node].mb_partial)
 329                 *p++ = dfa->nodes[node].opr.c;
 330               memset (&state, '\0', sizeof (state));
 331               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 332                              &state) == p - buf
 333                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 334                       != (size_t) -1))
 335                 re_set_fastmap (fastmap, 0, buf[0]);
 336             }
 337 #endif
 338         }
 339       else if (type == SIMPLE_BRACKET)
 340         {
 341           int i, ch;
 342           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 343             {
 344               int j;
 345               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 346               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 347                 if (w & ((bitset_word_t) 1 << j))
 348                   re_set_fastmap (fastmap, icase, ch);
 349             }
 350         }
 351 #ifdef RE_ENABLE_I18N
 352       else if (type == COMPLEX_BRACKET)
 353         {
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           int i;
 356
 357 # ifdef _LIBC
 358           /* See if we have to try all bytes which start multiple collation
 359              elements.
 360              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 361                   collation element, and don't catch 'b' since 'b' is
 362                   the only collation element which starts from 'b' (and
 363                   it is caught by SIMPLE_BRACKET).  */
 364               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 365                   && (cset->ncoll_syms || cset->nranges))
 366                 {
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # endif /* _LIBC */
 374
 375           /* See if we have to start the match at all multibyte characters,
 376              i.e. where we would not find an invalid sequence.  This only
 377              applies to multibyte character sets; for single byte character
 378              sets, the SIMPLE_BRACKET again suffices.  */
 379           if (dfa->mb_cur_max > 1
 380               && (cset->nchar_classes || cset->non_match || cset->nranges
 381 # ifdef _LIBC
 382                   || cset->nequiv_classes
 383 # endif /* _LIBC */
 384                  ))
 385             {
 386               unsigned char c = 0;
 387               do
 388                 {
 389                   mbstate_t mbs;
 390                   memset (&mbs, 0, sizeof (mbs));
 391                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 392                     re_set_fastmap (fastmap, false, (int) c);
 393                 }
 394               while (++c != 0);
 395             }
 396
 397           else
 398             {
 399               /* ... Else catch all bytes which can start the mbchars.  */
 400               for (i = 0; i < cset->nmbchars; ++i)
 401                 {
 402                   char buf[256];
 403                   mbstate_t state;
 404                   memset (&state, '\0', sizeof (state));
 405                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                     {
 409                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 410                           != (size_t) -1)
 411                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 412                     }
 413                 }
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == OP_PERIOD
 418 #ifdef RE_ENABLE_I18N
 419                || type == OP_UTF8_PERIOD
 420 #endif /* RE_ENABLE_I18N */
 421                || type == END_OF_RE)
 422         {
 423           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 424           if (type == END_OF_RE)
 425             bufp->can_be_null = 1;
 426           return;
 427         }
 428     }
 429 }
 430 \f
 431 /* Entry point for POSIX code.  */
 432 /* regcomp takes a regular expression as a string and compiles it.
 433
 434    PREG is a regex_t *.  We do not expect any fields to be initialized,
 435    since POSIX says we shouldn't.  Thus, we set
 436
 437      'buffer' to the compiled pattern;
 438      'used' to the length of the compiled pattern;
 439      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 441        RE_SYNTAX_POSIX_BASIC;
 442      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 443      'fastmap' to an allocated space for the fastmap;
 444      'fastmap_accurate' to zero;
 445      're_nsub' to the number of subexpressions in PATTERN.
 446
 447    PATTERN is the address of the pattern string.
 448
 449    CFLAGS is a series of bits which affect compilation.
 450
 451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 452      use POSIX basic syntax.
 453
 454      If REG_NEWLINE is set, then . and [^...] don't match newline.
 455      Also, regexec will try a match beginning after every newline.
 456
 457      If REG_ICASE is set, then we considers upper- and lowercase
 458      versions of letters to be equivalent when matching.
 459
 460      If REG_NOSUB is set, then when PREG is passed to regexec, that
 461      routine will report only success or failure, and nothing about the
 462      registers.
 463
 464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 465    the return codes and their meanings.)  */
 466
 467 int
 468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 469 {
 470   reg_errcode_t ret;
 471   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 472                          : RE_SYNTAX_POSIX_BASIC);
 473
 474   preg->buffer = NULL;
 475   preg->allocated = 0;
 476   preg->used = 0;
 477
 478   /* Try to allocate space for the fastmap.  */
 479   preg->fastmap = re_malloc (char, SBC_MAX);
 480   if (BE (preg->fastmap == NULL, 0))
 481     return REG_ESPACE;
 482
 483   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 484
 485   /* If REG_NEWLINE is set, newlines are treated differently.  */
 486   if (cflags & REG_NEWLINE)
 487     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 488       syntax &= ~RE_DOT_NEWLINE;
 489       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 490       /* It also changes the matching behavior.  */
 491       preg->newline_anchor = 1;
 492     }
 493   else
 494     preg->newline_anchor = 0;
 495   preg->no_sub = !!(cflags & REG_NOSUB);
 496   preg->translate = NULL;
 497
 498   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 499
 500   /* POSIX doesn't distinguish between an unmatched open-group and an
 501      unmatched close-group: both are REG_EPAREN.  */
 502   if (ret == REG_ERPAREN)
 503     ret = REG_EPAREN;
 504
 505   /* We have already checked preg->fastmap != NULL.  */
 506   if (BE (ret == REG_NOERROR, 1))
 507     /* Compute the fastmap now, since regexec cannot modify the pattern
 508        buffer.  This function never fails in this implementation.  */
 509     (void) re_compile_fastmap (preg);
 510   else
 511     {
 512       /* Some error occurred while compiling the expression.  */
 513       re_free (preg->fastmap);
 514       preg->fastmap = NULL;
 515     }
 516
 517   return (int) ret;
 518 }
 519 #ifdef _LIBC
 520 weak_alias (__regcomp, regcomp)
 521 #endif
 522
 523 /* Returns a message corresponding to an error code, ERRCODE, returned
 524    from either regcomp or regexec.   We don't use PREG here.  */
 525
 526 size_t
 527 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 528           size_t errbuf_size)
 529 {
 530   const char *msg;
 531   size_t msg_size;
 532
 533   if (BE (errcode < 0
 534           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 535                                / sizeof (__re_error_msgid_idx[0])), 0))
 536     /* Only error codes returned by the rest of the code should be passed
 537        to this routine.  If we are given anything else, or if other regex
 538        code generates an invalid error code, then the program has a bug.
 539        Dump core so we can fix it.  */
 540     abort ();
 541
 542   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 543
 544   msg_size = strlen (msg) + 1; /* Includes the null.  */
 545
 546   if (BE (errbuf_size != 0, 1))
 547     {
 548       if (BE (msg_size > errbuf_size, 0))
 549         {
 550 #if defined HAVE_MEMPCPY || defined _LIBC
 551           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 552 #else
 553           memcpy (errbuf, msg, errbuf_size - 1);
 554           errbuf[errbuf_size - 1] = 0;
 555 #endif
 556         }
 557       else
 558         memcpy (errbuf, msg, msg_size);
 559     }
 560
 561   return msg_size;
 562 }
 563 #ifdef _LIBC
 564 weak_alias (__regerror, regerror)
 565 #endif
 566
 567
 568 #ifdef RE_ENABLE_I18N
 569 /* This static array is used for the map to single-byte characters when
 570    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 571    it the same all the time.  UTF-8 is the preferred encoding so this is
 572    a worthwhile optimization.  */
 573 static const bitset_t utf8_sb_map =
 574 {
 575   /* Set the first 128 bits.  */
 576   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 577 };
 578 #endif
 579
 580
 581 static void
 582 free_dfa_content (re_dfa_t *dfa)
 583 {
 584   int i, j;
 585
 586   if (dfa->nodes)
 587     for (i = 0; i < dfa->nodes_len; ++i)
 588       free_token (dfa->nodes + i);
 589   re_free (dfa->nexts);
 590   for (i = 0; i < dfa->nodes_len; ++i)
 591     {
 592       if (dfa->eclosures != NULL)
 593         re_node_set_free (dfa->eclosures + i);
 594       if (dfa->inveclosures != NULL)
 595         re_node_set_free (dfa->inveclosures + i);
 596       if (dfa->edests != NULL)
 597         re_node_set_free (dfa->edests + i);
 598     }
 599   re_free (dfa->edests);
 600   re_free (dfa->eclosures);
 601   re_free (dfa->inveclosures);
 602   re_free (dfa->nodes);
 603
 604   if (dfa->state_table)
 605     for (i = 0; i <= dfa->state_hash_mask; ++i)
 606       {
 607         struct re_state_table_entry *entry = dfa->state_table + i;
 608         for (j = 0; j < entry->num; ++j)
 609           {
 610             re_dfastate_t *state = entry->array[j];
 611             free_state (state);
 612           }
 613         re_free (entry->array);
 614       }
 615   re_free (dfa->state_table);
 616 #ifdef RE_ENABLE_I18N
 617   if (dfa->sb_char != utf8_sb_map)
 618     re_free (dfa->sb_char);
 619 #endif
 620   re_free (dfa->subexp_map);
 621 #ifdef DEBUG
 622   re_free (dfa->re_str);
 623 #endif
 624
 625   re_free (dfa);
 626 }
 627
 628
 629 /* Free dynamically allocated space used by PREG.  */
 630
 631 void
 632 regfree (regex_t *preg)
 633 {
 634   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 635   if (BE (dfa != NULL, 1))
 636     free_dfa_content (dfa);
 637   preg->buffer = NULL;
 638   preg->allocated = 0;
 639
 640   re_free (preg->fastmap);
 641   preg->fastmap = NULL;
 642
 643   re_free (preg->translate);
 644   preg->translate = NULL;
 645 }
 646 #ifdef _LIBC
 647 weak_alias (__regfree, regfree)
 648 #endif
 649 \f
 650 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 651    them unless specifically requested.  */
 652
 653 #if defined _REGEX_RE_COMP || defined _LIBC
 654
 655 /* BSD has one and only one pattern buffer.  */
 656 static struct re_pattern_buffer re_comp_buf;
 657
 658 char *
 659 # ifdef _LIBC
 660 /* Make these definitions weak in libc, so POSIX programs can redefine
 661    these names if they don't use our functions, and still use
 662    regcomp/regexec above without link errors.  */
 663 weak_function
 664 # endif
 665 re_comp (const char *s)
 666 {
 667   reg_errcode_t ret;
 668   char *fastmap;
 669
 670   if (!s)
 671     {
 672       if (!re_comp_buf.buffer)
 673         return gettext ("No previous regular expression");
 674       return 0;
 675     }
 676
 677   if (re_comp_buf.buffer)
 678     {
 679       fastmap = re_comp_buf.fastmap;
 680       re_comp_buf.fastmap = NULL;
 681       __regfree (&re_comp_buf);
 682       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 683       re_comp_buf.fastmap = fastmap;
 684     }
 685
 686   if (re_comp_buf.fastmap == NULL)
 687     {
 688       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 689       if (re_comp_buf.fastmap == NULL)
 690         return (char *) gettext (__re_error_msgid
 691                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 692     }
 693
 694   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 695      don't need to initialize the pattern buffer fields which affect it.  */
 696
 697   /* Match anchors at newlines.  */
 698   re_comp_buf.newline_anchor = 1;
 699
 700   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 701
 702   if (!ret)
 703     return NULL;
 704
 705   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 706   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 707 }
 708
 709 #ifdef _LIBC
 710 libc_freeres_fn (free_mem)
 711 {
 712   __regfree (&re_comp_buf);
 713 }
 714 #endif
 715
 716 #endif /* _REGEX_RE_COMP */
 717 \f
 718 /* Internal entry point.
 719    Compile the regular expression PATTERN, whose length is LENGTH.
 720    SYNTAX indicate regular expression's syntax.  */
 721
 722 static reg_errcode_t
 723 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 724                      reg_syntax_t syntax)
 725 {
 726   reg_errcode_t err = REG_NOERROR;
 727   re_dfa_t *dfa;
 728   re_string_t regexp;
 729
 730   /* Initialize the pattern buffer.  */
 731   preg->fastmap_accurate = 0;
 732   preg->syntax = syntax;
 733   preg->not_bol = preg->not_eol = 0;
 734   preg->used = 0;
 735   preg->re_nsub = 0;
 736   preg->can_be_null = 0;
 737   preg->regs_allocated = REGS_UNALLOCATED;
 738
 739   /* Initialize the dfa.  */
 740   dfa = (re_dfa_t *) preg->buffer;
 741   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 742     {
 743       /* If zero allocated, but buffer is non-null, try to realloc
 744          enough space.  This loses if buffer's address is bogus, but
 745          that is the user's responsibility.  If ->buffer is NULL this
 746          is a simple allocation.  */
 747       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 748       if (dfa == NULL)
 749         return REG_ESPACE;
 750       preg->allocated = sizeof (re_dfa_t);
 751       preg->buffer = (unsigned char *) dfa;
 752     }
 753   preg->used = sizeof (re_dfa_t);
 754
 755   err = init_dfa (dfa, length);
 756   if (BE (err != REG_NOERROR, 0))
 757     {
 758       free_dfa_content (dfa);
 759       preg->buffer = NULL;
 760       preg->allocated = 0;
 761       return err;
 762     }
 763 #ifdef DEBUG
 764   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 765   dfa->re_str = re_malloc (char, length + 1);
 766   strncpy (dfa->re_str, pattern, length + 1);
 767 #endif
 768
 769   __libc_lock_init (dfa->lock);
 770
 771   err = re_string_construct (&regexp, pattern, length, preg->translate,
 772                              syntax & RE_ICASE, dfa);
 773   if (BE (err != REG_NOERROR, 0))
 774     {
 775     re_compile_internal_free_return:
 776       free_workarea_compile (preg);
 777       re_string_destruct (&regexp);
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783
 784   /* Parse the regular expression, and build a structure tree.  */
 785   preg->re_nsub = 0;
 786   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 787   if (BE (dfa->str_tree == NULL, 0))
 788     goto re_compile_internal_free_return;
 789
 790   /* Analyze the tree and create the nfa.  */
 791   err = analyze (preg);
 792   if (BE (err != REG_NOERROR, 0))
 793     goto re_compile_internal_free_return;
 794
 795 #ifdef RE_ENABLE_I18N
 796   /* If possible, do searching in single byte encoding to speed things up.  */
 797   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 798     optimize_utf8 (dfa);
 799 #endif
 800
 801   /* Then create the initial state of the dfa.  */
 802   err = create_initial_state (dfa);
 803
 804   /* Release work areas.  */
 805   free_workarea_compile (preg);
 806   re_string_destruct (&regexp);
 807
 808   if (BE (err != REG_NOERROR, 0))
 809     {
 810       free_dfa_content (dfa);
 811       preg->buffer = NULL;
 812       preg->allocated = 0;
 813     }
 814
 815   return err;
 816 }
 817
 818 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 819    as the initial length of some arrays.  */
 820
 821 static reg_errcode_t
 822 init_dfa (re_dfa_t *dfa, size_t pat_len)
 823 {
 824   unsigned int table_size;
 825 #ifndef _LIBC
 826   char *codeset_name;
 827 #endif
 828
 829   memset (dfa, '\0', sizeof (re_dfa_t));
 830
 831   /* Force allocation of str_tree_storage the first time.  */
 832   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 833
 834   /* Avoid overflows.  */
 835   if (pat_len == SIZE_MAX)
 836     return REG_ESPACE;
 837
 838   dfa->nodes_alloc = pat_len + 1;
 839   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 840
 841   /*  table_size = 2 ^ ceil(log pat_len) */
 842   for (table_size = 1; ; table_size <<= 1)
 843     if (table_size > pat_len)
 844       break;
 845
 846   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 847   dfa->state_hash_mask = table_size - 1;
 848
 849   dfa->mb_cur_max = MB_CUR_MAX;
 850 #ifdef _LIBC
 851   if (dfa->mb_cur_max == 6
 852       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 853     dfa->is_utf8 = 1;
 854   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 855                        != 0);
 856 #else
 857 # ifdef HAVE_LANGINFO_CODESET
 858   codeset_name = nl_langinfo (CODESET);
 859 # else
 860   codeset_name = getenv ("LC_ALL");
 861   if (codeset_name == NULL || codeset_name[0] == '\0')
 862     codeset_name = getenv ("LC_CTYPE");
 863   if (codeset_name == NULL || codeset_name[0] == '\0')
 864     codeset_name = getenv ("LANG");
 865   if (codeset_name == NULL)
 866     codeset_name = "";
 867   else if (strchr (codeset_name, '.') !=  NULL)
 868     codeset_name = strchr (codeset_name, '.') + 1;
 869 # endif
 870
 871   if (strcasecmp (codeset_name, "UTF-8") == 0
 872       || strcasecmp (codeset_name, "UTF8") == 0)
 873     dfa->is_utf8 = 1;
 874
 875   /* We check exhaustively in the loop below if this charset is a
 876      superset of ASCII.  */
 877   dfa->map_notascii = 0;
 878 #endif
 879
 880 #ifdef RE_ENABLE_I18N
 881   if (dfa->mb_cur_max > 1)
 882     {
 883       if (dfa->is_utf8)
 884         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 885       else
 886         {
 887           int i, j, ch;
 888
 889           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 890           if (BE (dfa->sb_char == NULL, 0))
 891             return REG_ESPACE;
 892
 893           /* Set the bits corresponding to single byte chars.  */
 894           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 895             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 896               {
 897                 wint_t wch = __btowc (ch);
 898                 if (wch != WEOF)
 899                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 900 # ifndef _LIBC
 901                 if (isascii (ch) && wch != ch)
 902                   dfa->map_notascii = 1;
 903 # endif
 904               }
 905         }
 906     }
 907 #endif
 908
 909   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 910     return REG_ESPACE;
 911   return REG_NOERROR;
 912 }
 913
 914 /* Initialize WORD_CHAR table, which indicate which character is
 915    "word".  In this case "word" means that it is the word construction
 916    character used by some operators like "\<", "\>", etc.  */
 917
 918 static void
 919 init_word_char (re_dfa_t *dfa)
 920 {
 921   dfa->word_ops_used = 1;
 922   int i = 0;
 923   int ch = 0;
 924   if (BE (dfa->map_notascii == 0, 1))
 925     {
 926       if (sizeof (dfa->word_char[0]) == 8)
 927         {
 928           /* The extra temporaries here avoid "implicitly truncated"
 929              warnings in the case when this is dead code, i.e. 32-bit.  */
 930           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
 931           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
 932           dfa->word_char[0] = wc0;
 933           dfa->word_char[1] = wc1;
 934           i = 2;
 935         }
 936       else if (sizeof (dfa->word_char[0]) == 4)
 937         {
 938           dfa->word_char[0] = UINT32_C (0x00000000);
 939           dfa->word_char[1] = UINT32_C (0x03ff0000);
 940           dfa->word_char[2] = UINT32_C (0x87fffffe);
 941           dfa->word_char[3] = UINT32_C (0x07fffffe);
 942           i = 4;
 943         }
 944       else
 945         abort ();
 946       ch = 128;
 947
 948       if (BE (dfa->is_utf8, 1))
 949         {
 950           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 951           return;
 952         }
 953     }
 954
 955   for (; i < BITSET_WORDS; ++i)
 956     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 957       if (isalnum (ch) || ch == '_')
 958         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 959 }
 960
 961 /* Free the work area which are only used while compiling.  */
 962
 963 static void
 964 free_workarea_compile (regex_t *preg)
 965 {
 966   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 967   bin_tree_storage_t *storage, *next;
 968   for (storage = dfa->str_tree_storage; storage; storage = next)
 969     {
 970       next = storage->next;
 971       re_free (storage);
 972     }
 973   dfa->str_tree_storage = NULL;
 974   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 975   dfa->str_tree = NULL;
 976   re_free (dfa->org_indices);
 977   dfa->org_indices = NULL;
 978 }
 979
 980 /* Create initial states for all contexts.  */
 981
 982 static reg_errcode_t
 983 create_initial_state (re_dfa_t *dfa)
 984 {
 985   int first, i;
 986   reg_errcode_t err;
 987   re_node_set init_nodes;
 988
 989   /* Initial states have the epsilon closure of the node which is
 990      the first node of the regular expression.  */
 991   first = dfa->str_tree->first->node_idx;
 992   dfa->init_node = first;
 993   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 994   if (BE (err != REG_NOERROR, 0))
 995     return err;
 996
 997   /* The back-references which are in initial states can epsilon transit,
 998      since in this case all of the subexpressions can be null.
 999      Then we add epsilon closures of the nodes which are the next nodes of
1000      the back-references.  */
1001   if (dfa->nbackref > 0)
1002     for (i = 0; i < init_nodes.nelem; ++i)
1003       {
1004         int node_idx = init_nodes.elems[i];
1005         re_token_type_t type = dfa->nodes[node_idx].type;
1006
1007         int clexp_idx;
1008         if (type != OP_BACK_REF)
1009           continue;
1010         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011           {
1012             re_token_t *clexp_node;
1013             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014             if (clexp_node->type == OP_CLOSE_SUBEXP
1015                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016               break;
1017           }
1018         if (clexp_idx == init_nodes.nelem)
1019           continue;
1020
1021         if (type == OP_BACK_REF)
1022           {
1023             int dest_idx = dfa->edests[node_idx].elems[0];
1024             if (!re_node_set_contains (&init_nodes, dest_idx))
1025               {
1026                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1027                                                        dfa->eclosures
1028                                                        + dest_idx);
1029                 if (err != REG_NOERROR)
1030                   return err;
1031                 i = 0;
1032               }
1033           }
1034       }
1035
1036   /* It must be the first time to invoke acquire_state.  */
1037   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1038   /* We don't check ERR here, since the initial state must not be NULL.  */
1039   if (BE (dfa->init_state == NULL, 0))
1040     return err;
1041   if (dfa->init_state->has_constraint)
1042     {
1043       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1044                                                        CONTEXT_WORD);
1045       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1046                                                      CONTEXT_NEWLINE);
1047       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048                                                          &init_nodes,
1049                                                          CONTEXT_NEWLINE
1050                                                          | CONTEXT_BEGBUF);
1051       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1052               || dfa->init_state_begbuf == NULL, 0))
1053         return err;
1054     }
1055   else
1056     dfa->init_state_word = dfa->init_state_nl
1057       = dfa->init_state_begbuf = dfa->init_state;
1058
1059   re_node_set_free (&init_nodes);
1060   return REG_NOERROR;
1061 }
1062 \f
1063 #ifdef RE_ENABLE_I18N
1064 /* If it is possible to do searching in single byte encoding instead of UTF-8
1065    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1066    DFA nodes where needed.  */
1067
1068 static void
1069 optimize_utf8 (re_dfa_t *dfa)
1070 {
1071   int node, i, mb_chars = 0, has_period = 0;
1072
1073   for (node = 0; node < dfa->nodes_len; ++node)
1074     switch (dfa->nodes[node].type)
1075       {
1076       case CHARACTER:
1077         if (dfa->nodes[node].opr.c >= 0x80)
1078           mb_chars = 1;
1079         break;
1080       case ANCHOR:
1081         switch (dfa->nodes[node].opr.ctx_type)
1082           {
1083           case LINE_FIRST:
1084           case LINE_LAST:
1085           case BUF_FIRST:
1086           case BUF_LAST:
1087             break;
1088           default:
1089             /* Word anchors etc. cannot be handled.  It's okay to test
1090                opr.ctx_type since constraints (for all DFA nodes) are
1091                created by ORing one or more opr.ctx_type values.  */
1092             return;
1093           }
1094         break;
1095       case OP_PERIOD:
1096         has_period = 1;
1097         break;
1098       case OP_BACK_REF:
1099       case OP_ALT:
1100       case END_OF_RE:
1101       case OP_DUP_ASTERISK:
1102       case OP_OPEN_SUBEXP:
1103       case OP_CLOSE_SUBEXP:
1104         break;
1105       case COMPLEX_BRACKET:
1106         return;
1107       case SIMPLE_BRACKET:
1108         /* Just double check.  The non-ASCII range starts at 0x80.  */
1109         assert (0x80 % BITSET_WORD_BITS == 0);
1110         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1111           if (dfa->nodes[node].opr.sbcset[i])
1112             return;
1113         break;
1114       default:
1115         abort ();
1116       }
1117
1118   if (mb_chars || has_period)
1119     for (node = 0; node < dfa->nodes_len; ++node)
1120       {
1121         if (dfa->nodes[node].type == CHARACTER
1122             && dfa->nodes[node].opr.c >= 0x80)
1123           dfa->nodes[node].mb_partial = 0;
1124         else if (dfa->nodes[node].type == OP_PERIOD)
1125           dfa->nodes[node].type = OP_UTF8_PERIOD;
1126       }
1127
1128   /* The search can be in single byte locale.  */
1129   dfa->mb_cur_max = 1;
1130   dfa->is_utf8 = 0;
1131   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1132 }
1133 #endif
1134 \f
1135 /* Analyze the structure tree, and calculate "first", "next", "edest",
1136    "eclosure", and "inveclosure".  */
1137
1138 static reg_errcode_t
1139 analyze (regex_t *preg)
1140 {
1141   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1142   reg_errcode_t ret;
1143
1144   /* Allocate arrays.  */
1145   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1146   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1147   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1148   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1149   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1150           || dfa->eclosures == NULL, 0))
1151     return REG_ESPACE;
1152
1153   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1154   if (dfa->subexp_map != NULL)
1155     {
1156       int i;
1157       for (i = 0; i < preg->re_nsub; i++)
1158         dfa->subexp_map[i] = i;
1159       preorder (dfa->str_tree, optimize_subexps, dfa);
1160       for (i = 0; i < preg->re_nsub; i++)
1161         if (dfa->subexp_map[i] != i)
1162           break;
1163       if (i == preg->re_nsub)
1164         {
1165           free (dfa->subexp_map);
1166           dfa->subexp_map = NULL;
1167         }
1168     }
1169
1170   ret = postorder (dfa->str_tree, lower_subexps, preg);
1171   if (BE (ret != REG_NOERROR, 0))
1172     return ret;
1173   ret = postorder (dfa->str_tree, calc_first, dfa);
1174   if (BE (ret != REG_NOERROR, 0))
1175     return ret;
1176   preorder (dfa->str_tree, calc_next, dfa);
1177   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1178   if (BE (ret != REG_NOERROR, 0))
1179     return ret;
1180   ret = calc_eclosure (dfa);
1181   if (BE (ret != REG_NOERROR, 0))
1182     return ret;
1183
1184   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1185      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1186   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1187       || dfa->nbackref)
1188     {
1189       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1190       if (BE (dfa->inveclosures == NULL, 0))
1191         return REG_ESPACE;
1192       ret = calc_inveclosure (dfa);
1193     }
1194
1195   return ret;
1196 }
1197
1198 /* Our parse trees are very unbalanced, so we cannot use a stack to
1199    implement parse tree visits.  Instead, we use parent pointers and
1200    some hairy code in these two functions.  */
1201 static reg_errcode_t
1202 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1203            void *extra)
1204 {
1205   bin_tree_t *node, *prev;
1206
1207   for (node = root; ; )
1208     {
1209       /* Descend down the tree, preferably to the left (or to the right
1210          if that's the only child).  */
1211       while (node->left || node->right)
1212         if (node->left)
1213           node = node->left;
1214         else
1215           node = node->right;
1216
1217       do
1218         {
1219           reg_errcode_t err = fn (extra, node);
1220           if (BE (err != REG_NOERROR, 0))
1221             return err;
1222           if (node->parent == NULL)
1223             return REG_NOERROR;
1224           prev = node;
1225           node = node->parent;
1226         }
1227       /* Go up while we have a node that is reached from the right.  */
1228       while (node->right == prev || node->right == NULL);
1229       node = node->right;
1230     }
1231 }
1232
1233 static reg_errcode_t
1234 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1235           void *extra)
1236 {
1237   bin_tree_t *node;
1238
1239   for (node = root; ; )
1240     {
1241       reg_errcode_t err = fn (extra, node);
1242       if (BE (err != REG_NOERROR, 0))
1243         return err;
1244
1245       /* Go to the left node, or up and to the right.  */
1246       if (node->left)
1247         node = node->left;
1248       else
1249         {
1250           bin_tree_t *prev = NULL;
1251           while (node->right == prev || node->right == NULL)
1252             {
1253               prev = node;
1254               node = node->parent;
1255               if (!node)
1256                 return REG_NOERROR;
1257             }
1258           node = node->right;
1259         }
1260     }
1261 }
1262
1263 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1264    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1265    backreferences as well.  Requires a preorder visit.  */
1266 static reg_errcode_t
1267 optimize_subexps (void *extra, bin_tree_t *node)
1268 {
1269   re_dfa_t *dfa = (re_dfa_t *) extra;
1270
1271   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1272     {
1273       int idx = node->token.opr.idx;
1274       node->token.opr.idx = dfa->subexp_map[idx];
1275       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1276     }
1277
1278   else if (node->token.type == SUBEXP
1279            && node->left && node->left->token.type == SUBEXP)
1280     {
1281       int other_idx = node->left->token.opr.idx;
1282
1283       node->left = node->left->left;
1284       if (node->left)
1285         node->left->parent = node;
1286
1287       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1288       if (other_idx < BITSET_WORD_BITS)
1289           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1290     }
1291
1292   return REG_NOERROR;
1293 }
1294
1295 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1296    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1297 static reg_errcode_t
1298 lower_subexps (void *extra, bin_tree_t *node)
1299 {
1300   regex_t *preg = (regex_t *) extra;
1301   reg_errcode_t err = REG_NOERROR;
1302
1303   if (node->left && node->left->token.type == SUBEXP)
1304     {
1305       node->left = lower_subexp (&err, preg, node->left);
1306       if (node->left)
1307         node->left->parent = node;
1308     }
1309   if (node->right && node->right->token.type == SUBEXP)
1310     {
1311       node->right = lower_subexp (&err, preg, node->right);
1312       if (node->right)
1313         node->right->parent = node;
1314     }
1315
1316   return err;
1317 }
1318
1319 static bin_tree_t *
1320 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1321 {
1322   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323   bin_tree_t *body = node->left;
1324   bin_tree_t *op, *cls, *tree1, *tree;
1325
1326   if (preg->no_sub
1327       /* We do not optimize empty subexpressions, because otherwise we may
1328          have bad CONCAT nodes with NULL children.  This is obviously not
1329          very common, so we do not lose much.  An example that triggers
1330          this case is the sed "script" /\(\)/x.  */
1331       && node->left != NULL
1332       && (node->token.opr.idx >= BITSET_WORD_BITS
1333           || !(dfa->used_bkref_map
1334                & ((bitset_word_t) 1 << node->token.opr.idx))))
1335     return node->left;
1336
1337   /* Convert the SUBEXP node to the concatenation of an
1338      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1339   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1340   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1341   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1342   tree = create_tree (dfa, op, tree1, CONCAT);
1343   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1344     {
1345       *err = REG_ESPACE;
1346       return NULL;
1347     }
1348
1349   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1350   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1351   return tree;
1352 }
1353
1354 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1355    nodes.  Requires a postorder visit.  */
1356 static reg_errcode_t
1357 calc_first (void *extra, bin_tree_t *node)
1358 {
1359   re_dfa_t *dfa = (re_dfa_t *) extra;
1360   if (node->token.type == CONCAT)
1361     {
1362       node->first = node->left->first;
1363       node->node_idx = node->left->node_idx;
1364     }
1365   else
1366     {
1367       node->first = node;
1368       node->node_idx = re_dfa_add_node (dfa, node->token);
1369       if (BE (node->node_idx == -1, 0))
1370         return REG_ESPACE;
1371       if (node->token.type == ANCHOR)
1372         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1373     }
1374   return REG_NOERROR;
1375 }
1376
1377 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1378 static reg_errcode_t
1379 calc_next (void *extra, bin_tree_t *node)
1380 {
1381   switch (node->token.type)
1382     {
1383     case OP_DUP_ASTERISK:
1384       node->left->next = node;
1385       break;
1386     case CONCAT:
1387       node->left->next = node->right->first;
1388       node->right->next = node->next;
1389       break;
1390     default:
1391       if (node->left)
1392         node->left->next = node->next;
1393       if (node->right)
1394         node->right->next = node->next;
1395       break;
1396     }
1397   return REG_NOERROR;
1398 }
1399
1400 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1401 static reg_errcode_t
1402 link_nfa_nodes (void *extra, bin_tree_t *node)
1403 {
1404   re_dfa_t *dfa = (re_dfa_t *) extra;
1405   int idx = node->node_idx;
1406   reg_errcode_t err = REG_NOERROR;
1407
1408   switch (node->token.type)
1409     {
1410     case CONCAT:
1411       break;
1412
1413     case END_OF_RE:
1414       assert (node->next == NULL);
1415       break;
1416
1417     case OP_DUP_ASTERISK:
1418     case OP_ALT:
1419       {
1420         int left, right;
1421         dfa->has_plural_match = 1;
1422         if (node->left != NULL)
1423           left = node->left->first->node_idx;
1424         else
1425           left = node->next->node_idx;
1426         if (node->right != NULL)
1427           right = node->right->first->node_idx;
1428         else
1429           right = node->next->node_idx;
1430         assert (left > -1);
1431         assert (right > -1);
1432         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1433       }
1434       break;
1435
1436     case ANCHOR:
1437     case OP_OPEN_SUBEXP:
1438     case OP_CLOSE_SUBEXP:
1439       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1440       break;
1441
1442     case OP_BACK_REF:
1443       dfa->nexts[idx] = node->next->node_idx;
1444       if (node->token.type == OP_BACK_REF)
1445         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1446       break;
1447
1448     default:
1449       assert (!IS_EPSILON_NODE (node->token.type));
1450       dfa->nexts[idx] = node->next->node_idx;
1451       break;
1452     }
1453
1454   return err;
1455 }
1456
1457 /* Duplicate the epsilon closure of the node ROOT_NODE.
1458    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1459    to their own constraint.  */
1460
1461 static reg_errcode_t
1462 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1463                         int root_node, unsigned int init_constraint)
1464 {
1465   int org_node, clone_node, ret;
1466   unsigned int constraint = init_constraint;
1467   for (org_node = top_org_node, clone_node = top_clone_node;;)
1468     {
1469       int org_dest, clone_dest;
1470       if (dfa->nodes[org_node].type == OP_BACK_REF)
1471         {
1472           /* If the back reference epsilon-transit, its destination must
1473              also have the constraint.  Then duplicate the epsilon closure
1474              of the destination of the back reference, and store it in
1475              edests of the back reference.  */
1476           org_dest = dfa->nexts[org_node];
1477           re_node_set_empty (dfa->edests + clone_node);
1478           clone_dest = duplicate_node (dfa, org_dest, constraint);
1479           if (BE (clone_dest == -1, 0))
1480             return REG_ESPACE;
1481           dfa->nexts[clone_node] = dfa->nexts[org_node];
1482           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1483           if (BE (ret < 0, 0))
1484             return REG_ESPACE;
1485         }
1486       else if (dfa->edests[org_node].nelem == 0)
1487         {
1488           /* In case of the node can't epsilon-transit, don't duplicate the
1489              destination and store the original destination as the
1490              destination of the node.  */
1491           dfa->nexts[clone_node] = dfa->nexts[org_node];
1492           break;
1493         }
1494       else if (dfa->edests[org_node].nelem == 1)
1495         {
1496           /* In case of the node can epsilon-transit, and it has only one
1497              destination.  */
1498           org_dest = dfa->edests[org_node].elems[0];
1499           re_node_set_empty (dfa->edests + clone_node);
1500           /* If the node is root_node itself, it means the epsilon closure
1501              has a loop.   Then tie it to the destination of the root_node.  */
1502           if (org_node == root_node && clone_node != org_node)
1503             {
1504               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1505               if (BE (ret < 0, 0))
1506                 return REG_ESPACE;
1507               break;
1508             }
1509           /* In case the node has another constraint, append it.  */
1510           constraint |= dfa->nodes[org_node].constraint;
1511           clone_dest = duplicate_node (dfa, org_dest, constraint);
1512           if (BE (clone_dest == -1, 0))
1513             return REG_ESPACE;
1514           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1515           if (BE (ret < 0, 0))
1516             return REG_ESPACE;
1517         }
1518       else /* dfa->edests[org_node].nelem == 2 */
1519         {
1520           /* In case of the node can epsilon-transit, and it has two
1521              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1522           org_dest = dfa->edests[org_node].elems[0];
1523           re_node_set_empty (dfa->edests + clone_node);
1524           /* Search for a duplicated node which satisfies the constraint.  */
1525           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1526           if (clone_dest == -1)
1527             {
1528               /* There is no such duplicated node, create a new one.  */
1529               reg_errcode_t err;
1530               clone_dest = duplicate_node (dfa, org_dest, constraint);
1531               if (BE (clone_dest == -1, 0))
1532                 return REG_ESPACE;
1533               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534               if (BE (ret < 0, 0))
1535                 return REG_ESPACE;
1536               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1537                                             root_node, constraint);
1538               if (BE (err != REG_NOERROR, 0))
1539                 return err;
1540             }
1541           else
1542             {
1543               /* There is a duplicated node which satisfies the constraint,
1544                  use it to avoid infinite loop.  */
1545               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546               if (BE (ret < 0, 0))
1547                 return REG_ESPACE;
1548             }
1549
1550           org_dest = dfa->edests[org_node].elems[1];
1551           clone_dest = duplicate_node (dfa, org_dest, constraint);
1552           if (BE (clone_dest == -1, 0))
1553             return REG_ESPACE;
1554           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555           if (BE (ret < 0, 0))
1556             return REG_ESPACE;
1557         }
1558       org_node = org_dest;
1559       clone_node = clone_dest;
1560     }
1561   return REG_NOERROR;
1562 }
1563
1564 /* Search for a node which is duplicated from the node ORG_NODE, and
1565    satisfies the constraint CONSTRAINT.  */
1566
1567 static int
1568 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1569                         unsigned int constraint)
1570 {
1571   int idx;
1572   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1573     {
1574       if (org_node == dfa->org_indices[idx]
1575           && constraint == dfa->nodes[idx].constraint)
1576         return idx; /* Found.  */
1577     }
1578   return -1; /* Not found.  */
1579 }
1580
1581 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1582    Return the index of the new node, or -1 if insufficient storage is
1583    available.  */
1584
1585 static int
1586 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1587 {
1588   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1589   if (BE (dup_idx != -1, 1))
1590     {
1591       dfa->nodes[dup_idx].constraint = constraint;
1592       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1593       dfa->nodes[dup_idx].duplicated = 1;
1594
1595       /* Store the index of the original node.  */
1596       dfa->org_indices[dup_idx] = org_idx;
1597     }
1598   return dup_idx;
1599 }
1600
1601 static reg_errcode_t
1602 calc_inveclosure (re_dfa_t *dfa)
1603 {
1604   int src, idx, ret;
1605   for (idx = 0; idx < dfa->nodes_len; ++idx)
1606     re_node_set_init_empty (dfa->inveclosures + idx);
1607
1608   for (src = 0; src < dfa->nodes_len; ++src)
1609     {
1610       int *elems = dfa->eclosures[src].elems;
1611       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1612         {
1613           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1614           if (BE (ret == -1, 0))
1615             return REG_ESPACE;
1616         }
1617     }
1618
1619   return REG_NOERROR;
1620 }
1621
1622 /* Calculate "eclosure" for all the node in DFA.  */
1623
1624 static reg_errcode_t
1625 calc_eclosure (re_dfa_t *dfa)
1626 {
1627   int node_idx, incomplete;
1628 #ifdef DEBUG
1629   assert (dfa->nodes_len > 0);
1630 #endif
1631   incomplete = 0;
1632   /* For each nodes, calculate epsilon closure.  */
1633   for (node_idx = 0; ; ++node_idx)
1634     {
1635       reg_errcode_t err;
1636       re_node_set eclosure_elem;
1637       if (node_idx == dfa->nodes_len)
1638         {
1639           if (!incomplete)
1640             break;
1641           incomplete = 0;
1642           node_idx = 0;
1643         }
1644
1645 #ifdef DEBUG
1646       assert (dfa->eclosures[node_idx].nelem != -1);
1647 #endif
1648
1649       /* If we have already calculated, skip it.  */
1650       if (dfa->eclosures[node_idx].nelem != 0)
1651         continue;
1652       /* Calculate epsilon closure of 'node_idx'.  */
1653       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1654       if (BE (err != REG_NOERROR, 0))
1655         return err;
1656
1657       if (dfa->eclosures[node_idx].nelem == 0)
1658         {
1659           incomplete = 1;
1660           re_node_set_free (&eclosure_elem);
1661         }
1662     }
1663   return REG_NOERROR;
1664 }
1665
1666 /* Calculate epsilon closure of NODE.  */
1667
1668 static reg_errcode_t
1669 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1670 {
1671   reg_errcode_t err;
1672   int i;
1673   re_node_set eclosure;
1674   int ret;
1675   int incomplete = 0;
1676   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1677   if (BE (err != REG_NOERROR, 0))
1678     return err;
1679
1680   /* This indicates that we are calculating this node now.
1681      We reference this value to avoid infinite loop.  */
1682   dfa->eclosures[node].nelem = -1;
1683
1684   /* If the current node has constraints, duplicate all nodes
1685      since they must inherit the constraints.  */
1686   if (dfa->nodes[node].constraint
1687       && dfa->edests[node].nelem
1688       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1689     {
1690       err = duplicate_node_closure (dfa, node, node, node,
1691                                     dfa->nodes[node].constraint);
1692       if (BE (err != REG_NOERROR, 0))
1693         return err;
1694     }
1695
1696   /* Expand each epsilon destination nodes.  */
1697   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1698     for (i = 0; i < dfa->edests[node].nelem; ++i)
1699       {
1700         re_node_set eclosure_elem;
1701         int edest = dfa->edests[node].elems[i];
1702         /* If calculating the epsilon closure of `edest' is in progress,
1703            return intermediate result.  */
1704         if (dfa->eclosures[edest].nelem == -1)
1705           {
1706             incomplete = 1;
1707             continue;
1708           }
1709         /* If we haven't calculated the epsilon closure of `edest' yet,
1710            calculate now. Otherwise use calculated epsilon closure.  */
1711         if (dfa->eclosures[edest].nelem == 0)
1712           {
1713             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1714             if (BE (err != REG_NOERROR, 0))
1715               return err;
1716           }
1717         else
1718           eclosure_elem = dfa->eclosures[edest];
1719         /* Merge the epsilon closure of 'edest'.  */
1720         err = re_node_set_merge (&eclosure, &eclosure_elem);
1721         if (BE (err != REG_NOERROR, 0))
1722           return err;
1723         /* If the epsilon closure of 'edest' is incomplete,
1724            the epsilon closure of this node is also incomplete.  */
1725         if (dfa->eclosures[edest].nelem == 0)
1726           {
1727             incomplete = 1;
1728             re_node_set_free (&eclosure_elem);
1729           }
1730       }
1731
1732   /* An epsilon closure includes itself.  */
1733   ret = re_node_set_insert (&eclosure, node);
1734   if (BE (ret < 0, 0))
1735     return REG_ESPACE;
1736   if (incomplete && !root)
1737     dfa->eclosures[node].nelem = 0;
1738   else
1739     dfa->eclosures[node] = eclosure;
1740   *new_set = eclosure;
1741   return REG_NOERROR;
1742 }
1743 \f
1744 /* Functions for token which are used in the parser.  */
1745
1746 /* Fetch a token from INPUT.
1747    We must not use this function inside bracket expressions.  */
1748
1749 static void
1750 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1751 {
1752   re_string_skip_bytes (input, peek_token (result, input, syntax));
1753 }
1754
1755 /* Peek a token from INPUT, and return the length of the token.
1756    We must not use this function inside bracket expressions.  */
1757
1758 static int
1759 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1760 {
1761   unsigned char c;
1762
1763   if (re_string_eoi (input))
1764     {
1765       token->type = END_OF_RE;
1766       return 0;
1767     }
1768
1769   c = re_string_peek_byte (input, 0);
1770   token->opr.c = c;
1771
1772   token->word_char = 0;
1773 #ifdef RE_ENABLE_I18N
1774   token->mb_partial = 0;
1775   if (input->mb_cur_max > 1 &&
1776       !re_string_first_byte (input, re_string_cur_idx (input)))
1777     {
1778       token->type = CHARACTER;
1779       token->mb_partial = 1;
1780       return 1;
1781     }
1782 #endif
1783   if (c == '\\')
1784     {
1785       unsigned char c2;
1786       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1787         {
1788           token->type = BACK_SLASH;
1789           return 1;
1790         }
1791
1792       c2 = re_string_peek_byte_case (input, 1);
1793       token->opr.c = c2;
1794       token->type = CHARACTER;
1795 #ifdef RE_ENABLE_I18N
1796       if (input->mb_cur_max > 1)
1797         {
1798           wint_t wc = re_string_wchar_at (input,
1799                                           re_string_cur_idx (input) + 1);
1800           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1801         }
1802       else
1803 #endif
1804         token->word_char = IS_WORD_CHAR (c2) != 0;
1805
1806       switch (c2)
1807         {
1808         case '|':
1809           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1810             token->type = OP_ALT;
1811           break;
1812         case '1': case '2': case '3': case '4': case '5':
1813         case '6': case '7': case '8': case '9':
1814           if (!(syntax & RE_NO_BK_REFS))
1815             {
1816               token->type = OP_BACK_REF;
1817               token->opr.idx = c2 - '1';
1818             }
1819           break;
1820         case '<':
1821           if (!(syntax & RE_NO_GNU_OPS))
1822             {
1823               token->type = ANCHOR;
1824               token->opr.ctx_type = WORD_FIRST;
1825             }
1826           break;
1827         case '>':
1828           if (!(syntax & RE_NO_GNU_OPS))
1829             {
1830               token->type = ANCHOR;
1831               token->opr.ctx_type = WORD_LAST;
1832             }
1833           break;
1834         case 'b':
1835           if (!(syntax & RE_NO_GNU_OPS))
1836             {
1837               token->type = ANCHOR;
1838               token->opr.ctx_type = WORD_DELIM;
1839             }
1840           break;
1841         case 'B':
1842           if (!(syntax & RE_NO_GNU_OPS))
1843             {
1844               token->type = ANCHOR;
1845               token->opr.ctx_type = NOT_WORD_DELIM;
1846             }
1847           break;
1848         case 'w':
1849           if (!(syntax & RE_NO_GNU_OPS))
1850             token->type = OP_WORD;
1851           break;
1852         case 'W':
1853           if (!(syntax & RE_NO_GNU_OPS))
1854             token->type = OP_NOTWORD;
1855           break;
1856         case 's':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             token->type = OP_SPACE;
1859           break;
1860         case 'S':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             token->type = OP_NOTSPACE;
1863           break;
1864         case '`':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             {
1867               token->type = ANCHOR;
1868               token->opr.ctx_type = BUF_FIRST;
1869             }
1870           break;
1871         case '\'':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             {
1874               token->type = ANCHOR;
1875               token->opr.ctx_type = BUF_LAST;
1876             }
1877           break;
1878         case '(':
1879           if (!(syntax & RE_NO_BK_PARENS))
1880             token->type = OP_OPEN_SUBEXP;
1881           break;
1882         case ')':
1883           if (!(syntax & RE_NO_BK_PARENS))
1884             token->type = OP_CLOSE_SUBEXP;
1885           break;
1886         case '+':
1887           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1888             token->type = OP_DUP_PLUS;
1889           break;
1890         case '?':
1891           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892             token->type = OP_DUP_QUESTION;
1893           break;
1894         case '{':
1895           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1896             token->type = OP_OPEN_DUP_NUM;
1897           break;
1898         case '}':
1899           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900             token->type = OP_CLOSE_DUP_NUM;
1901           break;
1902         default:
1903           break;
1904         }
1905       return 2;
1906     }
1907
1908   token->type = CHARACTER;
1909 #ifdef RE_ENABLE_I18N
1910   if (input->mb_cur_max > 1)
1911     {
1912       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1913       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1914     }
1915   else
1916 #endif
1917     token->word_char = IS_WORD_CHAR (token->opr.c);
1918
1919   switch (c)
1920     {
1921     case '\n':
1922       if (syntax & RE_NEWLINE_ALT)
1923         token->type = OP_ALT;
1924       break;
1925     case '|':
1926       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1927         token->type = OP_ALT;
1928       break;
1929     case '*':
1930       token->type = OP_DUP_ASTERISK;
1931       break;
1932     case '+':
1933       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1934         token->type = OP_DUP_PLUS;
1935       break;
1936     case '?':
1937       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1938         token->type = OP_DUP_QUESTION;
1939       break;
1940     case '{':
1941       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1942         token->type = OP_OPEN_DUP_NUM;
1943       break;
1944     case '}':
1945       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1946         token->type = OP_CLOSE_DUP_NUM;
1947       break;
1948     case '(':
1949       if (syntax & RE_NO_BK_PARENS)
1950         token->type = OP_OPEN_SUBEXP;
1951       break;
1952     case ')':
1953       if (syntax & RE_NO_BK_PARENS)
1954         token->type = OP_CLOSE_SUBEXP;
1955       break;
1956     case '[':
1957       token->type = OP_OPEN_BRACKET;
1958       break;
1959     case '.':
1960       token->type = OP_PERIOD;
1961       break;
1962     case '^':
1963       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1964           re_string_cur_idx (input) != 0)
1965         {
1966           char prev = re_string_peek_byte (input, -1);
1967           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1968             break;
1969         }
1970       token->type = ANCHOR;
1971       token->opr.ctx_type = LINE_FIRST;
1972       break;
1973     case '$':
1974       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1975           re_string_cur_idx (input) + 1 != re_string_length (input))
1976         {
1977           re_token_t next;
1978           re_string_skip_bytes (input, 1);
1979           peek_token (&next, input, syntax);
1980           re_string_skip_bytes (input, -1);
1981           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1982             break;
1983         }
1984       token->type = ANCHOR;
1985       token->opr.ctx_type = LINE_LAST;
1986       break;
1987     default:
1988       break;
1989     }
1990   return 1;
1991 }
1992
1993 /* Peek a token from INPUT, and return the length of the token.
1994    We must not use this function out of bracket expressions.  */
1995
1996 static int
1997 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1998 {
1999   unsigned char c;
2000   if (re_string_eoi (input))
2001     {
2002       token->type = END_OF_RE;
2003       return 0;
2004     }
2005   c = re_string_peek_byte (input, 0);
2006   token->opr.c = c;
2007
2008 #ifdef RE_ENABLE_I18N
2009   if (input->mb_cur_max > 1 &&
2010       !re_string_first_byte (input, re_string_cur_idx (input)))
2011     {
2012       token->type = CHARACTER;
2013       return 1;
2014     }
2015 #endif /* RE_ENABLE_I18N */
2016
2017   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2018       && re_string_cur_idx (input) + 1 < re_string_length (input))
2019     {
2020       /* In this case, '\' escape a character.  */
2021       unsigned char c2;
2022       re_string_skip_bytes (input, 1);
2023       c2 = re_string_peek_byte (input, 0);
2024       token->opr.c = c2;
2025       token->type = CHARACTER;
2026       return 1;
2027     }
2028   if (c == '[') /* '[' is a special char in a bracket exps.  */
2029     {
2030       unsigned char c2;
2031       int token_len;
2032       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2033         c2 = re_string_peek_byte (input, 1);
2034       else
2035         c2 = 0;
2036       token->opr.c = c2;
2037       token_len = 2;
2038       switch (c2)
2039         {
2040         case '.':
2041           token->type = OP_OPEN_COLL_ELEM;
2042           break;
2043         case '=':
2044           token->type = OP_OPEN_EQUIV_CLASS;
2045           break;
2046         case ':':
2047           if (syntax & RE_CHAR_CLASSES)
2048             {
2049               token->type = OP_OPEN_CHAR_CLASS;
2050               break;
2051             }
2052           /* else fall through.  */
2053         default:
2054           token->type = CHARACTER;
2055           token->opr.c = c;
2056           token_len = 1;
2057           break;
2058         }
2059       return token_len;
2060     }
2061   switch (c)
2062     {
2063     case '-':
2064       token->type = OP_CHARSET_RANGE;
2065       break;
2066     case ']':
2067       token->type = OP_CLOSE_BRACKET;
2068       break;
2069     case '^':
2070       token->type = OP_NON_MATCH_LIST;
2071       break;
2072     default:
2073       token->type = CHARACTER;
2074     }
2075   return 1;
2076 }
2077 \f
2078 /* Functions for parser.  */
2079
2080 /* Entry point of the parser.
2081    Parse the regular expression REGEXP and return the structure tree.
2082    If an error occurs, ERR is set by error code, and return NULL.
2083    This function build the following tree, from regular expression <reg_exp>:
2084            CAT
2085            / \
2086           /   \
2087    <reg_exp>  EOR
2088
2089    CAT means concatenation.
2090    EOR means end of regular expression.  */
2091
2092 static bin_tree_t *
2093 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2094        reg_errcode_t *err)
2095 {
2096   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097   bin_tree_t *tree, *eor, *root;
2098   re_token_t current_token;
2099   dfa->syntax = syntax;
2100   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2101   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2102   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2103     return NULL;
2104   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2105   if (tree != NULL)
2106     root = create_tree (dfa, tree, eor, CONCAT);
2107   else
2108     root = eor;
2109   if (BE (eor == NULL || root == NULL, 0))
2110     {
2111       *err = REG_ESPACE;
2112       return NULL;
2113     }
2114   return root;
2115 }
2116
2117 /* This function build the following tree, from regular expression
2118    <branch1>|<branch2>:
2119            ALT
2120            / \
2121           /   \
2122    <branch1> <branch2>
2123
2124    ALT means alternative, which represents the operator '|'.  */
2125
2126 static bin_tree_t *
2127 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2128                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2129 {
2130   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2131   bin_tree_t *tree, *branch = NULL;
2132   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2133   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2134     return NULL;
2135
2136   while (token->type == OP_ALT)
2137     {
2138       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2139       if (token->type != OP_ALT && token->type != END_OF_RE
2140           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2141         {
2142           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2143           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2144             {
2145               if (tree != NULL)
2146                 postorder (tree, free_tree, NULL);
2147               return NULL;
2148             }
2149         }
2150       else
2151         branch = NULL;
2152       tree = create_tree (dfa, tree, branch, OP_ALT);
2153       if (BE (tree == NULL, 0))
2154         {
2155           *err = REG_ESPACE;
2156           return NULL;
2157         }
2158     }
2159   return tree;
2160 }
2161
2162 /* This function build the following tree, from regular expression
2163    <exp1><exp2>:
2164         CAT
2165         / \
2166        /   \
2167    <exp1> <exp2>
2168
2169    CAT means concatenation.  */
2170
2171 static bin_tree_t *
2172 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2173               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2174 {
2175   bin_tree_t *tree, *exp;
2176   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2177   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2178   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2179     return NULL;
2180
2181   while (token->type != OP_ALT && token->type != END_OF_RE
2182          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2183     {
2184       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2185       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2186         {
2187           if (tree != NULL)
2188             postorder (tree, free_tree, NULL);
2189           return NULL;
2190         }
2191       if (tree != NULL && exp != NULL)
2192         {
2193           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2194           if (newtree == NULL)
2195             {
2196               postorder (exp, free_tree, NULL);
2197               postorder (tree, free_tree, NULL);
2198               *err = REG_ESPACE;
2199               return NULL;
2200             }
2201           tree = newtree;
2202         }
2203       else if (tree == NULL)
2204         tree = exp;
2205       /* Otherwise exp == NULL, we don't need to create new tree.  */
2206     }
2207   return tree;
2208 }
2209
2210 /* This function build the following tree, from regular expression a*:
2211          *
2212          |
2213          a
2214 */
2215
2216 static bin_tree_t *
2217 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2218                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2219 {
2220   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2221   bin_tree_t *tree;
2222   switch (token->type)
2223     {
2224     case CHARACTER:
2225       tree = create_token_tree (dfa, NULL, NULL, token);
2226       if (BE (tree == NULL, 0))
2227         {
2228           *err = REG_ESPACE;
2229           return NULL;
2230         }
2231 #ifdef RE_ENABLE_I18N
2232       if (dfa->mb_cur_max > 1)
2233         {
2234           while (!re_string_eoi (regexp)
2235                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2236             {
2237               bin_tree_t *mbc_remain;
2238               fetch_token (token, regexp, syntax);
2239               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2240               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2241               if (BE (mbc_remain == NULL || tree == NULL, 0))
2242                 {
2243                   *err = REG_ESPACE;
2244                   return NULL;
2245                 }
2246             }
2247         }
2248 #endif
2249       break;
2250     case OP_OPEN_SUBEXP:
2251       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2252       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2253         return NULL;
2254       break;
2255     case OP_OPEN_BRACKET:
2256       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2257       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258         return NULL;
2259       break;
2260     case OP_BACK_REF:
2261       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2262         {
2263           *err = REG_ESUBREG;
2264           return NULL;
2265         }
2266       dfa->used_bkref_map |= 1 << token->opr.idx;
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269         {
2270           *err = REG_ESPACE;
2271           return NULL;
2272         }
2273       ++dfa->nbackref;
2274       dfa->has_mb_node = 1;
2275       break;
2276     case OP_OPEN_DUP_NUM:
2277       if (syntax & RE_CONTEXT_INVALID_DUP)
2278         {
2279           *err = REG_BADRPT;
2280           return NULL;
2281         }
2282       /* FALLTHROUGH */
2283     case OP_DUP_ASTERISK:
2284     case OP_DUP_PLUS:
2285     case OP_DUP_QUESTION:
2286       if (syntax & RE_CONTEXT_INVALID_OPS)
2287         {
2288           *err = REG_BADRPT;
2289           return NULL;
2290         }
2291       else if (syntax & RE_CONTEXT_INDEP_OPS)
2292         {
2293           fetch_token (token, regexp, syntax);
2294           return parse_expression (regexp, preg, token, syntax, nest, err);
2295         }
2296       /* else fall through  */
2297     case OP_CLOSE_SUBEXP:
2298       if ((token->type == OP_CLOSE_SUBEXP) &&
2299           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2300         {
2301           *err = REG_ERPAREN;
2302           return NULL;
2303         }
2304       /* else fall through  */
2305     case OP_CLOSE_DUP_NUM:
2306       /* We treat it as a normal character.  */
2307
2308       /* Then we can these characters as normal characters.  */
2309       token->type = CHARACTER;
2310       /* mb_partial and word_char bits should be initialized already
2311          by peek_token.  */
2312       tree = create_token_tree (dfa, NULL, NULL, token);
2313       if (BE (tree == NULL, 0))
2314         {
2315           *err = REG_ESPACE;
2316           return NULL;
2317         }
2318       break;
2319     case ANCHOR:
2320       if ((token->opr.ctx_type
2321            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2322           && dfa->word_ops_used == 0)
2323         init_word_char (dfa);
2324       if (token->opr.ctx_type == WORD_DELIM
2325           || token->opr.ctx_type == NOT_WORD_DELIM)
2326         {
2327           bin_tree_t *tree_first, *tree_last;
2328           if (token->opr.ctx_type == WORD_DELIM)
2329             {
2330               token->opr.ctx_type = WORD_FIRST;
2331               tree_first = create_token_tree (dfa, NULL, NULL, token);
2332               token->opr.ctx_type = WORD_LAST;
2333             }
2334           else
2335             {
2336               token->opr.ctx_type = INSIDE_WORD;
2337               tree_first = create_token_tree (dfa, NULL, NULL, token);
2338               token->opr.ctx_type = INSIDE_NOTWORD;
2339             }
2340           tree_last = create_token_tree (dfa, NULL, NULL, token);
2341           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2342           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2343             {
2344               *err = REG_ESPACE;
2345               return NULL;
2346             }
2347         }
2348       else
2349         {
2350           tree = create_token_tree (dfa, NULL, NULL, token);
2351           if (BE (tree == NULL, 0))
2352             {
2353               *err = REG_ESPACE;
2354               return NULL;
2355             }
2356         }
2357       /* We must return here, since ANCHORs can't be followed
2358          by repetition operators.
2359          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2360              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2361       fetch_token (token, regexp, syntax);
2362       return tree;
2363     case OP_PERIOD:
2364       tree = create_token_tree (dfa, NULL, NULL, token);
2365       if (BE (tree == NULL, 0))
2366         {
2367           *err = REG_ESPACE;
2368           return NULL;
2369         }
2370       if (dfa->mb_cur_max > 1)
2371         dfa->has_mb_node = 1;
2372       break;
2373     case OP_WORD:
2374     case OP_NOTWORD:
2375       tree = build_charclass_op (dfa, regexp->trans,
2376                                  (const unsigned char *) "alnum",
2377                                  (const unsigned char *) "_",
2378                                  token->type == OP_NOTWORD, err);
2379       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2380         return NULL;
2381       break;
2382     case OP_SPACE:
2383     case OP_NOTSPACE:
2384       tree = build_charclass_op (dfa, regexp->trans,
2385                                  (const unsigned char *) "space",
2386                                  (const unsigned char *) "",
2387                                  token->type == OP_NOTSPACE, err);
2388       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389         return NULL;
2390       break;
2391     case OP_ALT:
2392     case END_OF_RE:
2393       return NULL;
2394     case BACK_SLASH:
2395       *err = REG_EESCAPE;
2396       return NULL;
2397     default:
2398       /* Must not happen?  */
2399 #ifdef DEBUG
2400       assert (0);
2401 #endif
2402       return NULL;
2403     }
2404   fetch_token (token, regexp, syntax);
2405
2406   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408     {
2409       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2410       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2411         {
2412           if (tree != NULL)
2413             postorder (tree, free_tree, NULL);
2414           return NULL;
2415         }
2416       tree = dup_tree;
2417       /* In BRE consecutive duplications are not allowed.  */
2418       if ((syntax & RE_CONTEXT_INVALID_DUP)
2419           && (token->type == OP_DUP_ASTERISK
2420               || token->type == OP_OPEN_DUP_NUM))
2421         {
2422           if (tree != NULL)
2423             postorder (tree, free_tree, NULL);
2424           *err = REG_BADRPT;
2425           return NULL;
2426         }
2427     }
2428
2429   return tree;
2430 }
2431
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434          SUBEXP
2435             |
2436         <reg_exp>
2437 */
2438
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457         {
2458           if (tree != NULL)
2459             postorder (tree, free_tree, NULL);
2460           *err = REG_EPAREN;
2461         }
2462       if (BE (*err != REG_NOERROR, 0))
2463         return NULL;
2464     }
2465
2466   if (cur_nsub <= '9' - '1')
2467     dfa->completed_bkref_map |= 1 << cur_nsub;
2468
2469   tree = create_tree (dfa, tree, NULL, SUBEXP);
2470   if (BE (tree == NULL, 0))
2471     {
2472       *err = REG_ESPACE;
2473       return NULL;
2474     }
2475   tree->token.opr.idx = cur_nsub;
2476   return tree;
2477 }
2478
2479 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2480
2481 static bin_tree_t *
2482 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2484 {
2485   bin_tree_t *tree = NULL, *old_tree = NULL;
2486   int i, start, end, start_idx = re_string_cur_idx (regexp);
2487   re_token_t start_token = *token;
2488
2489   if (token->type == OP_OPEN_DUP_NUM)
2490     {
2491       end = 0;
2492       start = fetch_number (regexp, token, syntax);
2493       if (start == -1)
2494         {
2495           if (token->type == CHARACTER && token->opr.c == ',')
2496             start = 0; /* We treat "{,m}" as "{0,m}".  */
2497           else
2498             {
2499               *err = REG_BADBR; /* <re>{} is invalid.  */
2500               return NULL;
2501             }
2502         }
2503       if (BE (start != -2, 1))
2504         {
2505           /* We treat "{n}" as "{n,n}".  */
2506           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507                  : ((token->type == CHARACTER && token->opr.c == ',')
2508                     ? fetch_number (regexp, token, syntax) : -2));
2509         }
2510       if (BE (start == -2 || end == -2, 0))
2511         {
2512           /* Invalid sequence.  */
2513           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2514             {
2515               if (token->type == END_OF_RE)
2516                 *err = REG_EBRACE;
2517               else
2518                 *err = REG_BADBR;
2519
2520               return NULL;
2521             }
2522
2523           /* If the syntax bit is set, rollback.  */
2524           re_string_set_index (regexp, start_idx);
2525           *token = start_token;
2526           token->type = CHARACTER;
2527           /* mb_partial and word_char bits should be already initialized by
2528              peek_token.  */
2529           return elem;
2530         }
2531
2532       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2533         {
2534           /* First number greater than second.  */
2535           *err = REG_BADBR;
2536           return NULL;
2537         }
2538     }
2539   else
2540     {
2541       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2543     }
2544
2545   fetch_token (token, regexp, syntax);
2546
2547   if (BE (elem == NULL, 0))
2548     return NULL;
2549   if (BE (start == 0 && end == 0, 0))
2550     {
2551       postorder (elem, free_tree, NULL);
2552       return NULL;
2553     }
2554
2555   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2556   if (BE (start > 0, 0))
2557     {
2558       tree = elem;
2559       for (i = 2; i <= start; ++i)
2560         {
2561           elem = duplicate_tree (elem, dfa);
2562           tree = create_tree (dfa, tree, elem, CONCAT);
2563           if (BE (elem == NULL || tree == NULL, 0))
2564             goto parse_dup_op_espace;
2565         }
2566
2567       if (start == end)
2568         return tree;
2569
2570       /* Duplicate ELEM before it is marked optional.  */
2571       elem = duplicate_tree (elem, dfa);
2572       if (BE (elem == NULL, 0))
2573         goto parse_dup_op_espace;
2574       old_tree = tree;
2575     }
2576   else
2577     old_tree = NULL;
2578
2579   if (elem->token.type == SUBEXP)
2580     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2581
2582   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2583   if (BE (tree == NULL, 0))
2584     goto parse_dup_op_espace;
2585
2586   /* This loop is actually executed only when end != -1,
2587      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2588      already created the start+1-th copy.  */
2589   for (i = start + 2; i <= end; ++i)
2590     {
2591       elem = duplicate_tree (elem, dfa);
2592       tree = create_tree (dfa, tree, elem, CONCAT);
2593       if (BE (elem == NULL || tree == NULL, 0))
2594         goto parse_dup_op_espace;
2595
2596       tree = create_tree (dfa, tree, NULL, OP_ALT);
2597       if (BE (tree == NULL, 0))
2598         goto parse_dup_op_espace;
2599     }
2600
2601   if (old_tree)
2602     tree = create_tree (dfa, old_tree, tree, CONCAT);
2603
2604   return tree;
2605
2606  parse_dup_op_espace:
2607   *err = REG_ESPACE;
2608   return NULL;
2609 }
2610
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612    I'm not sure, but maybe enough.  */
2613 #define BRACKET_NAME_BUF_SIZE 32
2614
2615 #ifndef _LIBC
2616   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617      Build the range expression which starts from START_ELEM, and ends
2618      at END_ELEM.  The result are written to MBCSET and SBCSET.
2619      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620      mbcset->range_ends, is a pointer argument since we may
2621      update it.  */
2622
2623 static reg_errcode_t
2624 # ifdef RE_ENABLE_I18N
2625 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2626                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2627 # else /* not RE_ENABLE_I18N */
2628 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2629                  bracket_elem_t *end_elem)
2630 # endif /* not RE_ENABLE_I18N */
2631 {
2632   unsigned int start_ch, end_ch;
2633   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2634   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2635           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2636           0))
2637     return REG_ERANGE;
2638
2639   /* We can handle no multi character collating elements without libc
2640      support.  */
2641   if (BE ((start_elem->type == COLL_SYM
2642            && strlen ((char *) start_elem->opr.name) > 1)
2643           || (end_elem->type == COLL_SYM
2644               && strlen ((char *) end_elem->opr.name) > 1), 0))
2645     return REG_ECOLLATE;
2646
2647 # ifdef RE_ENABLE_I18N
2648   {
2649     wchar_t wc;
2650     wint_t start_wc;
2651     wint_t end_wc;
2652     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2653
2654     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2655                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2656                    : 0));
2657     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2658               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2659                  : 0));
2660     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2661                 ? __btowc (start_ch) : start_elem->opr.wch);
2662     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2663               ? __btowc (end_ch) : end_elem->opr.wch);
2664     if (start_wc == WEOF || end_wc == WEOF)
2665       return REG_ECOLLATE;
2666     cmp_buf[0] = start_wc;
2667     cmp_buf[4] = end_wc;
2668     if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2669       return REG_ERANGE;
2670
2671     /* Got valid collation sequence values, add them as a new entry.
2672        However, for !_LIBC we have no collation elements: if the
2673        character set is single byte, the single byte character set
2674        that we build below suffices.  parse_bracket_exp passes
2675        no MBCSET if dfa->mb_cur_max == 1.  */
2676     if (mbcset)
2677       {
2678         /* Check the space of the arrays.  */
2679         if (BE (*range_alloc == mbcset->nranges, 0))
2680           {
2681             /* There is not enough space, need realloc.  */
2682             wchar_t *new_array_start, *new_array_end;
2683             int new_nranges;
2684
2685             /* +1 in case of mbcset->nranges is 0.  */
2686             new_nranges = 2 * mbcset->nranges + 1;
2687             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2688                are NULL if *range_alloc == 0.  */
2689             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2690                                           new_nranges);
2691             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2692                                         new_nranges);
2693
2694             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2695               return REG_ESPACE;
2696
2697             mbcset->range_starts = new_array_start;
2698             mbcset->range_ends = new_array_end;
2699             *range_alloc = new_nranges;
2700           }
2701
2702         mbcset->range_starts[mbcset->nranges] = start_wc;
2703         mbcset->range_ends[mbcset->nranges++] = end_wc;
2704       }
2705
2706     /* Build the table for single byte characters.  */
2707     for (wc = 0; wc < SBC_MAX; ++wc)
2708       {
2709         cmp_buf[2] = wc;
2710         if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2711             && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2712           bitset_set (sbcset, wc);
2713       }
2714   }
2715 # else /* not RE_ENABLE_I18N */
2716   {
2717     unsigned int ch;
2718     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2719                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2720                    : 0));
2721     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2722               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2723                  : 0));
2724     if (start_ch > end_ch)
2725       return REG_ERANGE;
2726     /* Build the table for single byte characters.  */
2727     for (ch = 0; ch < SBC_MAX; ++ch)
2728       if (start_ch <= ch  && ch <= end_ch)
2729         bitset_set (sbcset, ch);
2730   }
2731 # endif /* not RE_ENABLE_I18N */
2732   return REG_NOERROR;
2733 }
2734 #endif /* not _LIBC */
2735
2736 #ifndef _LIBC
2737 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2738    Build the collating element which is represented by NAME.
2739    The result are written to MBCSET and SBCSET.
2740    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2741    pointer argument since we may update it.  */
2742
2743 static reg_errcode_t
2744 # ifdef RE_ENABLE_I18N
2745 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746                         int *coll_sym_alloc, const unsigned char *name)
2747 # else /* not RE_ENABLE_I18N */
2748 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749 # endif /* not RE_ENABLE_I18N */
2750 {
2751   size_t name_len = strlen ((const char *) name);
2752   if (BE (name_len != 1, 0))
2753     return REG_ECOLLATE;
2754   else
2755     {
2756       bitset_set (sbcset, name[0]);
2757       return REG_NOERROR;
2758     }
2759 }
2760 #endif /* not _LIBC */
2761
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2763    "[[.a-a.]]" etc.  */
2764
2765 static bin_tree_t *
2766 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767                    reg_syntax_t syntax, reg_errcode_t *err)
2768 {
2769 #ifdef _LIBC
2770   const unsigned char *collseqmb;
2771   const char *collseqwc;
2772   uint32_t nrules;
2773   int32_t table_size;
2774   const int32_t *symb_table;
2775   const unsigned char *extra;
2776
2777   /* Local function for parse_bracket_exp used in _LIBC environment.
2778      Seek the collating symbol entry corresponding to NAME.
2779      Return the index of the symbol in the SYMB_TABLE,
2780      or -1 if not found.  */
2781
2782   auto inline int32_t
2783   __attribute__ ((always_inline))
2784   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2785     {
2786       int32_t elem;
2787
2788       for (elem = 0; elem < table_size; elem++)
2789         if (symb_table[2 * elem] != 0)
2790           {
2791             int32_t idx = symb_table[2 * elem + 1];
2792             /* Skip the name of collating element name.  */
2793             idx += 1 + extra[idx];
2794             if (/* Compare the length of the name.  */
2795                 name_len == extra[idx]
2796                 /* Compare the name.  */
2797                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2798               /* Yep, this is the entry.  */
2799               return elem;
2800           }
2801       return -1;
2802     }
2803
2804   /* Local function for parse_bracket_exp used in _LIBC environment.
2805      Look up the collation sequence value of BR_ELEM.
2806      Return the value if succeeded, UINT_MAX otherwise.  */
2807
2808   auto inline unsigned int
2809   __attribute__ ((always_inline))
2810   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2811     {
2812       if (br_elem->type == SB_CHAR)
2813         {
2814           /*
2815           if (MB_CUR_MAX == 1)
2816           */
2817           if (nrules == 0)
2818             return collseqmb[br_elem->opr.ch];
2819           else
2820             {
2821               wint_t wc = __btowc (br_elem->opr.ch);
2822               return __collseq_table_lookup (collseqwc, wc);
2823             }
2824         }
2825       else if (br_elem->type == MB_CHAR)
2826         {
2827           if (nrules != 0)
2828             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829         }
2830       else if (br_elem->type == COLL_SYM)
2831         {
2832           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833           if (nrules != 0)
2834             {
2835               int32_t elem, idx;
2836               elem = seek_collating_symbol_entry (br_elem->opr.name,
2837                                                   sym_name_len);
2838               if (elem != -1)
2839                 {
2840                   /* We found the entry.  */
2841                   idx = symb_table[2 * elem + 1];
2842                   /* Skip the name of collating element name.  */
2843                   idx += 1 + extra[idx];
2844                   /* Skip the byte sequence of the collating element.  */
2845                   idx += 1 + extra[idx];
2846                   /* Adjust for the alignment.  */
2847                   idx = (idx + 3) & ~3;
2848                   /* Skip the multibyte collation sequence value.  */
2849                   idx += sizeof (unsigned int);
2850                   /* Skip the wide char sequence of the collating element.  */
2851                   idx += sizeof (unsigned int) *
2852                     (1 + *(unsigned int *) (extra + idx));
2853                   /* Return the collation sequence value.  */
2854                   return *(unsigned int *) (extra + idx);
2855                 }
2856               else if (sym_name_len == 1)
2857                 {
2858                   /* No valid character.  Match it as a single byte
2859                      character.  */
2860                   return collseqmb[br_elem->opr.name[0]];
2861                 }
2862             }
2863           else if (sym_name_len == 1)
2864             return collseqmb[br_elem->opr.name[0]];
2865         }
2866       return UINT_MAX;
2867     }
2868
2869   /* Local function for parse_bracket_exp used in _LIBC environment.
2870      Build the range expression which starts from START_ELEM, and ends
2871      at END_ELEM.  The result are written to MBCSET and SBCSET.
2872      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873      mbcset->range_ends, is a pointer argument since we may
2874      update it.  */
2875
2876   auto inline reg_errcode_t
2877   __attribute__ ((always_inline))
2878   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2879                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2880     {
2881       unsigned int ch;
2882       uint32_t start_collseq;
2883       uint32_t end_collseq;
2884
2885       /* Equivalence Classes and Character Classes can't be a range
2886          start/end.  */
2887       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2889               0))
2890         return REG_ERANGE;
2891
2892       start_collseq = lookup_collation_sequence_value (start_elem);
2893       end_collseq = lookup_collation_sequence_value (end_elem);
2894       /* Check start/end collation sequence values.  */
2895       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896         return REG_ECOLLATE;
2897       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898         return REG_ERANGE;
2899
2900       /* Got valid collation sequence values, add them as a new entry.
2901          However, if we have no collation elements, and the character set
2902          is single byte, the single byte character set that we
2903          build below suffices. */
2904       if (nrules > 0 || dfa->mb_cur_max > 1)
2905         {
2906           /* Check the space of the arrays.  */
2907           if (BE (*range_alloc == mbcset->nranges, 0))
2908             {
2909               /* There is not enough space, need realloc.  */
2910               uint32_t *new_array_start;
2911               uint32_t *new_array_end;
2912               int new_nranges;
2913
2914               /* +1 in case of mbcset->nranges is 0.  */
2915               new_nranges = 2 * mbcset->nranges + 1;
2916               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917                                             new_nranges);
2918               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919                                           new_nranges);
2920
2921               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922                 return REG_ESPACE;
2923
2924               mbcset->range_starts = new_array_start;
2925               mbcset->range_ends = new_array_end;
2926               *range_alloc = new_nranges;
2927             }
2928
2929           mbcset->range_starts[mbcset->nranges] = start_collseq;
2930           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2931         }
2932
2933       /* Build the table for single byte characters.  */
2934       for (ch = 0; ch < SBC_MAX; ch++)
2935         {
2936           uint32_t ch_collseq;
2937           /*
2938           if (MB_CUR_MAX == 1)
2939           */
2940           if (nrules == 0)
2941             ch_collseq = collseqmb[ch];
2942           else
2943             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945             bitset_set (sbcset, ch);
2946         }
2947       return REG_NOERROR;
2948     }
2949
2950   /* Local function for parse_bracket_exp used in _LIBC environment.
2951      Build the collating element which is represented by NAME.
2952      The result are written to MBCSET and SBCSET.
2953      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954      pointer argument since we may update it.  */
2955
2956   auto inline reg_errcode_t
2957   __attribute__ ((always_inline))
2958   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2959                           int *coll_sym_alloc, const unsigned char *name)
2960     {
2961       int32_t elem, idx;
2962       size_t name_len = strlen ((const char *) name);
2963       if (nrules != 0)
2964         {
2965           elem = seek_collating_symbol_entry (name, name_len);
2966           if (elem != -1)
2967             {
2968               /* We found the entry.  */
2969               idx = symb_table[2 * elem + 1];
2970               /* Skip the name of collating element name.  */
2971               idx += 1 + extra[idx];
2972             }
2973           else if (name_len == 1)
2974             {
2975               /* No valid character, treat it as a normal
2976                  character.  */
2977               bitset_set (sbcset, name[0]);
2978               return REG_NOERROR;
2979             }
2980           else
2981             return REG_ECOLLATE;
2982
2983           /* Got valid collation sequence, add it as a new entry.  */
2984           /* Check the space of the arrays.  */
2985           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2986             {
2987               /* Not enough, realloc it.  */
2988               /* +1 in case of mbcset->ncoll_syms is 0.  */
2989               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2990               /* Use realloc since mbcset->coll_syms is NULL
2991                  if *alloc == 0.  */
2992               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2993                                                    new_coll_sym_alloc);
2994               if (BE (new_coll_syms == NULL, 0))
2995                 return REG_ESPACE;
2996               mbcset->coll_syms = new_coll_syms;
2997               *coll_sym_alloc = new_coll_sym_alloc;
2998             }
2999           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3000           return REG_NOERROR;
3001         }
3002       else
3003         {
3004           if (BE (name_len != 1, 0))
3005             return REG_ECOLLATE;
3006           else
3007             {
3008               bitset_set (sbcset, name[0]);
3009               return REG_NOERROR;
3010             }
3011         }
3012     }
3013 #endif
3014
3015   re_token_t br_token;
3016   re_bitset_ptr_t sbcset;
3017 #ifdef RE_ENABLE_I18N
3018   re_charset_t *mbcset;
3019   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3020   int equiv_class_alloc = 0, char_class_alloc = 0;
3021 #endif /* not RE_ENABLE_I18N */
3022   int non_match = 0;
3023   bin_tree_t *work_tree;
3024   int token_len;
3025   int first_round = 1;
3026 #ifdef _LIBC
3027   collseqmb = (const unsigned char *)
3028     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3029   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3030   if (nrules)
3031     {
3032       /*
3033       if (MB_CUR_MAX > 1)
3034       */
3035       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3036       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3037       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3038                                                   _NL_COLLATE_SYMB_TABLEMB);
3039       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3040                                                    _NL_COLLATE_SYMB_EXTRAMB);
3041     }
3042 #endif
3043   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3044 #ifdef RE_ENABLE_I18N
3045   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3046 #endif /* RE_ENABLE_I18N */
3047 #ifdef RE_ENABLE_I18N
3048   if (BE (sbcset == NULL || mbcset == NULL, 0))
3049 #else
3050   if (BE (sbcset == NULL, 0))
3051 #endif /* RE_ENABLE_I18N */
3052     {
3053       re_free (sbcset);
3054 #ifdef RE_ENABLE_I18N
3055       re_free (mbcset);
3056 #endif
3057       *err = REG_ESPACE;
3058       return NULL;
3059     }
3060
3061   token_len = peek_token_bracket (token, regexp, syntax);
3062   if (BE (token->type == END_OF_RE, 0))
3063     {
3064       *err = REG_BADPAT;
3065       goto parse_bracket_exp_free_return;
3066     }
3067   if (token->type == OP_NON_MATCH_LIST)
3068     {
3069 #ifdef RE_ENABLE_I18N
3070       mbcset->non_match = 1;
3071 #endif /* not RE_ENABLE_I18N */
3072       non_match = 1;
3073       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074         bitset_set (sbcset, '\n');
3075       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3076       token_len = peek_token_bracket (token, regexp, syntax);
3077       if (BE (token->type == END_OF_RE, 0))
3078         {
3079           *err = REG_BADPAT;
3080           goto parse_bracket_exp_free_return;
3081         }
3082     }
3083
3084   /* We treat the first ']' as a normal character.  */
3085   if (token->type == OP_CLOSE_BRACKET)
3086     token->type = CHARACTER;
3087
3088   while (1)
3089     {
3090       bracket_elem_t start_elem, end_elem;
3091       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093       reg_errcode_t ret;
3094       int token_len2 = 0, is_range_exp = 0;
3095       re_token_t token2;
3096
3097       start_elem.opr.name = start_name_buf;
3098       start_elem.type = COLL_SYM;
3099       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3100                                    syntax, first_round);
3101       if (BE (ret != REG_NOERROR, 0))
3102         {
3103           *err = ret;
3104           goto parse_bracket_exp_free_return;
3105         }
3106       first_round = 0;
3107
3108       /* Get information about the next token.  We need it in any case.  */
3109       token_len = peek_token_bracket (token, regexp, syntax);
3110
3111       /* Do not check for ranges if we know they are not allowed.  */
3112       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3113         {
3114           if (BE (token->type == END_OF_RE, 0))
3115             {
3116               *err = REG_EBRACK;
3117               goto parse_bracket_exp_free_return;
3118             }
3119           if (token->type == OP_CHARSET_RANGE)
3120             {
3121               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3122               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3123               if (BE (token2.type == END_OF_RE, 0))
3124                 {
3125                   *err = REG_EBRACK;
3126                   goto parse_bracket_exp_free_return;
3127                 }
3128               if (token2.type == OP_CLOSE_BRACKET)
3129                 {
3130                   /* We treat the last '-' as a normal character.  */
3131                   re_string_skip_bytes (regexp, -token_len);
3132                   token->type = CHARACTER;
3133                 }
3134               else
3135                 is_range_exp = 1;
3136             }
3137         }
3138
3139       if (is_range_exp == 1)
3140         {
3141           end_elem.opr.name = end_name_buf;
3142           end_elem.type = COLL_SYM;
3143           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3144                                        dfa, syntax, 1);
3145           if (BE (ret != REG_NOERROR, 0))
3146             {
3147               *err = ret;
3148               goto parse_bracket_exp_free_return;
3149             }
3150
3151           token_len = peek_token_bracket (token, regexp, syntax);
3152
3153 #ifdef _LIBC
3154           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3155                                   &start_elem, &end_elem);
3156 #else
3157 # ifdef RE_ENABLE_I18N
3158           *err = build_range_exp (sbcset,
3159                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3160                                   &range_alloc, &start_elem, &end_elem);
3161 # else
3162           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3163 # endif
3164 #endif /* RE_ENABLE_I18N */
3165           if (BE (*err != REG_NOERROR, 0))
3166             goto parse_bracket_exp_free_return;
3167         }
3168       else
3169         {
3170           switch (start_elem.type)
3171             {
3172             case SB_CHAR:
3173               bitset_set (sbcset, start_elem.opr.ch);
3174               break;
3175 #ifdef RE_ENABLE_I18N
3176             case MB_CHAR:
3177               /* Check whether the array has enough space.  */
3178               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3179                 {
3180                   wchar_t *new_mbchars;
3181                   /* Not enough, realloc it.  */
3182                   /* +1 in case of mbcset->nmbchars is 0.  */
3183                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3184                   /* Use realloc since array is NULL if *alloc == 0.  */
3185                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3186                                             mbchar_alloc);
3187                   if (BE (new_mbchars == NULL, 0))
3188                     goto parse_bracket_exp_espace;
3189                   mbcset->mbchars = new_mbchars;
3190                 }
3191               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3192               break;
3193 #endif /* RE_ENABLE_I18N */
3194             case EQUIV_CLASS:
3195               *err = build_equiv_class (sbcset,
3196 #ifdef RE_ENABLE_I18N
3197                                         mbcset, &equiv_class_alloc,
3198 #endif /* RE_ENABLE_I18N */
3199                                         start_elem.opr.name);
3200               if (BE (*err != REG_NOERROR, 0))
3201                 goto parse_bracket_exp_free_return;
3202               break;
3203             case COLL_SYM:
3204               *err = build_collating_symbol (sbcset,
3205 #ifdef RE_ENABLE_I18N
3206                                              mbcset, &coll_sym_alloc,
3207 #endif /* RE_ENABLE_I18N */
3208                                              start_elem.opr.name);
3209               if (BE (*err != REG_NOERROR, 0))
3210                 goto parse_bracket_exp_free_return;
3211               break;
3212             case CHAR_CLASS:
3213               *err = build_charclass (regexp->trans, sbcset,
3214 #ifdef RE_ENABLE_I18N
3215                                       mbcset, &char_class_alloc,
3216 #endif /* RE_ENABLE_I18N */
3217                                       start_elem.opr.name, syntax);
3218               if (BE (*err != REG_NOERROR, 0))
3219                goto parse_bracket_exp_free_return;
3220               break;
3221             default:
3222               assert (0);
3223               break;
3224             }
3225         }
3226       if (BE (token->type == END_OF_RE, 0))
3227         {
3228           *err = REG_EBRACK;
3229           goto parse_bracket_exp_free_return;
3230         }
3231       if (token->type == OP_CLOSE_BRACKET)
3232         break;
3233     }
3234
3235   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3236
3237   /* If it is non-matching list.  */
3238   if (non_match)
3239     bitset_not (sbcset);
3240
3241 #ifdef RE_ENABLE_I18N
3242   /* Ensure only single byte characters are set.  */
3243   if (dfa->mb_cur_max > 1)
3244     bitset_mask (sbcset, dfa->sb_char);
3245
3246   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3247       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3248                                                      || mbcset->non_match)))
3249     {
3250       bin_tree_t *mbc_tree;
3251       int sbc_idx;
3252       /* Build a tree for complex bracket.  */
3253       dfa->has_mb_node = 1;
3254       br_token.type = COMPLEX_BRACKET;
3255       br_token.opr.mbcset = mbcset;
3256       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3257       if (BE (mbc_tree == NULL, 0))
3258         goto parse_bracket_exp_espace;
3259       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3260         if (sbcset[sbc_idx])
3261           break;
3262       /* If there are no bits set in sbcset, there is no point
3263          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3264       if (sbc_idx < BITSET_WORDS)
3265         {
3266           /* Build a tree for simple bracket.  */
3267           br_token.type = SIMPLE_BRACKET;
3268           br_token.opr.sbcset = sbcset;
3269           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270           if (BE (work_tree == NULL, 0))
3271             goto parse_bracket_exp_espace;
3272
3273           /* Then join them by ALT node.  */
3274           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3275           if (BE (work_tree == NULL, 0))
3276             goto parse_bracket_exp_espace;
3277         }
3278       else
3279         {
3280           re_free (sbcset);
3281           work_tree = mbc_tree;
3282         }
3283     }
3284   else
3285 #endif /* not RE_ENABLE_I18N */
3286     {
3287 #ifdef RE_ENABLE_I18N
3288       free_charset (mbcset);
3289 #endif
3290       /* Build a tree for simple bracket.  */
3291       br_token.type = SIMPLE_BRACKET;
3292       br_token.opr.sbcset = sbcset;
3293       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3294       if (BE (work_tree == NULL, 0))
3295         goto parse_bracket_exp_espace;
3296     }
3297   return work_tree;
3298
3299  parse_bracket_exp_espace:
3300   *err = REG_ESPACE;
3301  parse_bracket_exp_free_return:
3302   re_free (sbcset);
3303 #ifdef RE_ENABLE_I18N
3304   free_charset (mbcset);
3305 #endif /* RE_ENABLE_I18N */
3306   return NULL;
3307 }
3308
3309 /* Parse an element in the bracket expression.  */
3310
3311 static reg_errcode_t
3312 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3313                        re_token_t *token, int token_len, re_dfa_t *dfa,
3314                        reg_syntax_t syntax, int accept_hyphen)
3315 {
3316 #ifdef RE_ENABLE_I18N
3317   int cur_char_size;
3318   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3319   if (cur_char_size > 1)
3320     {
3321       elem->type = MB_CHAR;
3322       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3323       re_string_skip_bytes (regexp, cur_char_size);
3324       return REG_NOERROR;
3325     }
3326 #endif /* RE_ENABLE_I18N */
3327   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3328   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3329       || token->type == OP_OPEN_EQUIV_CLASS)
3330     return parse_bracket_symbol (elem, regexp, token);
3331   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3332     {
3333       /* A '-' must only appear as anything but a range indicator before
3334          the closing bracket.  Everything else is an error.  */
3335       re_token_t token2;
3336       (void) peek_token_bracket (&token2, regexp, syntax);
3337       if (token2.type != OP_CLOSE_BRACKET)
3338         /* The actual error value is not standardized since this whole
3339            case is undefined.  But ERANGE makes good sense.  */
3340         return REG_ERANGE;
3341     }
3342   elem->type = SB_CHAR;
3343   elem->opr.ch = token->opr.c;
3344   return REG_NOERROR;
3345 }
3346
3347 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3348    such as [:<character_class>:], [.<collating_element>.], and
3349    [=<equivalent_class>=].  */
3350
3351 static reg_errcode_t
3352 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3353                       re_token_t *token)
3354 {
3355   unsigned char ch, delim = token->opr.c;
3356   int i = 0;
3357   if (re_string_eoi(regexp))
3358     return REG_EBRACK;
3359   for (;; ++i)
3360     {
3361       if (i >= BRACKET_NAME_BUF_SIZE)
3362         return REG_EBRACK;
3363       if (token->type == OP_OPEN_CHAR_CLASS)
3364         ch = re_string_fetch_byte_case (regexp);
3365       else
3366         ch = re_string_fetch_byte (regexp);
3367       if (re_string_eoi(regexp))
3368         return REG_EBRACK;
3369       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3370         break;
3371       elem->opr.name[i] = ch;
3372     }
3373   re_string_skip_bytes (regexp, 1);
3374   elem->opr.name[i] = '\0';
3375   switch (token->type)
3376     {
3377     case OP_OPEN_COLL_ELEM:
3378       elem->type = COLL_SYM;
3379       break;
3380     case OP_OPEN_EQUIV_CLASS:
3381       elem->type = EQUIV_CLASS;
3382       break;
3383     case OP_OPEN_CHAR_CLASS:
3384       elem->type = CHAR_CLASS;
3385       break;
3386     default:
3387       break;
3388     }
3389   return REG_NOERROR;
3390 }
3391
3392   /* Helper function for parse_bracket_exp.
3393      Build the equivalence class which is represented by NAME.
3394      The result are written to MBCSET and SBCSET.
3395      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3396      is a pointer argument since we may update it.  */
3397
3398 static reg_errcode_t
3399 #ifdef RE_ENABLE_I18N
3400 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3401                    int *equiv_class_alloc, const unsigned char *name)
3402 #else /* not RE_ENABLE_I18N */
3403 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3404 #endif /* not RE_ENABLE_I18N */
3405 {
3406 #ifdef _LIBC
3407   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3408   if (nrules != 0)
3409     {
3410       const int32_t *table, *indirect;
3411       const unsigned char *weights, *extra, *cp;
3412       unsigned char char_buf[2];
3413       int32_t idx1, idx2;
3414       unsigned int ch;
3415       size_t len;
3416       /* Calculate the index for equivalence class.  */
3417       cp = name;
3418       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420                                                _NL_COLLATE_WEIGHTMB);
3421       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422                                                    _NL_COLLATE_EXTRAMB);
3423       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424                                                 _NL_COLLATE_INDIRECTMB);
3425       idx1 = findidx (table, indirect, extra, &cp, -1);
3426       if (BE (idx1 == 0 || *cp != '\0', 0))
3427         /* This isn't a valid character.  */
3428         return REG_ECOLLATE;
3429
3430       /* Build single byte matcing table for this equivalence class.  */
3431       len = weights[idx1 & 0xffffff];
3432       for (ch = 0; ch < SBC_MAX; ++ch)
3433         {
3434           char_buf[0] = ch;
3435           cp = char_buf;
3436           idx2 = findidx (table, indirect, extra, &cp, 1);
3437 /*
3438           idx2 = table[ch];
3439 */
3440           if (idx2 == 0)
3441             /* This isn't a valid character.  */
3442             continue;
3443           /* Compare only if the length matches and the collation rule
3444              index is the same.  */
3445           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3446             {
3447               int cnt = 0;
3448
3449               while (cnt <= len &&
3450                      weights[(idx1 & 0xffffff) + 1 + cnt]
3451                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3452                 ++cnt;
3453
3454               if (cnt > len)
3455                 bitset_set (sbcset, ch);
3456             }
3457         }
3458       /* Check whether the array has enough space.  */
3459       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460         {
3461           /* Not enough, realloc it.  */
3462           /* +1 in case of mbcset->nequiv_classes is 0.  */
3463           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464           /* Use realloc since the array is NULL if *alloc == 0.  */
3465           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466                                                    int32_t,
3467                                                    new_equiv_class_alloc);
3468           if (BE (new_equiv_classes == NULL, 0))
3469             return REG_ESPACE;
3470           mbcset->equiv_classes = new_equiv_classes;
3471           *equiv_class_alloc = new_equiv_class_alloc;
3472         }
3473       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474     }
3475   else
3476 #endif /* _LIBC */
3477     {
3478       if (BE (strlen ((const char *) name) != 1, 0))
3479         return REG_ECOLLATE;
3480       bitset_set (sbcset, *name);
3481     }
3482   return REG_NOERROR;
3483 }
3484
3485   /* Helper function for parse_bracket_exp.
3486      Build the character class which is represented by NAME.
3487      The result are written to MBCSET and SBCSET.
3488      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489      is a pointer argument since we may update it.  */
3490
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
3493 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494                  re_charset_t *mbcset, int *char_class_alloc,
3495                  const unsigned char *class_name, reg_syntax_t syntax)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498                  const unsigned char *class_name, reg_syntax_t syntax)
3499 #endif /* not RE_ENABLE_I18N */
3500 {
3501   int i;
3502   const char *name = (const char *) class_name;
3503
3504   /* In case of REG_ICASE "upper" and "lower" match the both of
3505      upper and lower cases.  */
3506   if ((syntax & RE_ICASE)
3507       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508     name = "alpha";
3509
3510 #ifdef RE_ENABLE_I18N
3511   /* Check the space of the arrays.  */
3512   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513     {
3514       /* Not enough, realloc it.  */
3515       /* +1 in case of mbcset->nchar_classes is 0.  */
3516       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517       /* Use realloc since array is NULL if *alloc == 0.  */
3518       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519                                                new_char_class_alloc);
3520       if (BE (new_char_classes == NULL, 0))
3521         return REG_ESPACE;
3522       mbcset->char_classes = new_char_classes;
3523       *char_class_alloc = new_char_class_alloc;
3524     }
3525   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526 #endif /* RE_ENABLE_I18N */
3527
3528 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3529   do {                                          \
3530     if (BE (trans != NULL, 0))                  \
3531       {                                         \
3532         for (i = 0; i < SBC_MAX; ++i)           \
3533           if (ctype_func (i))                   \
3534             bitset_set (sbcset, trans[i]);      \
3535       }                                         \
3536     else                                        \
3537       {                                         \
3538         for (i = 0; i < SBC_MAX; ++i)           \
3539           if (ctype_func (i))                   \
3540             bitset_set (sbcset, i);             \
3541       }                                         \
3542   } while (0)
3543
3544   if (strcmp (name, "alnum") == 0)
3545     BUILD_CHARCLASS_LOOP (isalnum);
3546   else if (strcmp (name, "cntrl") == 0)
3547     BUILD_CHARCLASS_LOOP (iscntrl);
3548   else if (strcmp (name, "lower") == 0)
3549     BUILD_CHARCLASS_LOOP (islower);
3550   else if (strcmp (name, "space") == 0)
3551     BUILD_CHARCLASS_LOOP (isspace);
3552   else if (strcmp (name, "alpha") == 0)
3553     BUILD_CHARCLASS_LOOP (isalpha);
3554   else if (strcmp (name, "digit") == 0)
3555     BUILD_CHARCLASS_LOOP (isdigit);
3556   else if (strcmp (name, "print") == 0)
3557     BUILD_CHARCLASS_LOOP (isprint);
3558   else if (strcmp (name, "upper") == 0)
3559     BUILD_CHARCLASS_LOOP (isupper);
3560   else if (strcmp (name, "blank") == 0)
3561     BUILD_CHARCLASS_LOOP (isblank);
3562   else if (strcmp (name, "graph") == 0)
3563     BUILD_CHARCLASS_LOOP (isgraph);
3564   else if (strcmp (name, "punct") == 0)
3565     BUILD_CHARCLASS_LOOP (ispunct);
3566   else if (strcmp (name, "xdigit") == 0)
3567     BUILD_CHARCLASS_LOOP (isxdigit);
3568   else
3569     return REG_ECTYPE;
3570
3571   return REG_NOERROR;
3572 }
3573
3574 static bin_tree_t *
3575 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576                     const unsigned char *class_name,
3577                     const unsigned char *extra, int non_match,
3578                     reg_errcode_t *err)
3579 {
3580   re_bitset_ptr_t sbcset;
3581 #ifdef RE_ENABLE_I18N
3582   re_charset_t *mbcset;
3583   int alloc = 0;
3584 #endif /* not RE_ENABLE_I18N */
3585   reg_errcode_t ret;
3586   re_token_t br_token;
3587   bin_tree_t *tree;
3588
3589   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590 #ifdef RE_ENABLE_I18N
3591   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592 #endif /* RE_ENABLE_I18N */
3593
3594 #ifdef RE_ENABLE_I18N
3595   if (BE (sbcset == NULL || mbcset == NULL, 0))
3596 #else /* not RE_ENABLE_I18N */
3597   if (BE (sbcset == NULL, 0))
3598 #endif /* not RE_ENABLE_I18N */
3599     {
3600       *err = REG_ESPACE;
3601       return NULL;
3602     }
3603
3604   if (non_match)
3605     {
3606 #ifdef RE_ENABLE_I18N
3607       mbcset->non_match = 1;
3608 #endif /* not RE_ENABLE_I18N */
3609     }
3610
3611   /* We don't care the syntax in this case.  */
3612   ret = build_charclass (trans, sbcset,
3613 #ifdef RE_ENABLE_I18N
3614                          mbcset, &alloc,
3615 #endif /* RE_ENABLE_I18N */
3616                          class_name, 0);
3617
3618   if (BE (ret != REG_NOERROR, 0))
3619     {
3620       re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622       free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624       *err = ret;
3625       return NULL;
3626     }
3627   /* \w match '_' also.  */
3628   for (; *extra; extra++)
3629     bitset_set (sbcset, *extra);
3630
3631   /* If it is non-matching list.  */
3632   if (non_match)
3633     bitset_not (sbcset);
3634
3635 #ifdef RE_ENABLE_I18N
3636   /* Ensure only single byte characters are set.  */
3637   if (dfa->mb_cur_max > 1)
3638     bitset_mask (sbcset, dfa->sb_char);
3639 #endif
3640
3641   /* Build a tree for simple bracket.  */
3642   br_token.type = SIMPLE_BRACKET;
3643   br_token.opr.sbcset = sbcset;
3644   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645   if (BE (tree == NULL, 0))
3646     goto build_word_op_espace;
3647
3648 #ifdef RE_ENABLE_I18N
3649   if (dfa->mb_cur_max > 1)
3650     {
3651       bin_tree_t *mbc_tree;
3652       /* Build a tree for complex bracket.  */
3653       br_token.type = COMPLEX_BRACKET;
3654       br_token.opr.mbcset = mbcset;
3655       dfa->has_mb_node = 1;
3656       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657       if (BE (mbc_tree == NULL, 0))
3658         goto build_word_op_espace;
3659       /* Then join them by ALT node.  */
3660       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661       if (BE (mbc_tree != NULL, 1))
3662         return tree;
3663     }
3664   else
3665     {
3666       free_charset (mbcset);
3667       return tree;
3668     }
3669 #else /* not RE_ENABLE_I18N */
3670   return tree;
3671 #endif /* not RE_ENABLE_I18N */
3672
3673  build_word_op_espace:
3674   re_free (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676   free_charset (mbcset);
3677 #endif /* RE_ENABLE_I18N */
3678   *err = REG_ESPACE;
3679   return NULL;
3680 }
3681
3682 /* This is intended for the expressions like "a{1,3}".
3683    Fetch a number from `input', and return the number.
3684    Return -1, if the number field is empty like "{,1}".
3685    Return -2, If an error is occured.  */
3686
3687 static int
3688 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689 {
3690   int num = -1;
3691   unsigned char c;
3692   while (1)
3693     {
3694       fetch_token (token, input, syntax);
3695       c = token->opr.c;
3696       if (BE (token->type == END_OF_RE, 0))
3697         return -2;
3698       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699         break;
3700       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3701              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3702       num = (num > RE_DUP_MAX) ? -2 : num;
3703     }
3704   return num;
3705 }
3706 \f
3707 #ifdef RE_ENABLE_I18N
3708 static void
3709 free_charset (re_charset_t *cset)
3710 {
3711   re_free (cset->mbchars);
3712 # ifdef _LIBC
3713   re_free (cset->coll_syms);
3714   re_free (cset->equiv_classes);
3715   re_free (cset->range_starts);
3716   re_free (cset->range_ends);
3717 # endif
3718   re_free (cset->char_classes);
3719   re_free (cset);
3720 }
3721 #endif /* RE_ENABLE_I18N */
3722 \f
3723 /* Functions for binary tree operation.  */
3724
3725 /* Create a tree node.  */
3726
3727 static bin_tree_t *
3728 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729              re_token_type_t type)
3730 {
3731   re_token_t t;
3732   t.type = type;
3733   return create_token_tree (dfa, left, right, &t);
3734 }
3735
3736 static bin_tree_t *
3737 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738                    const re_token_t *token)
3739 {
3740   bin_tree_t *tree;
3741   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3742     {
3743       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3744
3745       if (storage == NULL)
3746         return NULL;
3747       storage->next = dfa->str_tree_storage;
3748       dfa->str_tree_storage = storage;
3749       dfa->str_tree_storage_idx = 0;
3750     }
3751   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3752
3753   tree->parent = NULL;
3754   tree->left = left;
3755   tree->right = right;
3756   tree->token = *token;
3757   tree->token.duplicated = 0;
3758   tree->token.opt_subexp = 0;
3759   tree->first = NULL;
3760   tree->next = NULL;
3761   tree->node_idx = -1;
3762
3763   if (left != NULL)
3764     left->parent = tree;
3765   if (right != NULL)
3766     right->parent = tree;
3767   return tree;
3768 }
3769
3770 /* Mark the tree SRC as an optional subexpression.
3771    To be called from preorder or postorder.  */
3772
3773 static reg_errcode_t
3774 mark_opt_subexp (void *extra, bin_tree_t *node)
3775 {
3776   int idx = (int) (long) extra;
3777   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778     node->token.opt_subexp = 1;
3779
3780   return REG_NOERROR;
3781 }
3782
3783 /* Free the allocated memory inside NODE. */
3784
3785 static void
3786 free_token (re_token_t *node)
3787 {
3788 #ifdef RE_ENABLE_I18N
3789   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790     free_charset (node->opr.mbcset);
3791   else
3792 #endif /* RE_ENABLE_I18N */
3793     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794       re_free (node->opr.sbcset);
3795 }
3796
3797 /* Worker function for tree walking.  Free the allocated memory inside NODE
3798    and its children. */
3799
3800 static reg_errcode_t
3801 free_tree (void *extra, bin_tree_t *node)
3802 {
3803   free_token (&node->token);
3804   return REG_NOERROR;
3805 }
3806
3807
3808 /* Duplicate the node SRC, and return new node.  This is a preorder
3809    visit similar to the one implemented by the generic visitor, but
3810    we need more infrastructure to maintain two parallel trees --- so,
3811    it's easier to duplicate.  */
3812
3813 static bin_tree_t *
3814 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3815 {
3816   const bin_tree_t *node;
3817   bin_tree_t *dup_root;
3818   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3819
3820   for (node = root; ; )
3821     {
3822       /* Create a new tree and link it back to the current parent.  */
3823       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824       if (*p_new == NULL)
3825         return NULL;
3826       (*p_new)->parent = dup_node;
3827       (*p_new)->token.duplicated = 1;
3828       dup_node = *p_new;
3829
3830       /* Go to the left node, or up and to the right.  */
3831       if (node->left)
3832         {
3833           node = node->left;
3834           p_new = &dup_node->left;
3835         }
3836       else
3837         {
3838           const bin_tree_t *prev = NULL;
3839           while (node->right == prev || node->right == NULL)
3840             {
3841               prev = node;
3842               node = node->parent;
3843               dup_node = dup_node->parent;
3844               if (!node)
3845                 return dup_root;
3846             }
3847           node = node->right;
3848           p_new = &dup_node->right;
3849         }
3850     }
3851 }