compat/regex/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19    02110-1301 USA.  */
  20
  21 #include <stdint.h>
  22
  23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  24                                           size_t length, reg_syntax_t syntax);
  25 static void re_compile_fastmap_iter (regex_t *bufp,
  26                                      const re_dfastate_t *init_state,
  27                                      char *fastmap);
  28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  29 #ifdef RE_ENABLE_I18N
  30 static void free_charset (re_charset_t *cset);
  31 #endif /* RE_ENABLE_I18N */
  32 static void free_workarea_compile (regex_t *preg);
  33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  34 #ifdef RE_ENABLE_I18N
  35 static void optimize_utf8 (re_dfa_t *dfa);
  36 #endif
  37 static reg_errcode_t analyze (regex_t *preg);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  52 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  53                                    unsigned int constraint);
  54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  56                                          int node, int root);
  57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  58 static int fetch_number (re_string_t *input, re_token_t *token,
  59                          reg_syntax_t syntax);
  60 static int peek_token (re_token_t *token, re_string_t *input,
  61                         reg_syntax_t syntax) internal_function;
  62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  63                           reg_syntax_t syntax, reg_errcode_t *err);
  64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  65                                   re_token_t *token, reg_syntax_t syntax,
  66                                   int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  68                                  re_token_t *token, reg_syntax_t syntax,
  69                                  int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  71                                      re_token_t *token, reg_syntax_t syntax,
  72                                      int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  74                                   re_token_t *token, reg_syntax_t syntax,
  75                                   int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  77                                  re_dfa_t *dfa, re_token_t *token,
  78                                  reg_syntax_t syntax, reg_errcode_t *err);
  79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  80                                       re_token_t *token, reg_syntax_t syntax,
  81                                       reg_errcode_t *err);
  82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  83                                             re_string_t *regexp,
  84                                             re_token_t *token, int token_len,
  85                                             re_dfa_t *dfa,
  86                                             reg_syntax_t syntax,
  87                                             int accept_hyphen);
  88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  89                                           re_string_t *regexp,
  90                                           re_token_t *token);
  91 #ifdef RE_ENABLE_I18N
  92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  93                                         re_charset_t *mbcset,
  94                                         int *equiv_class_alloc,
  95                                         const unsigned char *name);
  96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  97                                       bitset_t sbcset,
  98                                       re_charset_t *mbcset,
  99                                       int *char_class_alloc,
 100                                       const char *class_name,
 101                                       reg_syntax_t syntax);
 102 #else  /* not RE_ENABLE_I18N */
 103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 104                                         const unsigned char *name);
 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 106                                       bitset_t sbcset,
 107                                       const char *class_name,
 108                                       reg_syntax_t syntax);
 109 #endif /* not RE_ENABLE_I18N */
 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 111                                        RE_TRANSLATE_TYPE trans,
 112                                        const char *class_name,
 113                                        const char *extra,
 114                                        int non_match, reg_errcode_t *err);
 115 static bin_tree_t *create_tree (re_dfa_t *dfa,
 116                                 bin_tree_t *left, bin_tree_t *right,
 117                                 re_token_type_t type);
 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 119                                       bin_tree_t *left, bin_tree_t *right,
 120                                       const re_token_t *token);
 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 122 static void free_token (re_token_t *node);
 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 125 \f
 126 /* This table gives an error message for each of the error codes listed
 127    in regex.h.  Obviously the order here has to be same as there.
 128    POSIX doesn't require that we do anything for REG_NOERROR,
 129    but why not be nice?  */
 130
 131 const char __re_error_msgid[] attribute_hidden =
 132   {
 133 #define REG_NOERROR_IDX 0
 134     gettext_noop ("Success")    /* REG_NOERROR */
 135     "\0"
 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 137     gettext_noop ("No match")   /* REG_NOMATCH */
 138     "\0"
 139 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 141     "\0"
 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 144     "\0"
 145 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 147     "\0"
 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 150     "\0"
 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 153     "\0"
 154 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 155     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 156     "\0"
 157 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 159     "\0"
 160 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 162     "\0"
 163 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 165     "\0"
 166 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 167     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 168     "\0"
 169 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 171     "\0"
 172 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 174     "\0"
 175 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 177     "\0"
 178 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 180     "\0"
 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 183   };
 184
 185 const size_t __re_error_msgid_idx[] attribute_hidden =
 186   {
 187     REG_NOERROR_IDX,
 188     REG_NOMATCH_IDX,
 189     REG_BADPAT_IDX,
 190     REG_ECOLLATE_IDX,
 191     REG_ECTYPE_IDX,
 192     REG_EESCAPE_IDX,
 193     REG_ESUBREG_IDX,
 194     REG_EBRACK_IDX,
 195     REG_EPAREN_IDX,
 196     REG_EBRACE_IDX,
 197     REG_BADBR_IDX,
 198     REG_ERANGE_IDX,
 199     REG_ESPACE_IDX,
 200     REG_BADRPT_IDX,
 201     REG_EEND_IDX,
 202     REG_ESIZE_IDX,
 203     REG_ERPAREN_IDX
 204   };
 205 \f
 206 /* Entry points for GNU code.  */
 207
 208
 209 #ifdef ZOS_USS
 210
 211 /* For ZOS USS we must define btowc */
 212
 213 wchar_t
 214 btowc (int c)
 215 {
 216    wchar_t wtmp[2];
 217    char tmp[2];
 218
 219    tmp[0] = c;
 220    tmp[1] = 0;
 221
 222    mbtowc (wtmp, tmp, 1);
 223    return wtmp[0];
 224 }
 225 #endif
 226
 227 /* re_compile_pattern is the GNU regular expression compiler: it
 228    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 229    Returns 0 if the pattern was valid, otherwise an error string.
 230
 231    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 232    are set in BUFP on entry.  */
 233
 234 const char *
 235 re_compile_pattern (const char *pattern,
 236                     size_t length,
 237                     struct re_pattern_buffer *bufp)
 238 {
 239   reg_errcode_t ret;
 240
 241   /* And GNU code determines whether or not to get register information
 242      by passing null for the REGS argument to re_match, etc., not by
 243      setting no_sub, unless RE_NO_SUB is set.  */
 244   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 245
 246   /* Match anchors at newline.  */
 247   bufp->newline_anchor = 1;
 248
 249   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 250
 251   if (!ret)
 252     return NULL;
 253   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 254 }
 255 #ifdef _LIBC
 256 weak_alias (__re_compile_pattern, re_compile_pattern)
 257 #endif
 258
 259 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 260    also be assigned to arbitrarily: each pattern buffer stores its own
 261    syntax, so it can be changed between regex compilations.  */
 262 /* This has no initializer because initialized variables in Emacs
 263    become read-only after dumping.  */
 264 reg_syntax_t re_syntax_options;
 265
 266
 267 /* Specify the precise syntax of regexps for compilation.  This provides
 268    for compatibility for various utilities which historically have
 269    different, incompatible syntaxes.
 270
 271    The argument SYNTAX is a bit mask comprised of the various bits
 272    defined in regex.h.  We return the old syntax.  */
 273
 274 reg_syntax_t
 275 re_set_syntax (reg_syntax_t syntax)
 276 {
 277   reg_syntax_t ret = re_syntax_options;
 278
 279   re_syntax_options = syntax;
 280   return ret;
 281 }
 282 #ifdef _LIBC
 283 weak_alias (__re_set_syntax, re_set_syntax)
 284 #endif
 285
 286 int
 287 re_compile_fastmap (struct re_pattern_buffer *bufp)
 288 {
 289   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 290   char *fastmap = bufp->fastmap;
 291
 292   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 293   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 294   if (dfa->init_state != dfa->init_state_word)
 295     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 296   if (dfa->init_state != dfa->init_state_nl)
 297     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 298   if (dfa->init_state != dfa->init_state_begbuf)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 300   bufp->fastmap_accurate = 1;
 301   return 0;
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 305 #endif
 306
 307 static inline void
 308 __attribute ((always_inline))
 309 re_set_fastmap (char *fastmap, int icase, int ch)
 310 {
 311   fastmap[ch] = 1;
 312   if (icase)
 313     fastmap[tolower (ch)] = 1;
 314 }
 315
 316 /* Helper function for re_compile_fastmap.
 317    Compile fastmap for the initial_state INIT_STATE.  */
 318
 319 static void
 320 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 321                          char *fastmap)
 322 {
 323   volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 324   int node_cnt;
 325   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 326   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 327     {
 328       int node = init_state->nodes.elems[node_cnt];
 329       re_token_type_t type = dfa->nodes[node].type;
 330
 331       if (type == CHARACTER)
 332         {
 333           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 334 #ifdef RE_ENABLE_I18N
 335           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 336             {
 337               unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 338               wchar_t wc;
 339               mbstate_t state;
 340
 341               p = buf;
 342               *p++ = dfa->nodes[node].opr.c;
 343               while (++node < dfa->nodes_len
 344                      && dfa->nodes[node].type == CHARACTER
 345                      && dfa->nodes[node].mb_partial)
 346                 *p++ = dfa->nodes[node].opr.c;
 347               memset (&state, '\0', sizeof (state));
 348               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 349                              &state) == p - buf
 350                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 351                       != (size_t) -1))
 352                 re_set_fastmap (fastmap, 0, buf[0]);
 353               re_free (buf);
 354             }
 355 #endif
 356         }
 357       else if (type == SIMPLE_BRACKET)
 358         {
 359           int i, ch;
 360           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 361             {
 362               int j;
 363               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 364               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 365                 if (w & ((bitset_word_t) 1 << j))
 366                   re_set_fastmap (fastmap, icase, ch);
 367             }
 368         }
 369 #ifdef RE_ENABLE_I18N
 370       else if (type == COMPLEX_BRACKET)
 371         {
 372           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 373           int i;
 374
 375 # ifdef _LIBC
 376           /* See if we have to try all bytes which start multiple collation
 377              elements.
 378              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 379                   collation element, and don't catch 'b' since 'b' is
 380                   the only collation element which starts from 'b' (and
 381                   it is caught by SIMPLE_BRACKET).  */
 382               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 383                   && (cset->ncoll_syms || cset->nranges))
 384                 {
 385                   const int32_t *table = (const int32_t *)
 386                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 387                   for (i = 0; i < SBC_MAX; ++i)
 388                     if (table[i] < 0)
 389                       re_set_fastmap (fastmap, icase, i);
 390                 }
 391 # endif /* _LIBC */
 392
 393           /* See if we have to start the match at all multibyte characters,
 394              i.e. where we would not find an invalid sequence.  This only
 395              applies to multibyte character sets; for single byte character
 396              sets, the SIMPLE_BRACKET again suffices.  */
 397           if (dfa->mb_cur_max > 1
 398               && (cset->nchar_classes || cset->non_match || cset->nranges
 399 # ifdef _LIBC
 400                   || cset->nequiv_classes
 401 # endif /* _LIBC */
 402                  ))
 403             {
 404               unsigned char c = 0;
 405               do
 406                 {
 407                   mbstate_t mbs;
 408                   memset (&mbs, 0, sizeof (mbs));
 409                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 410                     re_set_fastmap (fastmap, false, (int) c);
 411                 }
 412               while (++c != 0);
 413             }
 414
 415           else
 416             {
 417               /* ... Else catch all bytes which can start the mbchars.  */
 418               for (i = 0; i < cset->nmbchars; ++i)
 419                 {
 420                   char buf[256];
 421                   mbstate_t state;
 422                   memset (&state, '\0', sizeof (state));
 423                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 424                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 425                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 426                     {
 427                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 428                           != (size_t) -1)
 429                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 430                     }
 431                 }
 432             }
 433         }
 434 #endif /* RE_ENABLE_I18N */
 435       else if (type == OP_PERIOD
 436 #ifdef RE_ENABLE_I18N
 437                || type == OP_UTF8_PERIOD
 438 #endif /* RE_ENABLE_I18N */
 439                || type == END_OF_RE)
 440         {
 441           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 442           if (type == END_OF_RE)
 443             bufp->can_be_null = 1;
 444           return;
 445         }
 446     }
 447 }
 448 \f
 449 /* Entry point for POSIX code.  */
 450 /* regcomp takes a regular expression as a string and compiles it.
 451
 452    PREG is a regex_t *.  We do not expect any fields to be initialized,
 453    since POSIX says we shouldn't.  Thus, we set
 454
 455      `buffer' to the compiled pattern;
 456      `used' to the length of the compiled pattern;
 457      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 458        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 459        RE_SYNTAX_POSIX_BASIC;
 460      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 461      `fastmap' to an allocated space for the fastmap;
 462      `fastmap_accurate' to zero;
 463      `re_nsub' to the number of subexpressions in PATTERN.
 464
 465    PATTERN is the address of the pattern string.
 466
 467    CFLAGS is a series of bits which affect compilation.
 468
 469      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 470      use POSIX basic syntax.
 471
 472      If REG_NEWLINE is set, then . and [^...] don't match newline.
 473      Also, regexec will try a match beginning after every newline.
 474
 475      If REG_ICASE is set, then we considers upper- and lowercase
 476      versions of letters to be equivalent when matching.
 477
 478      If REG_NOSUB is set, then when PREG is passed to regexec, that
 479      routine will report only success or failure, and nothing about the
 480      registers.
 481
 482    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 483    the return codes and their meanings.)  */
 484
 485 int
 486 regcomp (regex_t *__restrict preg,
 487          const char *__restrict pattern,
 488          int cflags)
 489 {
 490   reg_errcode_t ret;
 491   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 492                          : RE_SYNTAX_POSIX_BASIC);
 493
 494   preg->buffer = NULL;
 495   preg->allocated = 0;
 496   preg->used = 0;
 497
 498   /* Try to allocate space for the fastmap.  */
 499   preg->fastmap = re_malloc (char, SBC_MAX);
 500   if (BE (preg->fastmap == NULL, 0))
 501     return REG_ESPACE;
 502
 503   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 504
 505   /* If REG_NEWLINE is set, newlines are treated differently.  */
 506   if (cflags & REG_NEWLINE)
 507     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 508       syntax &= ~RE_DOT_NEWLINE;
 509       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 510       /* It also changes the matching behavior.  */
 511       preg->newline_anchor = 1;
 512     }
 513   else
 514     preg->newline_anchor = 0;
 515   preg->no_sub = !!(cflags & REG_NOSUB);
 516   preg->translate = NULL;
 517
 518   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 519
 520   /* POSIX doesn't distinguish between an unmatched open-group and an
 521      unmatched close-group: both are REG_EPAREN.  */
 522   if (ret == REG_ERPAREN)
 523     ret = REG_EPAREN;
 524
 525   /* We have already checked preg->fastmap != NULL.  */
 526   if (BE (ret == REG_NOERROR, 1))
 527     /* Compute the fastmap now, since regexec cannot modify the pattern
 528        buffer.  This function never fails in this implementation.  */
 529     (void) re_compile_fastmap (preg);
 530   else
 531     {
 532       /* Some error occurred while compiling the expression.  */
 533       re_free (preg->fastmap);
 534       preg->fastmap = NULL;
 535     }
 536
 537   return (int) ret;
 538 }
 539 #ifdef _LIBC
 540 weak_alias (__regcomp, regcomp)
 541 #endif
 542
 543 /* Returns a message corresponding to an error code, ERRCODE, returned
 544    from either regcomp or regexec.   We don't use PREG here.  */
 545
 546 size_t
 547 regerror(int errcode, const regex_t *__restrict preg,
 548          char *__restrict errbuf, size_t errbuf_size)
 549 {
 550   const char *msg;
 551   size_t msg_size;
 552
 553   if (BE (errcode < 0
 554           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 555                                / sizeof (__re_error_msgid_idx[0])), 0))
 556     /* Only error codes returned by the rest of the code should be passed
 557        to this routine.  If we are given anything else, or if other regex
 558        code generates an invalid error code, then the program has a bug.
 559        Dump core so we can fix it.  */
 560     abort ();
 561
 562   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 563
 564   msg_size = strlen (msg) + 1; /* Includes the null.  */
 565
 566   if (BE (errbuf_size != 0, 1))
 567     {
 568       if (BE (msg_size > errbuf_size, 0))
 569         {
 570           memcpy (errbuf, msg, errbuf_size - 1);
 571           errbuf[errbuf_size - 1] = 0;
 572         }
 573       else
 574         memcpy (errbuf, msg, msg_size);
 575     }
 576
 577   return msg_size;
 578 }
 579 #ifdef _LIBC
 580 weak_alias (__regerror, regerror)
 581 #endif
 582
 583
 584 #ifdef RE_ENABLE_I18N
 585 /* This static array is used for the map to single-byte characters when
 586    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 587    it the same all the time.  UTF-8 is the preferred encoding so this is
 588    a worthwhile optimization.  */
 589 #if __GNUC__ >= 3
 590 static const bitset_t utf8_sb_map = {
 591   /* Set the first 128 bits.  */
 592   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 593 };
 594 #else /* ! (__GNUC__ >= 3) */
 595 static bitset_t utf8_sb_map;
 596 #endif /* __GNUC__ >= 3 */
 597 #endif /* RE_ENABLE_I18N */
 598
 599
 600 static void
 601 free_dfa_content (re_dfa_t *dfa)
 602 {
 603   int i, j;
 604
 605   if (dfa->nodes)
 606     for (i = 0; i < dfa->nodes_len; ++i)
 607       free_token (dfa->nodes + i);
 608   re_free (dfa->nexts);
 609   for (i = 0; i < dfa->nodes_len; ++i)
 610     {
 611       if (dfa->eclosures != NULL)
 612         re_node_set_free (dfa->eclosures + i);
 613       if (dfa->inveclosures != NULL)
 614         re_node_set_free (dfa->inveclosures + i);
 615       if (dfa->edests != NULL)
 616         re_node_set_free (dfa->edests + i);
 617     }
 618   re_free (dfa->edests);
 619   re_free (dfa->eclosures);
 620   re_free (dfa->inveclosures);
 621   re_free (dfa->nodes);
 622
 623   if (dfa->state_table)
 624     for (i = 0; i <= dfa->state_hash_mask; ++i)
 625       {
 626         struct re_state_table_entry *entry = dfa->state_table + i;
 627         for (j = 0; j < entry->num; ++j)
 628           {
 629             re_dfastate_t *state = entry->array[j];
 630             free_state (state);
 631           }
 632         re_free (entry->array);
 633       }
 634   re_free (dfa->state_table);
 635 #ifdef RE_ENABLE_I18N
 636   if (dfa->sb_char != utf8_sb_map)
 637     re_free (dfa->sb_char);
 638 #endif
 639   re_free (dfa->subexp_map);
 640 #ifdef DEBUG
 641   re_free (dfa->re_str);
 642 #endif
 643
 644   re_free (dfa);
 645 }
 646
 647
 648 /* Free dynamically allocated space used by PREG.  */
 649
 650 void
 651 regfree (regex_t *preg)
 652 {
 653   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 654   if (BE (dfa != NULL, 1))
 655     free_dfa_content (dfa);
 656   preg->buffer = NULL;
 657   preg->allocated = 0;
 658
 659   re_free (preg->fastmap);
 660   preg->fastmap = NULL;
 661
 662   re_free (preg->translate);
 663   preg->translate = NULL;
 664 }
 665 #ifdef _LIBC
 666 weak_alias (__regfree, regfree)
 667 #endif
 668 \f
 669 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 670    them unless specifically requested.  */
 671
 672 #if defined _REGEX_RE_COMP || defined _LIBC
 673
 674 /* BSD has one and only one pattern buffer.  */
 675 static struct re_pattern_buffer re_comp_buf;
 676
 677 char *
 678 # ifdef _LIBC
 679 /* Make these definitions weak in libc, so POSIX programs can redefine
 680    these names if they don't use our functions, and still use
 681    regcomp/regexec above without link errors.  */
 682 weak_function
 683 # endif
 684 re_comp (s)
 685      const char *s;
 686 {
 687   reg_errcode_t ret;
 688   char *fastmap;
 689
 690   if (!s)
 691     {
 692       if (!re_comp_buf.buffer)
 693         return gettext ("No previous regular expression");
 694       return 0;
 695     }
 696
 697   if (re_comp_buf.buffer)
 698     {
 699       fastmap = re_comp_buf.fastmap;
 700       re_comp_buf.fastmap = NULL;
 701       __regfree (&re_comp_buf);
 702       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 703       re_comp_buf.fastmap = fastmap;
 704     }
 705
 706   if (re_comp_buf.fastmap == NULL)
 707     {
 708       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 709       if (re_comp_buf.fastmap == NULL)
 710         return (char *) gettext (__re_error_msgid
 711                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 712     }
 713
 714   /* Since `re_exec' always passes NULL for the `regs' argument, we
 715      don't need to initialize the pattern buffer fields which affect it.  */
 716
 717   /* Match anchors at newlines.  */
 718   re_comp_buf.newline_anchor = 1;
 719
 720   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 721
 722   if (!ret)
 723     return NULL;
 724
 725   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 726   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 727 }
 728
 729 #ifdef _LIBC
 730 libc_freeres_fn (free_mem)
 731 {
 732   __regfree (&re_comp_buf);
 733 }
 734 #endif
 735
 736 #endif /* _REGEX_RE_COMP */
 737 \f
 738 /* Internal entry point.
 739    Compile the regular expression PATTERN, whose length is LENGTH.
 740    SYNTAX indicate regular expression's syntax.  */
 741
 742 static reg_errcode_t
 743 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 744                      reg_syntax_t syntax)
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 785   dfa->re_str = re_malloc (char, length + 1);
 786   strncpy (dfa->re_str, pattern, length + 1);
 787 #endif
 788
 789   __libc_lock_init (dfa->lock);
 790
 791   err = re_string_construct (&regexp, pattern, length, preg->translate,
 792                              syntax & RE_ICASE, dfa);
 793   if (BE (err != REG_NOERROR, 0))
 794     {
 795     re_compile_internal_free_return:
 796       free_workarea_compile (preg);
 797       re_string_destruct (&regexp);
 798       free_dfa_content (dfa);
 799       preg->buffer = NULL;
 800       preg->allocated = 0;
 801       return err;
 802     }
 803
 804   /* Parse the regular expression, and build a structure tree.  */
 805   preg->re_nsub = 0;
 806   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 807   if (BE (dfa->str_tree == NULL, 0))
 808     goto re_compile_internal_free_return;
 809
 810   /* Analyze the tree and create the nfa.  */
 811   err = analyze (preg);
 812   if (BE (err != REG_NOERROR, 0))
 813     goto re_compile_internal_free_return;
 814
 815 #ifdef RE_ENABLE_I18N
 816   /* If possible, do searching in single byte encoding to speed things up.  */
 817   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 818     optimize_utf8 (dfa);
 819 #endif
 820
 821   /* Then create the initial state of the dfa.  */
 822   err = create_initial_state (dfa);
 823
 824   /* Release work areas.  */
 825   free_workarea_compile (preg);
 826   re_string_destruct (&regexp);
 827
 828   if (BE (err != REG_NOERROR, 0))
 829     {
 830       free_dfa_content (dfa);
 831       preg->buffer = NULL;
 832       preg->allocated = 0;
 833     }
 834
 835   return err;
 836 }
 837
 838 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 839    as the initial length of some arrays.  */
 840
 841 static reg_errcode_t
 842 init_dfa (re_dfa_t *dfa, size_t pat_len)
 843 {
 844   unsigned int table_size;
 845 #ifndef _LIBC
 846   char *codeset_name;
 847 #endif
 848
 849   memset (dfa, '\0', sizeof (re_dfa_t));
 850
 851   /* Force allocation of str_tree_storage the first time.  */
 852   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 853
 854   /* Avoid overflows.  */
 855   if (pat_len == SIZE_MAX)
 856     return REG_ESPACE;
 857
 858   dfa->nodes_alloc = pat_len + 1;
 859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; ; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877 # ifdef HAVE_LANGINFO_CODESET
 878   codeset_name = nl_langinfo (CODESET);
 879 # else
 880   codeset_name = getenv ("LC_ALL");
 881   if (codeset_name == NULL || codeset_name[0] == '\0')
 882     codeset_name = getenv ("LC_CTYPE");
 883   if (codeset_name == NULL || codeset_name[0] == '\0')
 884     codeset_name = getenv ("LANG");
 885   if (codeset_name == NULL)
 886     codeset_name = "";
 887   else if (strchr (codeset_name, '.') !=  NULL)
 888     codeset_name = strchr (codeset_name, '.') + 1;
 889 # endif
 890
 891   /* strcasecmp isn't a standard interface. brute force check */
 892 #if 0
 893   if (strcasecmp (codeset_name, "UTF-8") == 0
 894       || strcasecmp (codeset_name, "UTF8") == 0)
 895     dfa->is_utf8 = 1;
 896 #else
 897   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 898       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 899       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 900       && (codeset_name[3] == '-'
 901           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 902           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 903     dfa->is_utf8 = 1;
 904 #endif
 905
 906   /* We check exhaustively in the loop below if this charset is a
 907      superset of ASCII.  */
 908   dfa->map_notascii = 0;
 909 #endif
 910
 911 #ifdef RE_ENABLE_I18N
 912   if (dfa->mb_cur_max > 1)
 913     {
 914       if (dfa->is_utf8)
 915         {
 916 #if !defined(__GNUC__) || __GNUC__ < 3
 917           static short utf8_sb_map_inited = 0;
 918
 919           if (! utf8_sb_map_inited)
 920             {
 921                 int i;
 922
 923                 utf8_sb_map_inited = 0;
 924                 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 925                   utf8_sb_map[i] = BITSET_WORD_MAX;
 926             }
 927 #endif
 928           dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 929         }
 930       else
 931         {
 932           int i, j, ch;
 933
 934           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 935           if (BE (dfa->sb_char == NULL, 0))
 936             return REG_ESPACE;
 937
 938           /* Set the bits corresponding to single byte chars.  */
 939           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 940             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 941               {
 942                 wint_t wch = __btowc (ch);
 943                 if (wch != WEOF)
 944                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 945 # ifndef _LIBC
 946                 if (isascii (ch) && wch != ch)
 947                   dfa->map_notascii = 1;
 948 # endif
 949               }
 950         }
 951     }
 952 #endif
 953
 954   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 955     return REG_ESPACE;
 956   return REG_NOERROR;
 957 }
 958
 959 /* Initialize WORD_CHAR table, which indicate which character is
 960    "word".  In this case "word" means that it is the word construction
 961    character used by some operators like "\<", "\>", etc.  */
 962
 963 static void
 964 internal_function
 965 init_word_char (re_dfa_t *dfa)
 966 {
 967   int i, j, ch;
 968   dfa->word_ops_used = 1;
 969   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 970     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 971       if (isalnum (ch) || ch == '_')
 972         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 973 }
 974
 975 /* Free the work area which are only used while compiling.  */
 976
 977 static void
 978 free_workarea_compile (regex_t *preg)
 979 {
 980   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 981   bin_tree_storage_t *storage, *next;
 982   for (storage = dfa->str_tree_storage; storage; storage = next)
 983     {
 984       next = storage->next;
 985       re_free (storage);
 986     }
 987   dfa->str_tree_storage = NULL;
 988   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 989   dfa->str_tree = NULL;
 990   re_free (dfa->org_indices);
 991   dfa->org_indices = NULL;
 992 }
 993
 994 /* Create initial states for all contexts.  */
 995
 996 static reg_errcode_t
 997 create_initial_state (re_dfa_t *dfa)
 998 {
 999   int first, i;
1000   reg_errcode_t err;
1001   re_node_set init_nodes;
1002
1003   /* Initial states have the epsilon closure of the node which is
1004      the first node of the regular expression.  */
1005   first = dfa->str_tree->first->node_idx;
1006   dfa->init_node = first;
1007   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008   if (BE (err != REG_NOERROR, 0))
1009     return err;
1010
1011   /* The back-references which are in initial states can epsilon transit,
1012      since in this case all of the subexpressions can be null.
1013      Then we add epsilon closures of the nodes which are the next nodes of
1014      the back-references.  */
1015   if (dfa->nbackref > 0)
1016     for (i = 0; i < init_nodes.nelem; ++i)
1017       {
1018         int node_idx = init_nodes.elems[i];
1019         re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021         int clexp_idx;
1022         if (type != OP_BACK_REF)
1023           continue;
1024         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025           {
1026             re_token_t *clexp_node;
1027             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028             if (clexp_node->type == OP_CLOSE_SUBEXP
1029                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030               break;
1031           }
1032         if (clexp_idx == init_nodes.nelem)
1033           continue;
1034
1035         if (type == OP_BACK_REF)
1036           {
1037             int dest_idx = dfa->edests[node_idx].elems[0];
1038             if (!re_node_set_contains (&init_nodes, dest_idx))
1039               {
1040                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1041                                                        dfa->eclosures
1042                                                        + dest_idx);
1043                 if (err != REG_NOERROR)
1044                   return err;
1045                 i = 0;
1046               }
1047           }
1048       }
1049
1050   /* It must be the first time to invoke acquire_state.  */
1051   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1052   /* We don't check ERR here, since the initial state must not be NULL.  */
1053   if (BE (dfa->init_state == NULL, 0))
1054     return err;
1055   if (dfa->init_state->has_constraint)
1056     {
1057       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058                                                        CONTEXT_WORD);
1059       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060                                                      CONTEXT_NEWLINE);
1061       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1062                                                          &init_nodes,
1063                                                          CONTEXT_NEWLINE
1064                                                          | CONTEXT_BEGBUF);
1065       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1066               || dfa->init_state_begbuf == NULL, 0))
1067         return err;
1068     }
1069   else
1070     dfa->init_state_word = dfa->init_state_nl
1071       = dfa->init_state_begbuf = dfa->init_state;
1072
1073   re_node_set_free (&init_nodes);
1074   return REG_NOERROR;
1075 }
1076 \f
1077 #ifdef RE_ENABLE_I18N
1078 /* If it is possible to do searching in single byte encoding instead of UTF-8
1079    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080    DFA nodes where needed.  */
1081
1082 static void
1083 optimize_utf8 (re_dfa_t *dfa)
1084 {
1085   int node, i, mb_chars = 0, has_period = 0;
1086
1087   for (node = 0; node < dfa->nodes_len; ++node)
1088     switch (dfa->nodes[node].type)
1089       {
1090       case CHARACTER:
1091         if (dfa->nodes[node].opr.c >= 0x80)
1092           mb_chars = 1;
1093         break;
1094       case ANCHOR:
1095         switch (dfa->nodes[node].opr.ctx_type)
1096           {
1097           case LINE_FIRST:
1098           case LINE_LAST:
1099           case BUF_FIRST:
1100           case BUF_LAST:
1101             break;
1102           default:
1103             /* Word anchors etc. cannot be handled.  It's okay to test
1104                opr.ctx_type since constraints (for all DFA nodes) are
1105                created by ORing one or more opr.ctx_type values.  */
1106             return;
1107           }
1108         break;
1109       case OP_PERIOD:
1110         has_period = 1;
1111         break;
1112       case OP_BACK_REF:
1113       case OP_ALT:
1114       case END_OF_RE:
1115       case OP_DUP_ASTERISK:
1116       case OP_OPEN_SUBEXP:
1117       case OP_CLOSE_SUBEXP:
1118         break;
1119       case COMPLEX_BRACKET:
1120         return;
1121       case SIMPLE_BRACKET:
1122         /* Just double check.  The non-ASCII range starts at 0x80.  */
1123         assert (0x80 % BITSET_WORD_BITS == 0);
1124         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1125           if (dfa->nodes[node].opr.sbcset[i])
1126             return;
1127         break;
1128       default:
1129         abort ();
1130       }
1131
1132   if (mb_chars || has_period)
1133     for (node = 0; node < dfa->nodes_len; ++node)
1134       {
1135         if (dfa->nodes[node].type == CHARACTER
1136             && dfa->nodes[node].opr.c >= 0x80)
1137           dfa->nodes[node].mb_partial = 0;
1138         else if (dfa->nodes[node].type == OP_PERIOD)
1139           dfa->nodes[node].type = OP_UTF8_PERIOD;
1140       }
1141
1142   /* The search can be in single byte locale.  */
1143   dfa->mb_cur_max = 1;
1144   dfa->is_utf8 = 0;
1145   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1146 }
1147 #endif
1148 \f
1149 /* Analyze the structure tree, and calculate "first", "next", "edest",
1150    "eclosure", and "inveclosure".  */
1151
1152 static reg_errcode_t
1153 analyze (regex_t *preg)
1154 {
1155   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1156   reg_errcode_t ret;
1157
1158   /* Allocate arrays.  */
1159   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1160   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1161   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1162   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1163   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1164           || dfa->eclosures == NULL, 0))
1165     return REG_ESPACE;
1166
1167   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1168   if (dfa->subexp_map != NULL)
1169     {
1170       int i;
1171       for (i = 0; i < preg->re_nsub; i++)
1172         dfa->subexp_map[i] = i;
1173       preorder (dfa->str_tree, optimize_subexps, dfa);
1174       for (i = 0; i < preg->re_nsub; i++)
1175         if (dfa->subexp_map[i] != i)
1176           break;
1177       if (i == preg->re_nsub)
1178         {
1179           free (dfa->subexp_map);
1180           dfa->subexp_map = NULL;
1181         }
1182     }
1183
1184   ret = postorder (dfa->str_tree, lower_subexps, preg);
1185   if (BE (ret != REG_NOERROR, 0))
1186     return ret;
1187   ret = postorder (dfa->str_tree, calc_first, dfa);
1188   if (BE (ret != REG_NOERROR, 0))
1189     return ret;
1190   preorder (dfa->str_tree, calc_next, dfa);
1191   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1192   if (BE (ret != REG_NOERROR, 0))
1193     return ret;
1194   ret = calc_eclosure (dfa);
1195   if (BE (ret != REG_NOERROR, 0))
1196     return ret;
1197
1198   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1199      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1200   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1201       || dfa->nbackref)
1202     {
1203       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1204       if (BE (dfa->inveclosures == NULL, 0))
1205         return REG_ESPACE;
1206       ret = calc_inveclosure (dfa);
1207     }
1208
1209   return ret;
1210 }
1211
1212 /* Our parse trees are very unbalanced, so we cannot use a stack to
1213    implement parse tree visits.  Instead, we use parent pointers and
1214    some hairy code in these two functions.  */
1215 static reg_errcode_t
1216 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1217            void *extra)
1218 {
1219   bin_tree_t *node, *prev;
1220
1221   for (node = root; ; )
1222     {
1223       /* Descend down the tree, preferably to the left (or to the right
1224          if that's the only child).  */
1225       while (node->left || node->right)
1226         if (node->left)
1227           node = node->left;
1228         else
1229           node = node->right;
1230
1231       do
1232         {
1233           reg_errcode_t err = fn (extra, node);
1234           if (BE (err != REG_NOERROR, 0))
1235             return err;
1236           if (node->parent == NULL)
1237             return REG_NOERROR;
1238           prev = node;
1239           node = node->parent;
1240         }
1241       /* Go up while we have a node that is reached from the right.  */
1242       while (node->right == prev || node->right == NULL);
1243       node = node->right;
1244     }
1245 }
1246
1247 static reg_errcode_t
1248 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249           void *extra)
1250 {
1251   bin_tree_t *node;
1252
1253   for (node = root; ; )
1254     {
1255       reg_errcode_t err = fn (extra, node);
1256       if (BE (err != REG_NOERROR, 0))
1257         return err;
1258
1259       /* Go to the left node, or up and to the right.  */
1260       if (node->left)
1261         node = node->left;
1262       else
1263         {
1264           bin_tree_t *prev = NULL;
1265           while (node->right == prev || node->right == NULL)
1266             {
1267               prev = node;
1268               node = node->parent;
1269               if (!node)
1270                 return REG_NOERROR;
1271             }
1272           node = node->right;
1273         }
1274     }
1275 }
1276
1277 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1278    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1279    backreferences as well.  Requires a preorder visit.  */
1280 static reg_errcode_t
1281 optimize_subexps (void *extra, bin_tree_t *node)
1282 {
1283   re_dfa_t *dfa = (re_dfa_t *) extra;
1284
1285   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1286     {
1287       int idx = node->token.opr.idx;
1288       node->token.opr.idx = dfa->subexp_map[idx];
1289       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1290     }
1291
1292   else if (node->token.type == SUBEXP
1293            && node->left && node->left->token.type == SUBEXP)
1294     {
1295       int other_idx = node->left->token.opr.idx;
1296
1297       node->left = node->left->left;
1298       if (node->left)
1299         node->left->parent = node;
1300
1301       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1302       if (other_idx < BITSET_WORD_BITS)
1303           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1304     }
1305
1306   return REG_NOERROR;
1307 }
1308
1309 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1310    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1311 static reg_errcode_t
1312 lower_subexps (void *extra, bin_tree_t *node)
1313 {
1314   regex_t *preg = (regex_t *) extra;
1315   reg_errcode_t err = REG_NOERROR;
1316
1317   if (node->left && node->left->token.type == SUBEXP)
1318     {
1319       node->left = lower_subexp (&err, preg, node->left);
1320       if (node->left)
1321         node->left->parent = node;
1322     }
1323   if (node->right && node->right->token.type == SUBEXP)
1324     {
1325       node->right = lower_subexp (&err, preg, node->right);
1326       if (node->right)
1327         node->right->parent = node;
1328     }
1329
1330   return err;
1331 }
1332
1333 static bin_tree_t *
1334 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1335 {
1336   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1337   bin_tree_t *body = node->left;
1338   bin_tree_t *op, *cls, *tree1, *tree;
1339
1340   if (preg->no_sub
1341       /* We do not optimize empty subexpressions, because otherwise we may
1342          have bad CONCAT nodes with NULL children.  This is obviously not
1343          very common, so we do not lose much.  An example that triggers
1344          this case is the sed "script" /\(\)/x.  */
1345       && node->left != NULL
1346       && (node->token.opr.idx >= BITSET_WORD_BITS
1347           || !(dfa->used_bkref_map
1348                & ((bitset_word_t) 1 << node->token.opr.idx))))
1349     return node->left;
1350
1351   /* Convert the SUBEXP node to the concatenation of an
1352      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1353   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1354   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1355   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1356   tree = create_tree (dfa, op, tree1, CONCAT);
1357   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1358     {
1359       *err = REG_ESPACE;
1360       return NULL;
1361     }
1362
1363   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1364   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1365   return tree;
1366 }
1367
1368 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1369    nodes.  Requires a postorder visit.  */
1370 static reg_errcode_t
1371 calc_first (void *extra, bin_tree_t *node)
1372 {
1373   re_dfa_t *dfa = (re_dfa_t *) extra;
1374   if (node->token.type == CONCAT)
1375     {
1376       node->first = node->left->first;
1377       node->node_idx = node->left->node_idx;
1378     }
1379   else
1380     {
1381       node->first = node;
1382       node->node_idx = re_dfa_add_node (dfa, node->token);
1383       if (BE (node->node_idx == -1, 0))
1384         return REG_ESPACE;
1385       if (node->token.type == ANCHOR)
1386         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1387     }
1388   return REG_NOERROR;
1389 }
1390
1391 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1392 static reg_errcode_t
1393 calc_next (void *extra, bin_tree_t *node)
1394 {
1395   switch (node->token.type)
1396     {
1397     case OP_DUP_ASTERISK:
1398       node->left->next = node;
1399       break;
1400     case CONCAT:
1401       node->left->next = node->right->first;
1402       node->right->next = node->next;
1403       break;
1404     default:
1405       if (node->left)
1406         node->left->next = node->next;
1407       if (node->right)
1408         node->right->next = node->next;
1409       break;
1410     }
1411   return REG_NOERROR;
1412 }
1413
1414 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1415 static reg_errcode_t
1416 link_nfa_nodes (void *extra, bin_tree_t *node)
1417 {
1418   re_dfa_t *dfa = (re_dfa_t *) extra;
1419   int idx = node->node_idx;
1420   reg_errcode_t err = REG_NOERROR;
1421
1422   switch (node->token.type)
1423     {
1424     case CONCAT:
1425       break;
1426
1427     case END_OF_RE:
1428       assert (node->next == NULL);
1429       break;
1430
1431     case OP_DUP_ASTERISK:
1432     case OP_ALT:
1433       {
1434         int left, right;
1435         dfa->has_plural_match = 1;
1436         if (node->left != NULL)
1437           left = node->left->first->node_idx;
1438         else
1439           left = node->next->node_idx;
1440         if (node->right != NULL)
1441           right = node->right->first->node_idx;
1442         else
1443           right = node->next->node_idx;
1444         assert (left > -1);
1445         assert (right > -1);
1446         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1447       }
1448       break;
1449
1450     case ANCHOR:
1451     case OP_OPEN_SUBEXP:
1452     case OP_CLOSE_SUBEXP:
1453       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1454       break;
1455
1456     case OP_BACK_REF:
1457       dfa->nexts[idx] = node->next->node_idx;
1458       if (node->token.type == OP_BACK_REF)
1459         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1460       break;
1461
1462     default:
1463       assert (!IS_EPSILON_NODE (node->token.type));
1464       dfa->nexts[idx] = node->next->node_idx;
1465       break;
1466     }
1467
1468   return err;
1469 }
1470
1471 /* Duplicate the epsilon closure of the node ROOT_NODE.
1472    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1473    to their own constraint.  */
1474
1475 static reg_errcode_t
1476 internal_function
1477 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1478                         int root_node, unsigned int init_constraint)
1479 {
1480   int org_node, clone_node, ret;
1481   unsigned int constraint = init_constraint;
1482   for (org_node = top_org_node, clone_node = top_clone_node;;)
1483     {
1484       int org_dest, clone_dest;
1485       if (dfa->nodes[org_node].type == OP_BACK_REF)
1486         {
1487           /* If the back reference epsilon-transit, its destination must
1488              also have the constraint.  Then duplicate the epsilon closure
1489              of the destination of the back reference, and store it in
1490              edests of the back reference.  */
1491           org_dest = dfa->nexts[org_node];
1492           re_node_set_empty (dfa->edests + clone_node);
1493           clone_dest = duplicate_node (dfa, org_dest, constraint);
1494           if (BE (clone_dest == -1, 0))
1495             return REG_ESPACE;
1496           dfa->nexts[clone_node] = dfa->nexts[org_node];
1497           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498           if (BE (ret < 0, 0))
1499             return REG_ESPACE;
1500         }
1501       else if (dfa->edests[org_node].nelem == 0)
1502         {
1503           /* In case of the node can't epsilon-transit, don't duplicate the
1504              destination and store the original destination as the
1505              destination of the node.  */
1506           dfa->nexts[clone_node] = dfa->nexts[org_node];
1507           break;
1508         }
1509       else if (dfa->edests[org_node].nelem == 1)
1510         {
1511           /* In case of the node can epsilon-transit, and it has only one
1512              destination.  */
1513           org_dest = dfa->edests[org_node].elems[0];
1514           re_node_set_empty (dfa->edests + clone_node);
1515           /* If the node is root_node itself, it means the epsilon clsoure
1516              has a loop.   Then tie it to the destination of the root_node.  */
1517           if (org_node == root_node && clone_node != org_node)
1518             {
1519               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1520               if (BE (ret < 0, 0))
1521                 return REG_ESPACE;
1522               break;
1523             }
1524           /* In case of the node has another constraint, add it.  */
1525           constraint |= dfa->nodes[org_node].constraint;
1526           clone_dest = duplicate_node (dfa, org_dest, constraint);
1527           if (BE (clone_dest == -1, 0))
1528             return REG_ESPACE;
1529           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1530           if (BE (ret < 0, 0))
1531             return REG_ESPACE;
1532         }
1533       else /* dfa->edests[org_node].nelem == 2 */
1534         {
1535           /* In case of the node can epsilon-transit, and it has two
1536              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1537           org_dest = dfa->edests[org_node].elems[0];
1538           re_node_set_empty (dfa->edests + clone_node);
1539           /* Search for a duplicated node which satisfies the constraint.  */
1540           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1541           if (clone_dest == -1)
1542             {
1543               /* There is no such duplicated node, create a new one.  */
1544               reg_errcode_t err;
1545               clone_dest = duplicate_node (dfa, org_dest, constraint);
1546               if (BE (clone_dest == -1, 0))
1547                 return REG_ESPACE;
1548               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549               if (BE (ret < 0, 0))
1550                 return REG_ESPACE;
1551               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1552                                             root_node, constraint);
1553               if (BE (err != REG_NOERROR, 0))
1554                 return err;
1555             }
1556           else
1557             {
1558               /* There is a duplicated node which satisfies the constraint,
1559                  use it to avoid infinite loop.  */
1560               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561               if (BE (ret < 0, 0))
1562                 return REG_ESPACE;
1563             }
1564
1565           org_dest = dfa->edests[org_node].elems[1];
1566           clone_dest = duplicate_node (dfa, org_dest, constraint);
1567           if (BE (clone_dest == -1, 0))
1568             return REG_ESPACE;
1569           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570           if (BE (ret < 0, 0))
1571             return REG_ESPACE;
1572         }
1573       org_node = org_dest;
1574       clone_node = clone_dest;
1575     }
1576   return REG_NOERROR;
1577 }
1578
1579 /* Search for a node which is duplicated from the node ORG_NODE, and
1580    satisfies the constraint CONSTRAINT.  */
1581
1582 static int
1583 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1584                         unsigned int constraint)
1585 {
1586   int idx;
1587   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588     {
1589       if (org_node == dfa->org_indices[idx]
1590           && constraint == dfa->nodes[idx].constraint)
1591         return idx; /* Found.  */
1592     }
1593   return -1; /* Not found.  */
1594 }
1595
1596 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1597    Return the index of the new node, or -1 if insufficient storage is
1598    available.  */
1599
1600 static int
1601 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1602 {
1603   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604   if (BE (dup_idx != -1, 1))
1605     {
1606       dfa->nodes[dup_idx].constraint = constraint;
1607       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1608       dfa->nodes[dup_idx].duplicated = 1;
1609
1610       /* Store the index of the original node.  */
1611       dfa->org_indices[dup_idx] = org_idx;
1612     }
1613   return dup_idx;
1614 }
1615
1616 static reg_errcode_t
1617 calc_inveclosure (re_dfa_t *dfa)
1618 {
1619   int src, idx, ret;
1620   for (idx = 0; idx < dfa->nodes_len; ++idx)
1621     re_node_set_init_empty (dfa->inveclosures + idx);
1622
1623   for (src = 0; src < dfa->nodes_len; ++src)
1624     {
1625       int *elems = dfa->eclosures[src].elems;
1626       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1627         {
1628           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1629           if (BE (ret == -1, 0))
1630             return REG_ESPACE;
1631         }
1632     }
1633
1634   return REG_NOERROR;
1635 }
1636
1637 /* Calculate "eclosure" for all the node in DFA.  */
1638
1639 static reg_errcode_t
1640 calc_eclosure (re_dfa_t *dfa)
1641 {
1642   int node_idx, incomplete;
1643 #ifdef DEBUG
1644   assert (dfa->nodes_len > 0);
1645 #endif
1646   incomplete = 0;
1647   /* For each nodes, calculate epsilon closure.  */
1648   for (node_idx = 0; ; ++node_idx)
1649     {
1650       reg_errcode_t err;
1651       re_node_set eclosure_elem;
1652       if (node_idx == dfa->nodes_len)
1653         {
1654           if (!incomplete)
1655             break;
1656           incomplete = 0;
1657           node_idx = 0;
1658         }
1659
1660 #ifdef DEBUG
1661       assert (dfa->eclosures[node_idx].nelem != -1);
1662 #endif
1663
1664       /* If we have already calculated, skip it.  */
1665       if (dfa->eclosures[node_idx].nelem != 0)
1666         continue;
1667       /* Calculate epsilon closure of `node_idx'.  */
1668       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1669       if (BE (err != REG_NOERROR, 0))
1670         return err;
1671
1672       if (dfa->eclosures[node_idx].nelem == 0)
1673         {
1674           incomplete = 1;
1675           re_node_set_free (&eclosure_elem);
1676         }
1677     }
1678   return REG_NOERROR;
1679 }
1680
1681 /* Calculate epsilon closure of NODE.  */
1682
1683 static reg_errcode_t
1684 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1685 {
1686   reg_errcode_t err;
1687   int i;
1688   re_node_set eclosure;
1689   int ret;
1690   int incomplete = 0;
1691   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1692   if (BE (err != REG_NOERROR, 0))
1693     return err;
1694
1695   /* This indicates that we are calculating this node now.
1696      We reference this value to avoid infinite loop.  */
1697   dfa->eclosures[node].nelem = -1;
1698
1699   /* If the current node has constraints, duplicate all nodes
1700      since they must inherit the constraints.  */
1701   if (dfa->nodes[node].constraint
1702       && dfa->edests[node].nelem
1703       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1704     {
1705       err = duplicate_node_closure (dfa, node, node, node,
1706                                     dfa->nodes[node].constraint);
1707       if (BE (err != REG_NOERROR, 0))
1708         return err;
1709     }
1710
1711   /* Expand each epsilon destination nodes.  */
1712   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713     for (i = 0; i < dfa->edests[node].nelem; ++i)
1714       {
1715         re_node_set eclosure_elem;
1716         int edest = dfa->edests[node].elems[i];
1717         /* If calculating the epsilon closure of `edest' is in progress,
1718            return intermediate result.  */
1719         if (dfa->eclosures[edest].nelem == -1)
1720           {
1721             incomplete = 1;
1722             continue;
1723           }
1724         /* If we haven't calculated the epsilon closure of `edest' yet,
1725            calculate now. Otherwise use calculated epsilon closure.  */
1726         if (dfa->eclosures[edest].nelem == 0)
1727           {
1728             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1729             if (BE (err != REG_NOERROR, 0))
1730               return err;
1731           }
1732         else
1733           eclosure_elem = dfa->eclosures[edest];
1734         /* Merge the epsilon closure of `edest'.  */
1735         err = re_node_set_merge (&eclosure, &eclosure_elem);
1736         if (BE (err != REG_NOERROR, 0))
1737           return err;
1738         /* If the epsilon closure of `edest' is incomplete,
1739            the epsilon closure of this node is also incomplete.  */
1740         if (dfa->eclosures[edest].nelem == 0)
1741           {
1742             incomplete = 1;
1743             re_node_set_free (&eclosure_elem);
1744           }
1745       }
1746
1747   /* An epsilon closure includes itself.  */
1748   ret = re_node_set_insert (&eclosure, node);
1749   if (BE (ret < 0, 0))
1750     return REG_ESPACE;
1751   if (incomplete && !root)
1752     dfa->eclosures[node].nelem = 0;
1753   else
1754     dfa->eclosures[node] = eclosure;
1755   *new_set = eclosure;
1756   return REG_NOERROR;
1757 }
1758 \f
1759 /* Functions for token which are used in the parser.  */
1760
1761 /* Fetch a token from INPUT.
1762    We must not use this function inside bracket expressions.  */
1763
1764 static void
1765 internal_function
1766 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1767 {
1768   re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 }
1770
1771 /* Peek a token from INPUT, and return the length of the token.
1772    We must not use this function inside bracket expressions.  */
1773
1774 static int
1775 internal_function
1776 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1777 {
1778   unsigned char c;
1779
1780   if (re_string_eoi (input))
1781     {
1782       token->type = END_OF_RE;
1783       return 0;
1784     }
1785
1786   c = re_string_peek_byte (input, 0);
1787   token->opr.c = c;
1788
1789   token->word_char = 0;
1790 #ifdef RE_ENABLE_I18N
1791   token->mb_partial = 0;
1792   if (input->mb_cur_max > 1 &&
1793       !re_string_first_byte (input, re_string_cur_idx (input)))
1794     {
1795       token->type = CHARACTER;
1796       token->mb_partial = 1;
1797       return 1;
1798     }
1799 #endif
1800   if (c == '\\')
1801     {
1802       unsigned char c2;
1803       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1804         {
1805           token->type = BACK_SLASH;
1806           return 1;
1807         }
1808
1809       c2 = re_string_peek_byte_case (input, 1);
1810       token->opr.c = c2;
1811       token->type = CHARACTER;
1812 #ifdef RE_ENABLE_I18N
1813       if (input->mb_cur_max > 1)
1814         {
1815           wint_t wc = re_string_wchar_at (input,
1816                                           re_string_cur_idx (input) + 1);
1817           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1818         }
1819       else
1820 #endif
1821         token->word_char = IS_WORD_CHAR (c2) != 0;
1822
1823       switch (c2)
1824         {
1825         case '|':
1826           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1827             token->type = OP_ALT;
1828           break;
1829         case '1': case '2': case '3': case '4': case '5':
1830         case '6': case '7': case '8': case '9':
1831           if (!(syntax & RE_NO_BK_REFS))
1832             {
1833               token->type = OP_BACK_REF;
1834               token->opr.idx = c2 - '1';
1835             }
1836           break;
1837         case '<':
1838           if (!(syntax & RE_NO_GNU_OPS))
1839             {
1840               token->type = ANCHOR;
1841               token->opr.ctx_type = WORD_FIRST;
1842             }
1843           break;
1844         case '>':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = WORD_LAST;
1849             }
1850           break;
1851         case 'b':
1852           if (!(syntax & RE_NO_GNU_OPS))
1853             {
1854               token->type = ANCHOR;
1855               token->opr.ctx_type = WORD_DELIM;
1856             }
1857           break;
1858         case 'B':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             {
1861               token->type = ANCHOR;
1862               token->opr.ctx_type = NOT_WORD_DELIM;
1863             }
1864           break;
1865         case 'w':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             token->type = OP_WORD;
1868           break;
1869         case 'W':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             token->type = OP_NOTWORD;
1872           break;
1873         case 's':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             token->type = OP_SPACE;
1876           break;
1877         case 'S':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             token->type = OP_NOTSPACE;
1880           break;
1881         case '`':
1882           if (!(syntax & RE_NO_GNU_OPS))
1883             {
1884               token->type = ANCHOR;
1885               token->opr.ctx_type = BUF_FIRST;
1886             }
1887           break;
1888         case '\'':
1889           if (!(syntax & RE_NO_GNU_OPS))
1890             {
1891               token->type = ANCHOR;
1892               token->opr.ctx_type = BUF_LAST;
1893             }
1894           break;
1895         case '(':
1896           if (!(syntax & RE_NO_BK_PARENS))
1897             token->type = OP_OPEN_SUBEXP;
1898           break;
1899         case ')':
1900           if (!(syntax & RE_NO_BK_PARENS))
1901             token->type = OP_CLOSE_SUBEXP;
1902           break;
1903         case '+':
1904           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905             token->type = OP_DUP_PLUS;
1906           break;
1907         case '?':
1908           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1909             token->type = OP_DUP_QUESTION;
1910           break;
1911         case '{':
1912           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913             token->type = OP_OPEN_DUP_NUM;
1914           break;
1915         case '}':
1916           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1917             token->type = OP_CLOSE_DUP_NUM;
1918           break;
1919         default:
1920           break;
1921         }
1922       return 2;
1923     }
1924
1925   token->type = CHARACTER;
1926 #ifdef RE_ENABLE_I18N
1927   if (input->mb_cur_max > 1)
1928     {
1929       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1930       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1931     }
1932   else
1933 #endif
1934     token->word_char = IS_WORD_CHAR (token->opr.c);
1935
1936   switch (c)
1937     {
1938     case '\n':
1939       if (syntax & RE_NEWLINE_ALT)
1940         token->type = OP_ALT;
1941       break;
1942     case '|':
1943       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1944         token->type = OP_ALT;
1945       break;
1946     case '*':
1947       token->type = OP_DUP_ASTERISK;
1948       break;
1949     case '+':
1950       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951         token->type = OP_DUP_PLUS;
1952       break;
1953     case '?':
1954       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1955         token->type = OP_DUP_QUESTION;
1956       break;
1957     case '{':
1958       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959         token->type = OP_OPEN_DUP_NUM;
1960       break;
1961     case '}':
1962       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1963         token->type = OP_CLOSE_DUP_NUM;
1964       break;
1965     case '(':
1966       if (syntax & RE_NO_BK_PARENS)
1967         token->type = OP_OPEN_SUBEXP;
1968       break;
1969     case ')':
1970       if (syntax & RE_NO_BK_PARENS)
1971         token->type = OP_CLOSE_SUBEXP;
1972       break;
1973     case '[':
1974       token->type = OP_OPEN_BRACKET;
1975       break;
1976     case '.':
1977       token->type = OP_PERIOD;
1978       break;
1979     case '^':
1980       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1981           re_string_cur_idx (input) != 0)
1982         {
1983           char prev = re_string_peek_byte (input, -1);
1984           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985             break;
1986         }
1987       token->type = ANCHOR;
1988       token->opr.ctx_type = LINE_FIRST;
1989       break;
1990     case '$':
1991       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1992           re_string_cur_idx (input) + 1 != re_string_length (input))
1993         {
1994           re_token_t next;
1995           re_string_skip_bytes (input, 1);
1996           peek_token (&next, input, syntax);
1997           re_string_skip_bytes (input, -1);
1998           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999             break;
2000         }
2001       token->type = ANCHOR;
2002       token->opr.ctx_type = LINE_LAST;
2003       break;
2004     default:
2005       break;
2006     }
2007   return 1;
2008 }
2009
2010 /* Peek a token from INPUT, and return the length of the token.
2011    We must not use this function out of bracket expressions.  */
2012
2013 static int
2014 internal_function
2015 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016 {
2017   unsigned char c;
2018   if (re_string_eoi (input))
2019     {
2020       token->type = END_OF_RE;
2021       return 0;
2022     }
2023   c = re_string_peek_byte (input, 0);
2024   token->opr.c = c;
2025
2026 #ifdef RE_ENABLE_I18N
2027   if (input->mb_cur_max > 1 &&
2028       !re_string_first_byte (input, re_string_cur_idx (input)))
2029     {
2030       token->type = CHARACTER;
2031       return 1;
2032     }
2033 #endif /* RE_ENABLE_I18N */
2034
2035   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2036       && re_string_cur_idx (input) + 1 < re_string_length (input))
2037     {
2038       /* In this case, '\' escape a character.  */
2039       unsigned char c2;
2040       re_string_skip_bytes (input, 1);
2041       c2 = re_string_peek_byte (input, 0);
2042       token->opr.c = c2;
2043       token->type = CHARACTER;
2044       return 1;
2045     }
2046   if (c == '[') /* '[' is a special char in a bracket exps.  */
2047     {
2048       unsigned char c2;
2049       int token_len;
2050       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2051         c2 = re_string_peek_byte (input, 1);
2052       else
2053         c2 = 0;
2054       token->opr.c = c2;
2055       token_len = 2;
2056       switch (c2)
2057         {
2058         case '.':
2059           token->type = OP_OPEN_COLL_ELEM;
2060           break;
2061         case '=':
2062           token->type = OP_OPEN_EQUIV_CLASS;
2063           break;
2064         case ':':
2065           if (syntax & RE_CHAR_CLASSES)
2066             {
2067               token->type = OP_OPEN_CHAR_CLASS;
2068               break;
2069             }
2070           /* else fall through.  */
2071         default:
2072           token->type = CHARACTER;
2073           token->opr.c = c;
2074           token_len = 1;
2075           break;
2076         }
2077       return token_len;
2078     }
2079   switch (c)
2080     {
2081     case '-':
2082       token->type = OP_CHARSET_RANGE;
2083       break;
2084     case ']':
2085       token->type = OP_CLOSE_BRACKET;
2086       break;
2087     case '^':
2088       token->type = OP_NON_MATCH_LIST;
2089       break;
2090     default:
2091       token->type = CHARACTER;
2092     }
2093   return 1;
2094 }
2095 \f
2096 /* Functions for parser.  */
2097
2098 /* Entry point of the parser.
2099    Parse the regular expression REGEXP and return the structure tree.
2100    If an error has occurred, ERR is set by error code, and return NULL.
2101    This function build the following tree, from regular expression <reg_exp>:
2102            CAT
2103            / \
2104           /   \
2105    <reg_exp>  EOR
2106
2107    CAT means concatenation.
2108    EOR means end of regular expression.  */
2109
2110 static bin_tree_t *
2111 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112        reg_errcode_t *err)
2113 {
2114   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2115   bin_tree_t *tree, *eor, *root;
2116   re_token_t current_token;
2117   dfa->syntax = syntax;
2118   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2119   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2120   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121     return NULL;
2122   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2123   if (tree != NULL)
2124     root = create_tree (dfa, tree, eor, CONCAT);
2125   else
2126     root = eor;
2127   if (BE (eor == NULL || root == NULL, 0))
2128     {
2129       *err = REG_ESPACE;
2130       return NULL;
2131     }
2132   return root;
2133 }
2134
2135 /* This function build the following tree, from regular expression
2136    <branch1>|<branch2>:
2137            ALT
2138            / \
2139           /   \
2140    <branch1> <branch2>
2141
2142    ALT means alternative, which represents the operator `|'.  */
2143
2144 static bin_tree_t *
2145 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2146                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2147 {
2148   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2149   bin_tree_t *tree, *branch = NULL;
2150   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2151   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152     return NULL;
2153
2154   while (token->type == OP_ALT)
2155     {
2156       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2157       if (token->type != OP_ALT && token->type != END_OF_RE
2158           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159         {
2160           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2161           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162             return NULL;
2163         }
2164       else
2165         branch = NULL;
2166       tree = create_tree (dfa, tree, branch, OP_ALT);
2167       if (BE (tree == NULL, 0))
2168         {
2169           *err = REG_ESPACE;
2170           return NULL;
2171         }
2172     }
2173   return tree;
2174 }
2175
2176 /* This function build the following tree, from regular expression
2177    <exp1><exp2>:
2178         CAT
2179         / \
2180        /   \
2181    <exp1> <exp2>
2182
2183    CAT means concatenation.  */
2184
2185 static bin_tree_t *
2186 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2187               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2188 {
2189   bin_tree_t *tree, *exp;
2190   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2191   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193     return NULL;
2194
2195   while (token->type != OP_ALT && token->type != END_OF_RE
2196          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2197     {
2198       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2199       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2200         {
2201           return NULL;
2202         }
2203       if (tree != NULL && exp != NULL)
2204         {
2205           tree = create_tree (dfa, tree, exp, CONCAT);
2206           if (tree == NULL)
2207             {
2208               *err = REG_ESPACE;
2209               return NULL;
2210             }
2211         }
2212       else if (tree == NULL)
2213         tree = exp;
2214       /* Otherwise exp == NULL, we don't need to create new tree.  */
2215     }
2216   return tree;
2217 }
2218
2219 /* This function build the following tree, from regular expression a*:
2220          *
2221          |
2222          a
2223 */
2224
2225 static bin_tree_t *
2226 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2228 {
2229   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2230   bin_tree_t *tree;
2231   switch (token->type)
2232     {
2233     case CHARACTER:
2234       tree = create_token_tree (dfa, NULL, NULL, token);
2235       if (BE (tree == NULL, 0))
2236         {
2237           *err = REG_ESPACE;
2238           return NULL;
2239         }
2240 #ifdef RE_ENABLE_I18N
2241       if (dfa->mb_cur_max > 1)
2242         {
2243           while (!re_string_eoi (regexp)
2244                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2245             {
2246               bin_tree_t *mbc_remain;
2247               fetch_token (token, regexp, syntax);
2248               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2249               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2250               if (BE (mbc_remain == NULL || tree == NULL, 0))
2251                 {
2252                   *err = REG_ESPACE;
2253                   return NULL;
2254                 }
2255             }
2256         }
2257 #endif
2258       break;
2259     case OP_OPEN_SUBEXP:
2260       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2261       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262         return NULL;
2263       break;
2264     case OP_OPEN_BRACKET:
2265       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2266       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267         return NULL;
2268       break;
2269     case OP_BACK_REF:
2270       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271         {
2272           *err = REG_ESUBREG;
2273           return NULL;
2274         }
2275       dfa->used_bkref_map |= 1 << token->opr.idx;
2276       tree = create_token_tree (dfa, NULL, NULL, token);
2277       if (BE (tree == NULL, 0))
2278         {
2279           *err = REG_ESPACE;
2280           return NULL;
2281         }
2282       ++dfa->nbackref;
2283       dfa->has_mb_node = 1;
2284       break;
2285     case OP_OPEN_DUP_NUM:
2286       if (syntax & RE_CONTEXT_INVALID_DUP)
2287         {
2288           *err = REG_BADRPT;
2289           return NULL;
2290         }
2291       /* FALLTHROUGH */
2292     case OP_DUP_ASTERISK:
2293     case OP_DUP_PLUS:
2294     case OP_DUP_QUESTION:
2295       if (syntax & RE_CONTEXT_INVALID_OPS)
2296         {
2297           *err = REG_BADRPT;
2298           return NULL;
2299         }
2300       else if (syntax & RE_CONTEXT_INDEP_OPS)
2301         {
2302           fetch_token (token, regexp, syntax);
2303           return parse_expression (regexp, preg, token, syntax, nest, err);
2304         }
2305       /* else fall through  */
2306     case OP_CLOSE_SUBEXP:
2307       if ((token->type == OP_CLOSE_SUBEXP) &&
2308           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309         {
2310           *err = REG_ERPAREN;
2311           return NULL;
2312         }
2313       /* else fall through  */
2314     case OP_CLOSE_DUP_NUM:
2315       /* We treat it as a normal character.  */
2316
2317       /* Then we can these characters as normal characters.  */
2318       token->type = CHARACTER;
2319       /* mb_partial and word_char bits should be initialized already
2320          by peek_token.  */
2321       tree = create_token_tree (dfa, NULL, NULL, token);
2322       if (BE (tree == NULL, 0))
2323         {
2324           *err = REG_ESPACE;
2325           return NULL;
2326         }
2327       break;
2328     case ANCHOR:
2329       if ((token->opr.ctx_type
2330            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2331           && dfa->word_ops_used == 0)
2332         init_word_char (dfa);
2333       if (token->opr.ctx_type == WORD_DELIM
2334           || token->opr.ctx_type == NOT_WORD_DELIM)
2335         {
2336           bin_tree_t *tree_first, *tree_last;
2337           if (token->opr.ctx_type == WORD_DELIM)
2338             {
2339               token->opr.ctx_type = WORD_FIRST;
2340               tree_first = create_token_tree (dfa, NULL, NULL, token);
2341               token->opr.ctx_type = WORD_LAST;
2342             }
2343           else
2344             {
2345               token->opr.ctx_type = INSIDE_WORD;
2346               tree_first = create_token_tree (dfa, NULL, NULL, token);
2347               token->opr.ctx_type = INSIDE_NOTWORD;
2348             }
2349           tree_last = create_token_tree (dfa, NULL, NULL, token);
2350           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2352             {
2353               *err = REG_ESPACE;
2354               return NULL;
2355             }
2356         }
2357       else
2358         {
2359           tree = create_token_tree (dfa, NULL, NULL, token);
2360           if (BE (tree == NULL, 0))
2361             {
2362               *err = REG_ESPACE;
2363               return NULL;
2364             }
2365         }
2366       /* We must return here, since ANCHORs can't be followed
2367          by repetition operators.
2368          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2369              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2370       fetch_token (token, regexp, syntax);
2371       return tree;
2372     case OP_PERIOD:
2373       tree = create_token_tree (dfa, NULL, NULL, token);
2374       if (BE (tree == NULL, 0))
2375         {
2376           *err = REG_ESPACE;
2377           return NULL;
2378         }
2379       if (dfa->mb_cur_max > 1)
2380         dfa->has_mb_node = 1;
2381       break;
2382     case OP_WORD:
2383     case OP_NOTWORD:
2384       tree = build_charclass_op (dfa, regexp->trans,
2385                                  "alnum",
2386                                  "_",
2387                                  token->type == OP_NOTWORD, err);
2388       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389         return NULL;
2390       break;
2391     case OP_SPACE:
2392     case OP_NOTSPACE:
2393       tree = build_charclass_op (dfa, regexp->trans,
2394                                  "space",
2395                                  "",
2396                                  token->type == OP_NOTSPACE, err);
2397       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398         return NULL;
2399       break;
2400     case OP_ALT:
2401     case END_OF_RE:
2402       return NULL;
2403     case BACK_SLASH:
2404       *err = REG_EESCAPE;
2405       return NULL;
2406     default:
2407       /* Must not happen?  */
2408 #ifdef DEBUG
2409       assert (0);
2410 #endif
2411       return NULL;
2412     }
2413   fetch_token (token, regexp, syntax);
2414
2415   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2416          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2417     {
2418       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2419       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420         return NULL;
2421       /* In BRE consecutive duplications are not allowed.  */
2422       if ((syntax & RE_CONTEXT_INVALID_DUP)
2423           && (token->type == OP_DUP_ASTERISK
2424               || token->type == OP_OPEN_DUP_NUM))
2425         {
2426           *err = REG_BADRPT;
2427           return NULL;
2428         }
2429     }
2430
2431   return tree;
2432 }
2433
2434 /* This function build the following tree, from regular expression
2435    (<reg_exp>):
2436          SUBEXP
2437             |
2438         <reg_exp>
2439 */
2440
2441 static bin_tree_t *
2442 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2443                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2444 {
2445   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446   bin_tree_t *tree;
2447   size_t cur_nsub;
2448   cur_nsub = preg->re_nsub++;
2449
2450   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2451
2452   /* The subexpression may be a null string.  */
2453   if (token->type == OP_CLOSE_SUBEXP)
2454     tree = NULL;
2455   else
2456     {
2457       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2459         *err = REG_EPAREN;
2460       if (BE (*err != REG_NOERROR, 0))
2461         return NULL;
2462     }
2463
2464   if (cur_nsub <= '9' - '1')
2465     dfa->completed_bkref_map |= 1 << cur_nsub;
2466
2467   tree = create_tree (dfa, tree, NULL, SUBEXP);
2468   if (BE (tree == NULL, 0))
2469     {
2470       *err = REG_ESPACE;
2471       return NULL;
2472     }
2473   tree->token.opr.idx = cur_nsub;
2474   return tree;
2475 }
2476
2477 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2478
2479 static bin_tree_t *
2480 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2481               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2482 {
2483   bin_tree_t *tree = NULL, *old_tree = NULL;
2484   int i, start, end, start_idx = re_string_cur_idx (regexp);
2485 #ifndef RE_TOKEN_INIT_BUG
2486   re_token_t start_token = *token;
2487 #else
2488   re_token_t start_token;
2489
2490   memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2491 #endif
2492
2493   if (token->type == OP_OPEN_DUP_NUM)
2494     {
2495       end = 0;
2496       start = fetch_number (regexp, token, syntax);
2497       if (start == -1)
2498         {
2499           if (token->type == CHARACTER && token->opr.c == ',')
2500             start = 0; /* We treat "{,m}" as "{0,m}".  */
2501           else
2502             {
2503               *err = REG_BADBR; /* <re>{} is invalid.  */
2504               return NULL;
2505             }
2506         }
2507       if (BE (start != -2, 1))
2508         {
2509           /* We treat "{n}" as "{n,n}".  */
2510           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2511                  : ((token->type == CHARACTER && token->opr.c == ',')
2512                     ? fetch_number (regexp, token, syntax) : -2));
2513         }
2514       if (BE (start == -2 || end == -2, 0))
2515         {
2516           /* Invalid sequence.  */
2517           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2518             {
2519               if (token->type == END_OF_RE)
2520                 *err = REG_EBRACE;
2521               else
2522                 *err = REG_BADBR;
2523
2524               return NULL;
2525             }
2526
2527           /* If the syntax bit is set, rollback.  */
2528           re_string_set_index (regexp, start_idx);
2529           *token = start_token;
2530           token->type = CHARACTER;
2531           /* mb_partial and word_char bits should be already initialized by
2532              peek_token.  */
2533           return elem;
2534         }
2535
2536       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2537         {
2538           /* First number greater than second.  */
2539           *err = REG_BADBR;
2540           return NULL;
2541         }
2542     }
2543   else
2544     {
2545       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2546       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2547     }
2548
2549   fetch_token (token, regexp, syntax);
2550
2551   if (BE (elem == NULL, 0))
2552     return NULL;
2553   if (BE (start == 0 && end == 0, 0))
2554     {
2555       postorder (elem, free_tree, NULL);
2556       return NULL;
2557     }
2558
2559   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2560   if (BE (start > 0, 0))
2561     {
2562       tree = elem;
2563       for (i = 2; i <= start; ++i)
2564         {
2565           elem = duplicate_tree (elem, dfa);
2566           tree = create_tree (dfa, tree, elem, CONCAT);
2567           if (BE (elem == NULL || tree == NULL, 0))
2568             goto parse_dup_op_espace;
2569         }
2570
2571       if (start == end)
2572         return tree;
2573
2574       /* Duplicate ELEM before it is marked optional.  */
2575       elem = duplicate_tree (elem, dfa);
2576       old_tree = tree;
2577     }
2578   else
2579     old_tree = NULL;
2580
2581   if (elem->token.type == SUBEXP)
2582     postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2583
2584   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2585   if (BE (tree == NULL, 0))
2586     goto parse_dup_op_espace;
2587
2588   /* This loop is actually executed only when end != -1,
2589      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2590      already created the start+1-th copy.  */
2591   for (i = start + 2; i <= end; ++i)
2592     {
2593       elem = duplicate_tree (elem, dfa);
2594       tree = create_tree (dfa, tree, elem, CONCAT);
2595       if (BE (elem == NULL || tree == NULL, 0))
2596         goto parse_dup_op_espace;
2597
2598       tree = create_tree (dfa, tree, NULL, OP_ALT);
2599       if (BE (tree == NULL, 0))
2600         goto parse_dup_op_espace;
2601     }
2602
2603   if (old_tree)
2604     tree = create_tree (dfa, old_tree, tree, CONCAT);
2605
2606   return tree;
2607
2608  parse_dup_op_espace:
2609   *err = REG_ESPACE;
2610   return NULL;
2611 }
2612
2613 /* Size of the names for collating symbol/equivalence_class/character_class.
2614    I'm not sure, but maybe enough.  */
2615 #define BRACKET_NAME_BUF_SIZE 32
2616
2617 #ifndef _LIBC
2618   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2619      Build the range expression which starts from START_ELEM, and ends
2620      at END_ELEM.  The result are written to MBCSET and SBCSET.
2621      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2622      mbcset->range_ends, is a pointer argument since we may
2623      update it.  */
2624
2625 static reg_errcode_t
2626 internal_function
2627 # ifdef RE_ENABLE_I18N
2628 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2629                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2630 # else /* not RE_ENABLE_I18N */
2631 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2632                  bracket_elem_t *end_elem)
2633 # endif /* not RE_ENABLE_I18N */
2634 {
2635   unsigned int start_ch, end_ch;
2636   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2637   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2638           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2639           0))
2640     return REG_ERANGE;
2641
2642   /* We can handle no multi character collating elements without libc
2643      support.  */
2644   if (BE ((start_elem->type == COLL_SYM
2645            && strlen ((char *) start_elem->opr.name) > 1)
2646           || (end_elem->type == COLL_SYM
2647               && strlen ((char *) end_elem->opr.name) > 1), 0))
2648     return REG_ECOLLATE;
2649
2650 # ifdef RE_ENABLE_I18N
2651   {
2652     wchar_t wc;
2653     wint_t start_wc;
2654     wint_t end_wc;
2655     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2656
2657     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2658                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2659                    : 0));
2660     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2661               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2662                  : 0));
2663 #ifdef GAWK
2664     /*
2665      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2666      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2667      * unsigned, so we don't have sign extension problems.
2668      */
2669     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2670                 ? start_ch : start_elem->opr.wch);
2671     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2672               ? end_ch : end_elem->opr.wch);
2673 #else
2674     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675                 ? __btowc (start_ch) : start_elem->opr.wch);
2676     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677               ? __btowc (end_ch) : end_elem->opr.wch);
2678 #endif
2679     if (start_wc == WEOF || end_wc == WEOF)
2680       return REG_ECOLLATE;
2681     cmp_buf[0] = start_wc;
2682     cmp_buf[4] = end_wc;
2683     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2684       return REG_ERANGE;
2685
2686     /* Got valid collation sequence values, add them as a new entry.
2687        However, for !_LIBC we have no collation elements: if the
2688        character set is single byte, the single byte character set
2689        that we build below suffices.  parse_bracket_exp passes
2690        no MBCSET if dfa->mb_cur_max == 1.  */
2691     if (mbcset)
2692       {
2693         /* Check the space of the arrays.  */
2694         if (BE (*range_alloc == mbcset->nranges, 0))
2695           {
2696             /* There is not enough space, need realloc.  */
2697             wchar_t *new_array_start, *new_array_end;
2698             int new_nranges;
2699
2700             /* +1 in case of mbcset->nranges is 0.  */
2701             new_nranges = 2 * mbcset->nranges + 1;
2702             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2703                are NULL if *range_alloc == 0.  */
2704             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2705                                           new_nranges);
2706             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2707                                         new_nranges);
2708
2709             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2710               return REG_ESPACE;
2711
2712             mbcset->range_starts = new_array_start;
2713             mbcset->range_ends = new_array_end;
2714             *range_alloc = new_nranges;
2715           }
2716
2717         mbcset->range_starts[mbcset->nranges] = start_wc;
2718         mbcset->range_ends[mbcset->nranges++] = end_wc;
2719       }
2720
2721     /* Build the table for single byte characters.  */
2722     for (wc = 0; wc < SBC_MAX; ++wc)
2723       {
2724         cmp_buf[2] = wc;
2725         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2726             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2727           bitset_set (sbcset, wc);
2728       }
2729   }
2730 # else /* not RE_ENABLE_I18N */
2731   {
2732     unsigned int ch;
2733     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2734                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2735                    : 0));
2736     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2737               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2738                  : 0));
2739     if (start_ch > end_ch)
2740       return REG_ERANGE;
2741     /* Build the table for single byte characters.  */
2742     for (ch = 0; ch < SBC_MAX; ++ch)
2743       if (start_ch <= ch  && ch <= end_ch)
2744         bitset_set (sbcset, ch);
2745   }
2746 # endif /* not RE_ENABLE_I18N */
2747   return REG_NOERROR;
2748 }
2749 #endif /* not _LIBC */
2750
2751 #ifndef _LIBC
2752 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2753    Build the collating element which is represented by NAME.
2754    The result are written to MBCSET and SBCSET.
2755    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2756    pointer argument since we may update it.  */
2757
2758 static reg_errcode_t
2759 internal_function
2760 # ifdef RE_ENABLE_I18N
2761 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2762                         int *coll_sym_alloc, const unsigned char *name)
2763 # else /* not RE_ENABLE_I18N */
2764 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2765 # endif /* not RE_ENABLE_I18N */
2766 {
2767   size_t name_len = strlen ((const char *) name);
2768   if (BE (name_len != 1, 0))
2769     return REG_ECOLLATE;
2770   else
2771     {
2772       bitset_set (sbcset, name[0]);
2773       return REG_NOERROR;
2774     }
2775 }
2776 #endif /* not _LIBC */
2777
2778 /* This function parse bracket expression like "[abc]", "[a-c]",
2779    "[[.a-a.]]" etc.  */
2780
2781 static bin_tree_t *
2782 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2783                    reg_syntax_t syntax, reg_errcode_t *err)
2784 {
2785 #ifdef _LIBC
2786   const unsigned char *collseqmb;
2787   const char *collseqwc;
2788   uint32_t nrules;
2789   int32_t table_size;
2790   const int32_t *symb_table;
2791   const unsigned char *extra;
2792
2793   /* Local function for parse_bracket_exp used in _LIBC environment.
2794      Seek the collating symbol entry correspondings to NAME.
2795      Return the index of the symbol in the SYMB_TABLE.  */
2796
2797   auto inline int32_t
2798   __attribute ((always_inline))
2799   seek_collating_symbol_entry (name, name_len)
2800          const unsigned char *name;
2801          size_t name_len;
2802     {
2803       int32_t hash = elem_hash ((const char *) name, name_len);
2804       int32_t elem = hash % table_size;
2805       if (symb_table[2 * elem] != 0)
2806         {
2807           int32_t second = hash % (table_size - 2) + 1;
2808
2809           do
2810             {
2811               /* First compare the hashing value.  */
2812               if (symb_table[2 * elem] == hash
2813                   /* Compare the length of the name.  */
2814                   && name_len == extra[symb_table[2 * elem + 1]]
2815                   /* Compare the name.  */
2816                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2817                              name_len) == 0)
2818                 {
2819                   /* Yep, this is the entry.  */
2820                   break;
2821                 }
2822
2823               /* Next entry.  */
2824               elem += second;
2825             }
2826           while (symb_table[2 * elem] != 0);
2827         }
2828       return elem;
2829     }
2830
2831   /* Local function for parse_bracket_exp used in _LIBC environment.
2832      Look up the collation sequence value of BR_ELEM.
2833      Return the value if succeeded, UINT_MAX otherwise.  */
2834
2835   auto inline unsigned int
2836   __attribute ((always_inline))
2837   lookup_collation_sequence_value (br_elem)
2838          bracket_elem_t *br_elem;
2839     {
2840       if (br_elem->type == SB_CHAR)
2841         {
2842           /*
2843           if (MB_CUR_MAX == 1)
2844           */
2845           if (nrules == 0)
2846             return collseqmb[br_elem->opr.ch];
2847           else
2848             {
2849               wint_t wc = __btowc (br_elem->opr.ch);
2850               return __collseq_table_lookup (collseqwc, wc);
2851             }
2852         }
2853       else if (br_elem->type == MB_CHAR)
2854         {
2855           if (nrules != 0)
2856             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2857         }
2858       else if (br_elem->type == COLL_SYM)
2859         {
2860           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2861           if (nrules != 0)
2862             {
2863               int32_t elem, idx;
2864               elem = seek_collating_symbol_entry (br_elem->opr.name,
2865                                                   sym_name_len);
2866               if (symb_table[2 * elem] != 0)
2867                 {
2868                   /* We found the entry.  */
2869                   idx = symb_table[2 * elem + 1];
2870                   /* Skip the name of collating element name.  */
2871                   idx += 1 + extra[idx];
2872                   /* Skip the byte sequence of the collating element.  */
2873                   idx += 1 + extra[idx];
2874                   /* Adjust for the alignment.  */
2875                   idx = (idx + 3) & ~3;
2876                   /* Skip the multibyte collation sequence value.  */
2877                   idx += sizeof (unsigned int);
2878                   /* Skip the wide char sequence of the collating element.  */
2879                   idx += sizeof (unsigned int) *
2880                     (1 + *(unsigned int *) (extra + idx));
2881                   /* Return the collation sequence value.  */
2882                   return *(unsigned int *) (extra + idx);
2883                 }
2884               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2885                 {
2886                   /* No valid character.  Match it as a single byte
2887                      character.  */
2888                   return collseqmb[br_elem->opr.name[0]];
2889                 }
2890             }
2891           else if (sym_name_len == 1)
2892             return collseqmb[br_elem->opr.name[0]];
2893         }
2894       return UINT_MAX;
2895     }
2896
2897   /* Local function for parse_bracket_exp used in _LIBC environment.
2898      Build the range expression which starts from START_ELEM, and ends
2899      at END_ELEM.  The result are written to MBCSET and SBCSET.
2900      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2901      mbcset->range_ends, is a pointer argument since we may
2902      update it.  */
2903
2904   auto inline reg_errcode_t
2905   __attribute ((always_inline))
2906   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2907          re_charset_t *mbcset;
2908          int *range_alloc;
2909          bitset_t sbcset;
2910          bracket_elem_t *start_elem, *end_elem;
2911     {
2912       unsigned int ch;
2913       uint32_t start_collseq;
2914       uint32_t end_collseq;
2915
2916       /* Equivalence Classes and Character Classes can't be a range
2917          start/end.  */
2918       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2919               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2920               0))
2921         return REG_ERANGE;
2922
2923       start_collseq = lookup_collation_sequence_value (start_elem);
2924       end_collseq = lookup_collation_sequence_value (end_elem);
2925       /* Check start/end collation sequence values.  */
2926       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2927         return REG_ECOLLATE;
2928       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2929         return REG_ERANGE;
2930
2931       /* Got valid collation sequence values, add them as a new entry.
2932          However, if we have no collation elements, and the character set
2933          is single byte, the single byte character set that we
2934          build below suffices. */
2935       if (nrules > 0 || dfa->mb_cur_max > 1)
2936         {
2937           /* Check the space of the arrays.  */
2938           if (BE (*range_alloc == mbcset->nranges, 0))
2939             {
2940               /* There is not enough space, need realloc.  */
2941               uint32_t *new_array_start;
2942               uint32_t *new_array_end;
2943               int new_nranges;
2944
2945               /* +1 in case of mbcset->nranges is 0.  */
2946               new_nranges = 2 * mbcset->nranges + 1;
2947               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2948                                             new_nranges);
2949               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2950                                           new_nranges);
2951
2952               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2953                 return REG_ESPACE;
2954
2955               mbcset->range_starts = new_array_start;
2956               mbcset->range_ends = new_array_end;
2957               *range_alloc = new_nranges;
2958             }
2959
2960           mbcset->range_starts[mbcset->nranges] = start_collseq;
2961           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2962         }
2963
2964       /* Build the table for single byte characters.  */
2965       for (ch = 0; ch < SBC_MAX; ch++)
2966         {
2967           uint32_t ch_collseq;
2968           /*
2969           if (MB_CUR_MAX == 1)
2970           */
2971           if (nrules == 0)
2972             ch_collseq = collseqmb[ch];
2973           else
2974             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2975           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2976             bitset_set (sbcset, ch);
2977         }
2978       return REG_NOERROR;
2979     }
2980
2981   /* Local function for parse_bracket_exp used in _LIBC environment.
2982      Build the collating element which is represented by NAME.
2983      The result are written to MBCSET and SBCSET.
2984      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2985      pointer argument since we may update it.  */
2986
2987   auto inline reg_errcode_t
2988   __attribute ((always_inline))
2989   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2990          re_charset_t *mbcset;
2991          int *coll_sym_alloc;
2992          bitset_t sbcset;
2993          const unsigned char *name;
2994     {
2995       int32_t elem, idx;
2996       size_t name_len = strlen ((const char *) name);
2997       if (nrules != 0)
2998         {
2999           elem = seek_collating_symbol_entry (name, name_len);
3000           if (symb_table[2 * elem] != 0)
3001             {
3002               /* We found the entry.  */
3003               idx = symb_table[2 * elem + 1];
3004               /* Skip the name of collating element name.  */
3005               idx += 1 + extra[idx];
3006             }
3007           else if (symb_table[2 * elem] == 0 && name_len == 1)
3008             {
3009               /* No valid character, treat it as a normal
3010                  character.  */
3011               bitset_set (sbcset, name[0]);
3012               return REG_NOERROR;
3013             }
3014           else
3015             return REG_ECOLLATE;
3016
3017           /* Got valid collation sequence, add it as a new entry.  */
3018           /* Check the space of the arrays.  */
3019           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3020             {
3021               /* Not enough, realloc it.  */
3022               /* +1 in case of mbcset->ncoll_syms is 0.  */
3023               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3024               /* Use realloc since mbcset->coll_syms is NULL
3025                  if *alloc == 0.  */
3026               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3027                                                    new_coll_sym_alloc);
3028               if (BE (new_coll_syms == NULL, 0))
3029                 return REG_ESPACE;
3030               mbcset->coll_syms = new_coll_syms;
3031               *coll_sym_alloc = new_coll_sym_alloc;
3032             }
3033           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3034           return REG_NOERROR;
3035         }
3036       else
3037         {
3038           if (BE (name_len != 1, 0))
3039             return REG_ECOLLATE;
3040           else
3041             {
3042               bitset_set (sbcset, name[0]);
3043               return REG_NOERROR;
3044             }
3045         }
3046     }
3047 #endif
3048
3049   re_token_t br_token;
3050   re_bitset_ptr_t sbcset;
3051 #ifdef RE_ENABLE_I18N
3052   re_charset_t *mbcset;
3053   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3054   int equiv_class_alloc = 0, char_class_alloc = 0;
3055 #endif /* not RE_ENABLE_I18N */
3056   int non_match = 0;
3057   bin_tree_t *work_tree;
3058   int token_len;
3059   int first_round = 1;
3060 #ifdef _LIBC
3061   collseqmb = (const unsigned char *)
3062     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3063   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3064   if (nrules)
3065     {
3066       /*
3067       if (MB_CUR_MAX > 1)
3068       */
3069       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3070       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3071       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3072                                                   _NL_COLLATE_SYMB_TABLEMB);
3073       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3074                                                    _NL_COLLATE_SYMB_EXTRAMB);
3075     }
3076 #endif
3077   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3078 #ifdef RE_ENABLE_I18N
3079   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3080 #endif /* RE_ENABLE_I18N */
3081 #ifdef RE_ENABLE_I18N
3082   if (BE (sbcset == NULL || mbcset == NULL, 0))
3083 #else
3084   if (BE (sbcset == NULL, 0))
3085 #endif /* RE_ENABLE_I18N */
3086     {
3087       *err = REG_ESPACE;
3088       return NULL;
3089     }
3090
3091   token_len = peek_token_bracket (token, regexp, syntax);
3092   if (BE (token->type == END_OF_RE, 0))
3093     {
3094       *err = REG_BADPAT;
3095       goto parse_bracket_exp_free_return;
3096     }
3097   if (token->type == OP_NON_MATCH_LIST)
3098     {
3099 #ifdef RE_ENABLE_I18N
3100       mbcset->non_match = 1;
3101 #endif /* not RE_ENABLE_I18N */
3102       non_match = 1;
3103       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3104         bitset_set (sbcset, '\n');
3105       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3106       token_len = peek_token_bracket (token, regexp, syntax);
3107       if (BE (token->type == END_OF_RE, 0))
3108         {
3109           *err = REG_BADPAT;
3110           goto parse_bracket_exp_free_return;
3111         }
3112     }
3113
3114   /* We treat the first ']' as a normal character.  */
3115   if (token->type == OP_CLOSE_BRACKET)
3116     token->type = CHARACTER;
3117
3118   while (1)
3119     {
3120       bracket_elem_t start_elem, end_elem;
3121       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3122       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3123       reg_errcode_t ret;
3124       int token_len2 = 0, is_range_exp = 0;
3125       re_token_t token2;
3126
3127       start_elem.opr.name = start_name_buf;
3128       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3129                                    syntax, first_round);
3130       if (BE (ret != REG_NOERROR, 0))
3131         {
3132           *err = ret;
3133           goto parse_bracket_exp_free_return;
3134         }
3135       first_round = 0;
3136
3137       /* Get information about the next token.  We need it in any case.  */
3138       token_len = peek_token_bracket (token, regexp, syntax);
3139
3140       /* Do not check for ranges if we know they are not allowed.  */
3141       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3142         {
3143           if (BE (token->type == END_OF_RE, 0))
3144             {
3145               *err = REG_EBRACK;
3146               goto parse_bracket_exp_free_return;
3147             }
3148           if (token->type == OP_CHARSET_RANGE)
3149             {
3150               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3151               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3152               if (BE (token2.type == END_OF_RE, 0))
3153                 {
3154                   *err = REG_EBRACK;
3155                   goto parse_bracket_exp_free_return;
3156                 }
3157               if (token2.type == OP_CLOSE_BRACKET)
3158                 {
3159                   /* We treat the last '-' as a normal character.  */
3160                   re_string_skip_bytes (regexp, -token_len);
3161                   token->type = CHARACTER;
3162                 }
3163               else
3164                 is_range_exp = 1;
3165             }
3166         }
3167
3168       if (is_range_exp == 1)
3169         {
3170           end_elem.opr.name = end_name_buf;
3171           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3172                                        dfa, syntax, 1);
3173           if (BE (ret != REG_NOERROR, 0))
3174             {
3175               *err = ret;
3176               goto parse_bracket_exp_free_return;
3177             }
3178
3179           token_len = peek_token_bracket (token, regexp, syntax);
3180
3181 #ifdef _LIBC
3182           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3183                                   &start_elem, &end_elem);
3184 #else
3185 # ifdef RE_ENABLE_I18N
3186           *err = build_range_exp (sbcset,
3187                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3188                                   &range_alloc, &start_elem, &end_elem);
3189 # else
3190           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3191 # endif
3192 #endif /* RE_ENABLE_I18N */
3193           if (BE (*err != REG_NOERROR, 0))
3194             goto parse_bracket_exp_free_return;
3195         }
3196       else
3197         {
3198           switch (start_elem.type)
3199             {
3200             case SB_CHAR:
3201               bitset_set (sbcset, start_elem.opr.ch);
3202               break;
3203 #ifdef RE_ENABLE_I18N
3204             case MB_CHAR:
3205               /* Check whether the array has enough space.  */
3206               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3207                 {
3208                   wchar_t *new_mbchars;
3209                   /* Not enough, realloc it.  */
3210                   /* +1 in case of mbcset->nmbchars is 0.  */
3211                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3212                   /* Use realloc since array is NULL if *alloc == 0.  */
3213                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3214                                             mbchar_alloc);
3215                   if (BE (new_mbchars == NULL, 0))
3216                     goto parse_bracket_exp_espace;
3217                   mbcset->mbchars = new_mbchars;
3218                 }
3219               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3220               break;
3221 #endif /* RE_ENABLE_I18N */
3222             case EQUIV_CLASS:
3223               *err = build_equiv_class (sbcset,
3224 #ifdef RE_ENABLE_I18N
3225                                         mbcset, &equiv_class_alloc,
3226 #endif /* RE_ENABLE_I18N */
3227                                         start_elem.opr.name);
3228               if (BE (*err != REG_NOERROR, 0))
3229                 goto parse_bracket_exp_free_return;
3230               break;
3231             case COLL_SYM:
3232               *err = build_collating_symbol (sbcset,
3233 #ifdef RE_ENABLE_I18N
3234                                              mbcset, &coll_sym_alloc,
3235 #endif /* RE_ENABLE_I18N */
3236                                              start_elem.opr.name);
3237               if (BE (*err != REG_NOERROR, 0))
3238                 goto parse_bracket_exp_free_return;
3239               break;
3240             case CHAR_CLASS:
3241               *err = build_charclass (regexp->trans, sbcset,
3242 #ifdef RE_ENABLE_I18N
3243                                       mbcset, &char_class_alloc,
3244 #endif /* RE_ENABLE_I18N */
3245                                       (const char *) start_elem.opr.name, syntax);
3246               if (BE (*err != REG_NOERROR, 0))
3247                goto parse_bracket_exp_free_return;
3248               break;
3249             default:
3250               assert (0);
3251               break;
3252             }
3253         }
3254       if (BE (token->type == END_OF_RE, 0))
3255         {
3256           *err = REG_EBRACK;
3257           goto parse_bracket_exp_free_return;
3258         }
3259       if (token->type == OP_CLOSE_BRACKET)
3260         break;
3261     }
3262
3263   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3264
3265   /* If it is non-matching list.  */
3266   if (non_match)
3267     bitset_not (sbcset);
3268
3269 #ifdef RE_ENABLE_I18N
3270   /* Ensure only single byte characters are set.  */
3271   if (dfa->mb_cur_max > 1)
3272     bitset_mask (sbcset, dfa->sb_char);
3273
3274   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3275       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3276                                                      || mbcset->non_match)))
3277     {
3278       bin_tree_t *mbc_tree;
3279       int sbc_idx;
3280       /* Build a tree for complex bracket.  */
3281       dfa->has_mb_node = 1;
3282       br_token.type = COMPLEX_BRACKET;
3283       br_token.opr.mbcset = mbcset;
3284       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3285       if (BE (mbc_tree == NULL, 0))
3286         goto parse_bracket_exp_espace;
3287       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3288         if (sbcset[sbc_idx])
3289           break;
3290       /* If there are no bits set in sbcset, there is no point
3291          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3292       if (sbc_idx < BITSET_WORDS)
3293         {
3294           /* Build a tree for simple bracket.  */
3295           br_token.type = SIMPLE_BRACKET;
3296           br_token.opr.sbcset = sbcset;
3297           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3298           if (BE (work_tree == NULL, 0))
3299             goto parse_bracket_exp_espace;
3300
3301           /* Then join them by ALT node.  */
3302           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3303           if (BE (work_tree == NULL, 0))
3304             goto parse_bracket_exp_espace;
3305         }
3306       else
3307         {
3308           re_free (sbcset);
3309           work_tree = mbc_tree;
3310         }
3311     }
3312   else
3313 #endif /* not RE_ENABLE_I18N */
3314     {
3315 #ifdef RE_ENABLE_I18N
3316       free_charset (mbcset);
3317 #endif
3318       /* Build a tree for simple bracket.  */
3319       br_token.type = SIMPLE_BRACKET;
3320       br_token.opr.sbcset = sbcset;
3321       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322       if (BE (work_tree == NULL, 0))
3323         goto parse_bracket_exp_espace;
3324     }
3325   return work_tree;
3326
3327  parse_bracket_exp_espace:
3328   *err = REG_ESPACE;
3329  parse_bracket_exp_free_return:
3330   re_free (sbcset);
3331 #ifdef RE_ENABLE_I18N
3332   free_charset (mbcset);
3333 #endif /* RE_ENABLE_I18N */
3334   return NULL;
3335 }
3336
3337 /* Parse an element in the bracket expression.  */
3338
3339 static reg_errcode_t
3340 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3341                        re_token_t *token, int token_len, re_dfa_t *dfa,
3342                        reg_syntax_t syntax, int accept_hyphen)
3343 {
3344 #ifdef RE_ENABLE_I18N
3345   int cur_char_size;
3346   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3347   if (cur_char_size > 1)
3348     {
3349       elem->type = MB_CHAR;
3350       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3351       re_string_skip_bytes (regexp, cur_char_size);
3352       return REG_NOERROR;
3353     }
3354 #endif /* RE_ENABLE_I18N */
3355   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3356   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3357       || token->type == OP_OPEN_EQUIV_CLASS)
3358     return parse_bracket_symbol (elem, regexp, token);
3359   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3360     {
3361       /* A '-' must only appear as anything but a range indicator before
3362          the closing bracket.  Everything else is an error.  */
3363       re_token_t token2;
3364       (void) peek_token_bracket (&token2, regexp, syntax);
3365       if (token2.type != OP_CLOSE_BRACKET)
3366         /* The actual error value is not standardized since this whole
3367            case is undefined.  But ERANGE makes good sense.  */
3368         return REG_ERANGE;
3369     }
3370   elem->type = SB_CHAR;
3371   elem->opr.ch = token->opr.c;
3372   return REG_NOERROR;
3373 }
3374
3375 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3376    such as [:<character_class>:], [.<collating_element>.], and
3377    [=<equivalent_class>=].  */
3378
3379 static reg_errcode_t
3380 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3381                       re_token_t *token)
3382 {
3383   unsigned char ch, delim = token->opr.c;
3384   int i = 0;
3385   if (re_string_eoi(regexp))
3386     return REG_EBRACK;
3387   for (;; ++i)
3388     {
3389       if (i >= BRACKET_NAME_BUF_SIZE)
3390         return REG_EBRACK;
3391       if (token->type == OP_OPEN_CHAR_CLASS)
3392         ch = re_string_fetch_byte_case (regexp);
3393       else
3394         ch = re_string_fetch_byte (regexp);
3395       if (re_string_eoi(regexp))
3396         return REG_EBRACK;
3397       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3398         break;
3399       elem->opr.name[i] = ch;
3400     }
3401   re_string_skip_bytes (regexp, 1);
3402   elem->opr.name[i] = '\0';
3403   switch (token->type)
3404     {
3405     case OP_OPEN_COLL_ELEM:
3406       elem->type = COLL_SYM;
3407       break;
3408     case OP_OPEN_EQUIV_CLASS:
3409       elem->type = EQUIV_CLASS;
3410       break;
3411     case OP_OPEN_CHAR_CLASS:
3412       elem->type = CHAR_CLASS;
3413       break;
3414     default:
3415       break;
3416     }
3417   return REG_NOERROR;
3418 }
3419
3420   /* Helper function for parse_bracket_exp.
3421      Build the equivalence class which is represented by NAME.
3422      The result are written to MBCSET and SBCSET.
3423      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3424      is a pointer argument since we may update it.  */
3425
3426 static reg_errcode_t
3427 #ifdef RE_ENABLE_I18N
3428 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3429                    int *equiv_class_alloc, const unsigned char *name)
3430 #else /* not RE_ENABLE_I18N */
3431 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3432 #endif /* not RE_ENABLE_I18N */
3433 {
3434 #ifdef _LIBC
3435   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3436   if (nrules != 0)
3437     {
3438       const int32_t *table, *indirect;
3439       const unsigned char *weights, *extra, *cp;
3440       unsigned char char_buf[2];
3441       int32_t idx1, idx2;
3442       unsigned int ch;
3443       size_t len;
3444       /* This #include defines a local function!  */
3445 # include <locale/weight.h>
3446       /* Calculate the index for equivalence class.  */
3447       cp = name;
3448       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3449       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450                                                _NL_COLLATE_WEIGHTMB);
3451       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3452                                                    _NL_COLLATE_EXTRAMB);
3453       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3454                                                 _NL_COLLATE_INDIRECTMB);
3455       idx1 = findidx (&cp);
3456       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3457         /* This isn't a valid character.  */
3458         return REG_ECOLLATE;
3459
3460       /* Build single byte matcing table for this equivalence class.  */
3461       char_buf[1] = (unsigned char) '\0';
3462       len = weights[idx1 & 0xffffff];
3463       for (ch = 0; ch < SBC_MAX; ++ch)
3464         {
3465           char_buf[0] = ch;
3466           cp = char_buf;
3467           idx2 = findidx (&cp);
3468 /*
3469           idx2 = table[ch];
3470 */
3471           if (idx2 == 0)
3472             /* This isn't a valid character.  */
3473             continue;
3474           /* Compare only if the length matches and the collation rule
3475              index is the same.  */
3476           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3477             {
3478               int cnt = 0;
3479
3480               while (cnt <= len &&
3481                      weights[(idx1 & 0xffffff) + 1 + cnt]
3482                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3483                 ++cnt;
3484
3485               if (cnt > len)
3486                 bitset_set (sbcset, ch);
3487             }
3488         }
3489       /* Check whether the array has enough space.  */
3490       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3491         {
3492           /* Not enough, realloc it.  */
3493           /* +1 in case of mbcset->nequiv_classes is 0.  */
3494           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3495           /* Use realloc since the array is NULL if *alloc == 0.  */
3496           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3497                                                    int32_t,
3498                                                    new_equiv_class_alloc);
3499           if (BE (new_equiv_classes == NULL, 0))
3500             return REG_ESPACE;
3501           mbcset->equiv_classes = new_equiv_classes;
3502           *equiv_class_alloc = new_equiv_class_alloc;
3503         }
3504       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3505     }
3506   else
3507 #endif /* _LIBC */
3508     {
3509       if (BE (strlen ((const char *) name) != 1, 0))
3510         return REG_ECOLLATE;
3511       bitset_set (sbcset, *name);
3512     }
3513   return REG_NOERROR;
3514 }
3515
3516   /* Helper function for parse_bracket_exp.
3517      Build the character class which is represented by NAME.
3518      The result are written to MBCSET and SBCSET.
3519      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3520      is a pointer argument since we may update it.  */
3521
3522 static reg_errcode_t
3523 #ifdef RE_ENABLE_I18N
3524 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3525                  re_charset_t *mbcset, int *char_class_alloc,
3526                  const char *class_name, reg_syntax_t syntax)
3527 #else /* not RE_ENABLE_I18N */
3528 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3529                  const char *class_name, reg_syntax_t syntax)
3530 #endif /* not RE_ENABLE_I18N */
3531 {
3532   int i;
3533
3534   /* In case of REG_ICASE "upper" and "lower" match the both of
3535      upper and lower cases.  */
3536   if ((syntax & RE_ICASE)
3537       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3538     class_name = "alpha";
3539
3540 #ifdef RE_ENABLE_I18N
3541   /* Check the space of the arrays.  */
3542   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3543     {
3544       /* Not enough, realloc it.  */
3545       /* +1 in case of mbcset->nchar_classes is 0.  */
3546       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3547       /* Use realloc since array is NULL if *alloc == 0.  */
3548       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3549                                                new_char_class_alloc);
3550       if (BE (new_char_classes == NULL, 0))
3551         return REG_ESPACE;
3552       mbcset->char_classes = new_char_classes;
3553       *char_class_alloc = new_char_class_alloc;
3554     }
3555   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3556 #endif /* RE_ENABLE_I18N */
3557
3558 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3559   do {                                          \
3560     if (BE (trans != NULL, 0))                  \
3561       {                                         \
3562         for (i = 0; i < SBC_MAX; ++i)           \
3563           if (ctype_func (i))                   \
3564             bitset_set (sbcset, trans[i]);      \
3565       }                                         \
3566     else                                        \
3567       {                                         \
3568         for (i = 0; i < SBC_MAX; ++i)           \
3569           if (ctype_func (i))                   \
3570             bitset_set (sbcset, i);             \
3571       }                                         \
3572   } while (0)
3573
3574   if (strcmp (class_name, "alnum") == 0)
3575     BUILD_CHARCLASS_LOOP (isalnum);
3576   else if (strcmp (class_name, "cntrl") == 0)
3577     BUILD_CHARCLASS_LOOP (iscntrl);
3578   else if (strcmp (class_name, "lower") == 0)
3579     BUILD_CHARCLASS_LOOP (islower);
3580   else if (strcmp (class_name, "space") == 0)
3581     BUILD_CHARCLASS_LOOP (isspace);
3582   else if (strcmp (class_name, "alpha") == 0)
3583     BUILD_CHARCLASS_LOOP (isalpha);
3584   else if (strcmp (class_name, "digit") == 0)
3585     BUILD_CHARCLASS_LOOP (isdigit);
3586   else if (strcmp (class_name, "print") == 0)
3587     BUILD_CHARCLASS_LOOP (isprint);
3588   else if (strcmp (class_name, "upper") == 0)
3589     BUILD_CHARCLASS_LOOP (isupper);
3590   else if (strcmp (class_name, "blank") == 0)
3591 #ifndef GAWK
3592     BUILD_CHARCLASS_LOOP (isblank);
3593 #else
3594     /* see comments above */
3595     BUILD_CHARCLASS_LOOP (is_blank);
3596 #endif
3597   else if (strcmp (class_name, "graph") == 0)
3598     BUILD_CHARCLASS_LOOP (isgraph);
3599   else if (strcmp (class_name, "punct") == 0)
3600     BUILD_CHARCLASS_LOOP (ispunct);
3601   else if (strcmp (class_name, "xdigit") == 0)
3602     BUILD_CHARCLASS_LOOP (isxdigit);
3603   else
3604     return REG_ECTYPE;
3605
3606   return REG_NOERROR;
3607 }
3608
3609 static bin_tree_t *
3610 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3611                     const char *class_name,
3612                     const char *extra, int non_match,
3613                     reg_errcode_t *err)
3614 {
3615   re_bitset_ptr_t sbcset;
3616 #ifdef RE_ENABLE_I18N
3617   re_charset_t *mbcset;
3618   int alloc = 0;
3619 #endif /* not RE_ENABLE_I18N */
3620   reg_errcode_t ret;
3621   re_token_t br_token;
3622   bin_tree_t *tree;
3623
3624   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3625 #ifdef RE_ENABLE_I18N
3626   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3627 #endif /* RE_ENABLE_I18N */
3628
3629 #ifdef RE_ENABLE_I18N
3630   if (BE (sbcset == NULL || mbcset == NULL, 0))
3631 #else /* not RE_ENABLE_I18N */
3632   if (BE (sbcset == NULL, 0))
3633 #endif /* not RE_ENABLE_I18N */
3634     {
3635       *err = REG_ESPACE;
3636       return NULL;
3637     }
3638
3639   if (non_match)
3640     {
3641 #ifdef RE_ENABLE_I18N
3642       mbcset->non_match = 1;
3643 #endif /* not RE_ENABLE_I18N */
3644     }
3645
3646   /* We don't care the syntax in this case.  */
3647   ret = build_charclass (trans, sbcset,
3648 #ifdef RE_ENABLE_I18N
3649                          mbcset, &alloc,
3650 #endif /* RE_ENABLE_I18N */
3651                          class_name, 0);
3652
3653   if (BE (ret != REG_NOERROR, 0))
3654     {
3655       re_free (sbcset);
3656 #ifdef RE_ENABLE_I18N
3657       free_charset (mbcset);
3658 #endif /* RE_ENABLE_I18N */
3659       *err = ret;
3660       return NULL;
3661     }
3662   /* \w match '_' also.  */
3663   for (; *extra; extra++)
3664     bitset_set (sbcset, *extra);
3665
3666   /* If it is non-matching list.  */
3667   if (non_match)
3668     bitset_not (sbcset);
3669
3670 #ifdef RE_ENABLE_I18N
3671   /* Ensure only single byte characters are set.  */
3672   if (dfa->mb_cur_max > 1)
3673     bitset_mask (sbcset, dfa->sb_char);
3674 #endif
3675
3676   /* Build a tree for simple bracket.  */
3677   br_token.type = SIMPLE_BRACKET;
3678   br_token.opr.sbcset = sbcset;
3679   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680   if (BE (tree == NULL, 0))
3681     goto build_word_op_espace;
3682
3683 #ifdef RE_ENABLE_I18N
3684   if (dfa->mb_cur_max > 1)
3685     {
3686       bin_tree_t *mbc_tree;
3687       /* Build a tree for complex bracket.  */
3688       br_token.type = COMPLEX_BRACKET;
3689       br_token.opr.mbcset = mbcset;
3690       dfa->has_mb_node = 1;
3691       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3692       if (BE (mbc_tree == NULL, 0))
3693         goto build_word_op_espace;
3694       /* Then join them by ALT node.  */
3695       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3696       if (BE (mbc_tree != NULL, 1))
3697         return tree;
3698     }
3699   else
3700     {
3701       free_charset (mbcset);
3702       return tree;
3703     }
3704 #else /* not RE_ENABLE_I18N */
3705   return tree;
3706 #endif /* not RE_ENABLE_I18N */
3707
3708  build_word_op_espace:
3709   re_free (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711   free_charset (mbcset);
3712 #endif /* RE_ENABLE_I18N */
3713   *err = REG_ESPACE;
3714   return NULL;
3715 }
3716
3717 /* This is intended for the expressions like "a{1,3}".
3718    Fetch a number from `input', and return the number.
3719    Return -1, if the number field is empty like "{,1}".
3720    Return -2, if an error has occurred.  */
3721
3722 static int
3723 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3724 {
3725   int num = -1;
3726   unsigned char c;
3727   while (1)
3728     {
3729       fetch_token (token, input, syntax);
3730       c = token->opr.c;
3731       if (BE (token->type == END_OF_RE, 0))
3732         return -2;
3733       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3734         break;
3735       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3736              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3737       num = (num > RE_DUP_MAX) ? -2 : num;
3738     }
3739   return num;
3740 }
3741 \f
3742 #ifdef RE_ENABLE_I18N
3743 static void
3744 free_charset (re_charset_t *cset)
3745 {
3746   re_free (cset->mbchars);
3747 # ifdef _LIBC
3748   re_free (cset->coll_syms);
3749   re_free (cset->equiv_classes);
3750   re_free (cset->range_starts);
3751   re_free (cset->range_ends);
3752 # endif
3753   re_free (cset->char_classes);
3754   re_free (cset);
3755 }
3756 #endif /* RE_ENABLE_I18N */
3757 \f
3758 /* Functions for binary tree operation.  */
3759
3760 /* Create a tree node.  */
3761
3762 static bin_tree_t *
3763 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3764              re_token_type_t type)
3765 {
3766   re_token_t t;
3767   t.type = type;
3768   return create_token_tree (dfa, left, right, &t);
3769 }
3770
3771 static bin_tree_t *
3772 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3773                    const re_token_t *token)
3774 {
3775   bin_tree_t *tree;
3776   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3777     {
3778       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3779
3780       if (storage == NULL)
3781         return NULL;
3782       storage->next = dfa->str_tree_storage;
3783       dfa->str_tree_storage = storage;
3784       dfa->str_tree_storage_idx = 0;
3785     }
3786   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3787
3788   tree->parent = NULL;
3789   tree->left = left;
3790   tree->right = right;
3791   tree->token = *token;
3792   tree->token.duplicated = 0;
3793   tree->token.opt_subexp = 0;
3794   tree->first = NULL;
3795   tree->next = NULL;
3796   tree->node_idx = -1;
3797
3798   if (left != NULL)
3799     left->parent = tree;
3800   if (right != NULL)
3801     right->parent = tree;
3802   return tree;
3803 }
3804
3805 /* Mark the tree SRC as an optional subexpression.
3806    To be called from preorder or postorder.  */
3807
3808 static reg_errcode_t
3809 mark_opt_subexp (void *extra, bin_tree_t *node)
3810 {
3811   int idx = (int) (intptr_t) extra;
3812   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3813     node->token.opt_subexp = 1;
3814
3815   return REG_NOERROR;
3816 }
3817
3818 /* Free the allocated memory inside NODE. */
3819
3820 static void
3821 free_token (re_token_t *node)
3822 {
3823 #ifdef RE_ENABLE_I18N
3824   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3825     free_charset (node->opr.mbcset);
3826   else
3827 #endif /* RE_ENABLE_I18N */
3828     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3829       re_free (node->opr.sbcset);
3830 }
3831
3832 /* Worker function for tree walking.  Free the allocated memory inside NODE
3833    and its children. */
3834
3835 static reg_errcode_t
3836 free_tree (void *extra, bin_tree_t *node)
3837 {
3838   free_token (&node->token);
3839   return REG_NOERROR;
3840 }
3841
3842
3843 /* Duplicate the node SRC, and return new node.  This is a preorder
3844    visit similar to the one implemented by the generic visitor, but
3845    we need more infrastructure to maintain two parallel trees --- so,
3846    it's easier to duplicate.  */
3847
3848 static bin_tree_t *
3849 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3850 {
3851   const bin_tree_t *node;
3852   bin_tree_t *dup_root;
3853   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3854
3855   for (node = root; ; )
3856     {
3857       /* Create a new tree and link it back to the current parent.  */
3858       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3859       if (*p_new == NULL)
3860         return NULL;
3861       (*p_new)->parent = dup_node;
3862       (*p_new)->token.duplicated = 1;
3863       dup_node = *p_new;
3864
3865       /* Go to the left node, or up and to the right.  */
3866       if (node->left)
3867         {
3868           node = node->left;
3869           p_new = &dup_node->left;
3870         }
3871       else
3872         {
3873           const bin_tree_t *prev = NULL;
3874           while (node->right == prev || node->right == NULL)
3875             {
3876               prev = node;
3877               node = node->parent;
3878               dup_node = dup_node->parent;
3879               if (!node)
3880                 return dup_root;
3881             }
3882           node = node->right;
3883           p_new = &dup_node->right;
3884         }
3885     }
3886 }