posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2015 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 #ifdef _LIBC
  23 # include <locale/weight.h>
  24 #endif
  25
  26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  27                                           size_t length, reg_syntax_t syntax);
  28 static void re_compile_fastmap_iter (regex_t *bufp,
  29                                      const re_dfastate_t *init_state,
  30                                      char *fastmap);
  31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  32 #ifdef RE_ENABLE_I18N
  33 static void free_charset (re_charset_t *cset);
  34 #endif /* RE_ENABLE_I18N */
  35 static void free_workarea_compile (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 #ifdef RE_ENABLE_I18N
  38 static void optimize_utf8 (re_dfa_t *dfa);
  39 #endif
  40 static reg_errcode_t analyze (regex_t *preg);
  41 static reg_errcode_t preorder (bin_tree_t *root,
  42                                reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                void *extra);
  44 static reg_errcode_t postorder (bin_tree_t *root,
  45                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  46                                 void *extra);
  47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  50                                  bin_tree_t *node);
  51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static int peek_token (re_token_t *token, re_string_t *input,
  64                         reg_syntax_t syntax) internal_function;
  65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  66                           reg_syntax_t syntax, reg_errcode_t *err);
  67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  68                                   re_token_t *token, reg_syntax_t syntax,
  69                                   int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  71                                  re_token_t *token, reg_syntax_t syntax,
  72                                  int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  74                                      re_token_t *token, reg_syntax_t syntax,
  75                                      int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  77                                   re_token_t *token, reg_syntax_t syntax,
  78                                   int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  80                                  re_dfa_t *dfa, re_token_t *token,
  81                                  reg_syntax_t syntax, reg_errcode_t *err);
  82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  83                                       re_token_t *token, reg_syntax_t syntax,
  84                                       reg_errcode_t *err);
  85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  86                                             re_string_t *regexp,
  87                                             re_token_t *token, int token_len,
  88                                             re_dfa_t *dfa,
  89                                             reg_syntax_t syntax,
  90                                             int accept_hyphen);
  91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  92                                           re_string_t *regexp,
  93                                           re_token_t *token);
  94 #ifdef RE_ENABLE_I18N
  95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  96                                         re_charset_t *mbcset,
  97                                         int *equiv_class_alloc,
  98                                         const unsigned char *name);
  99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 100                                       bitset_t sbcset,
 101                                       re_charset_t *mbcset,
 102                                       int *char_class_alloc,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #else  /* not RE_ENABLE_I18N */
 106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 107                                         const unsigned char *name);
 108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 109                                       bitset_t sbcset,
 110                                       const unsigned char *class_name,
 111                                       reg_syntax_t syntax);
 112 #endif /* not RE_ENABLE_I18N */
 113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 114                                        RE_TRANSLATE_TYPE trans,
 115                                        const unsigned char *class_name,
 116                                        const unsigned char *extra,
 117                                        int non_match, reg_errcode_t *err);
 118 static bin_tree_t *create_tree (re_dfa_t *dfa,
 119                                 bin_tree_t *left, bin_tree_t *right,
 120                                 re_token_type_t type);
 121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 122                                       bin_tree_t *left, bin_tree_t *right,
 123                                       const re_token_t *token);
 124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 125 static void free_token (re_token_t *node);
 126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 128 \f
 129 /* This table gives an error message for each of the error codes listed
 130    in regex.h.  Obviously the order here has to be same as there.
 131    POSIX doesn't require that we do anything for REG_NOERROR,
 132    but why not be nice?  */
 133
 134 const char __re_error_msgid[] attribute_hidden =
 135   {
 136 #define REG_NOERROR_IDX 0
 137     gettext_noop ("Success")    /* REG_NOERROR */
 138     "\0"
 139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 140     gettext_noop ("No match")   /* REG_NOMATCH */
 141     "\0"
 142 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 143     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 144     "\0"
 145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 146     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 147     "\0"
 148 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 149     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 150     "\0"
 151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 152     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 153     "\0"
 154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 155     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 156     "\0"
 157 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 158     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 159     "\0"
 160 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 161     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 162     "\0"
 163 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 164     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 165     "\0"
 166 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 167     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 168     "\0"
 169 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 170     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 171     "\0"
 172 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 173     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 174     "\0"
 175 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 176     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 177     "\0"
 178 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 179     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 180     "\0"
 181 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 182     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 183     "\0"
 184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 185     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 186   };
 187
 188 const size_t __re_error_msgid_idx[] attribute_hidden =
 189   {
 190     REG_NOERROR_IDX,
 191     REG_NOMATCH_IDX,
 192     REG_BADPAT_IDX,
 193     REG_ECOLLATE_IDX,
 194     REG_ECTYPE_IDX,
 195     REG_EESCAPE_IDX,
 196     REG_ESUBREG_IDX,
 197     REG_EBRACK_IDX,
 198     REG_EPAREN_IDX,
 199     REG_EBRACE_IDX,
 200     REG_BADBR_IDX,
 201     REG_ERANGE_IDX,
 202     REG_ESPACE_IDX,
 203     REG_BADRPT_IDX,
 204     REG_EEND_IDX,
 205     REG_ESIZE_IDX,
 206     REG_ERPAREN_IDX
 207   };
 208 \f
 209 /* Entry points for GNU code.  */
 210
 211 /* re_compile_pattern is the GNU regular expression compiler: it
 212    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 213    Returns 0 if the pattern was valid, otherwise an error string.
 214
 215    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 216    are set in BUFP on entry.  */
 217
 218 const char *
 219 re_compile_pattern (pattern, length, bufp)
 220     const char *pattern;
 221     size_t length;
 222     struct re_pattern_buffer *bufp;
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute__ ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 311   int node_cnt;
 312   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       int node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 325               wchar_t wc;
 326               mbstate_t state;
 327
 328               p = buf;
 329               *p++ = dfa->nodes[node].opr.c;
 330               while (++node < dfa->nodes_len
 331                      && dfa->nodes[node].type == CHARACTER
 332                      && dfa->nodes[node].mb_partial)
 333                 *p++ = dfa->nodes[node].opr.c;
 334               memset (&state, '\0', sizeof (state));
 335               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 336                              &state) == p - buf
 337                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 338                       != (size_t) -1))
 339                 re_set_fastmap (fastmap, 0, buf[0]);
 340             }
 341 #endif
 342         }
 343       else if (type == SIMPLE_BRACKET)
 344         {
 345           int i, ch;
 346           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 347             {
 348               int j;
 349               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 350               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 351                 if (w & ((bitset_word_t) 1 << j))
 352                   re_set_fastmap (fastmap, icase, ch);
 353             }
 354         }
 355 #ifdef RE_ENABLE_I18N
 356       else if (type == COMPLEX_BRACKET)
 357         {
 358           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 359           int i;
 360
 361 # ifdef _LIBC
 362           /* See if we have to try all bytes which start multiple collation
 363              elements.
 364              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 365                   collation element, and don't catch 'b' since 'b' is
 366                   the only collation element which starts from 'b' (and
 367                   it is caught by SIMPLE_BRACKET).  */
 368               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 369                   && (cset->ncoll_syms || cset->nranges))
 370                 {
 371                   const int32_t *table = (const int32_t *)
 372                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 373                   for (i = 0; i < SBC_MAX; ++i)
 374                     if (table[i] < 0)
 375                       re_set_fastmap (fastmap, icase, i);
 376                 }
 377 # endif /* _LIBC */
 378
 379           /* See if we have to start the match at all multibyte characters,
 380              i.e. where we would not find an invalid sequence.  This only
 381              applies to multibyte character sets; for single byte character
 382              sets, the SIMPLE_BRACKET again suffices.  */
 383           if (dfa->mb_cur_max > 1
 384               && (cset->nchar_classes || cset->non_match || cset->nranges
 385 # ifdef _LIBC
 386                   || cset->nequiv_classes
 387 # endif /* _LIBC */
 388                  ))
 389             {
 390               unsigned char c = 0;
 391               do
 392                 {
 393                   mbstate_t mbs;
 394                   memset (&mbs, 0, sizeof (mbs));
 395                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 396                     re_set_fastmap (fastmap, false, (int) c);
 397                 }
 398               while (++c != 0);
 399             }
 400
 401           else
 402             {
 403               /* ... Else catch all bytes which can start the mbchars.  */
 404               for (i = 0; i < cset->nmbchars; ++i)
 405                 {
 406                   char buf[256];
 407                   mbstate_t state;
 408                   memset (&state, '\0', sizeof (state));
 409                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 410                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 411                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 412                     {
 413                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 414                           != (size_t) -1)
 415                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 416                     }
 417                 }
 418             }
 419         }
 420 #endif /* RE_ENABLE_I18N */
 421       else if (type == OP_PERIOD
 422 #ifdef RE_ENABLE_I18N
 423                || type == OP_UTF8_PERIOD
 424 #endif /* RE_ENABLE_I18N */
 425                || type == END_OF_RE)
 426         {
 427           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 428           if (type == END_OF_RE)
 429             bufp->can_be_null = 1;
 430           return;
 431         }
 432     }
 433 }
 434 \f
 435 /* Entry point for POSIX code.  */
 436 /* regcomp takes a regular expression as a string and compiles it.
 437
 438    PREG is a regex_t *.  We do not expect any fields to be initialized,
 439    since POSIX says we shouldn't.  Thus, we set
 440
 441      'buffer' to the compiled pattern;
 442      'used' to the length of the compiled pattern;
 443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 445        RE_SYNTAX_POSIX_BASIC;
 446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 447      'fastmap' to an allocated space for the fastmap;
 448      'fastmap_accurate' to zero;
 449      're_nsub' to the number of subexpressions in PATTERN.
 450
 451    PATTERN is the address of the pattern string.
 452
 453    CFLAGS is a series of bits which affect compilation.
 454
 455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 456      use POSIX basic syntax.
 457
 458      If REG_NEWLINE is set, then . and [^...] don't match newline.
 459      Also, regexec will try a match beginning after every newline.
 460
 461      If REG_ICASE is set, then we considers upper- and lowercase
 462      versions of letters to be equivalent when matching.
 463
 464      If REG_NOSUB is set, then when PREG is passed to regexec, that
 465      routine will report only success or failure, and nothing about the
 466      registers.
 467
 468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 469    the return codes and their meanings.)  */
 470
 471 int
 472 regcomp (preg, pattern, cflags)
 473     regex_t *__restrict preg;
 474     const char *__restrict pattern;
 475     int cflags;
 476 {
 477   reg_errcode_t ret;
 478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 479                          : RE_SYNTAX_POSIX_BASIC);
 480
 481   preg->buffer = NULL;
 482   preg->allocated = 0;
 483   preg->used = 0;
 484
 485   /* Try to allocate space for the fastmap.  */
 486   preg->fastmap = re_malloc (char, SBC_MAX);
 487   if (BE (preg->fastmap == NULL, 0))
 488     return REG_ESPACE;
 489
 490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 491
 492   /* If REG_NEWLINE is set, newlines are treated differently.  */
 493   if (cflags & REG_NEWLINE)
 494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 495       syntax &= ~RE_DOT_NEWLINE;
 496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 497       /* It also changes the matching behavior.  */
 498       preg->newline_anchor = 1;
 499     }
 500   else
 501     preg->newline_anchor = 0;
 502   preg->no_sub = !!(cflags & REG_NOSUB);
 503   preg->translate = NULL;
 504
 505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 506
 507   /* POSIX doesn't distinguish between an unmatched open-group and an
 508      unmatched close-group: both are REG_EPAREN.  */
 509   if (ret == REG_ERPAREN)
 510     ret = REG_EPAREN;
 511
 512   /* We have already checked preg->fastmap != NULL.  */
 513   if (BE (ret == REG_NOERROR, 1))
 514     /* Compute the fastmap now, since regexec cannot modify the pattern
 515        buffer.  This function never fails in this implementation.  */
 516     (void) re_compile_fastmap (preg);
 517   else
 518     {
 519       /* Some error occurred while compiling the expression.  */
 520       re_free (preg->fastmap);
 521       preg->fastmap = NULL;
 522     }
 523
 524   return (int) ret;
 525 }
 526 #ifdef _LIBC
 527 weak_alias (__regcomp, regcomp)
 528 #endif
 529
 530 /* Returns a message corresponding to an error code, ERRCODE, returned
 531    from either regcomp or regexec.   We don't use PREG here.  */
 532
 533 size_t
 534 regerror (errcode, preg, errbuf, errbuf_size)
 535     int errcode;
 536     const regex_t *__restrict preg;
 537     char *__restrict errbuf;
 538     size_t errbuf_size;
 539 {
 540   const char *msg;
 541   size_t msg_size;
 542
 543   if (BE (errcode < 0
 544           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 545                                / sizeof (__re_error_msgid_idx[0])), 0))
 546     /* Only error codes returned by the rest of the code should be passed
 547        to this routine.  If we are given anything else, or if other regex
 548        code generates an invalid error code, then the program has a bug.
 549        Dump core so we can fix it.  */
 550     abort ();
 551
 552   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 553
 554   msg_size = strlen (msg) + 1; /* Includes the null.  */
 555
 556   if (BE (errbuf_size != 0, 1))
 557     {
 558       if (BE (msg_size > errbuf_size, 0))
 559         {
 560 #if defined HAVE_MEMPCPY || defined _LIBC
 561           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 562 #else
 563           memcpy (errbuf, msg, errbuf_size - 1);
 564           errbuf[errbuf_size - 1] = 0;
 565 #endif
 566         }
 567       else
 568         memcpy (errbuf, msg, msg_size);
 569     }
 570
 571   return msg_size;
 572 }
 573 #ifdef _LIBC
 574 weak_alias (__regerror, regerror)
 575 #endif
 576
 577
 578 #ifdef RE_ENABLE_I18N
 579 /* This static array is used for the map to single-byte characters when
 580    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 581    it the same all the time.  UTF-8 is the preferred encoding so this is
 582    a worthwhile optimization.  */
 583 static const bitset_t utf8_sb_map =
 584 {
 585   /* Set the first 128 bits.  */
 586   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 587 };
 588 #endif
 589
 590
 591 static void
 592 free_dfa_content (re_dfa_t *dfa)
 593 {
 594   int i, j;
 595
 596   if (dfa->nodes)
 597     for (i = 0; i < dfa->nodes_len; ++i)
 598       free_token (dfa->nodes + i);
 599   re_free (dfa->nexts);
 600   for (i = 0; i < dfa->nodes_len; ++i)
 601     {
 602       if (dfa->eclosures != NULL)
 603         re_node_set_free (dfa->eclosures + i);
 604       if (dfa->inveclosures != NULL)
 605         re_node_set_free (dfa->inveclosures + i);
 606       if (dfa->edests != NULL)
 607         re_node_set_free (dfa->edests + i);
 608     }
 609   re_free (dfa->edests);
 610   re_free (dfa->eclosures);
 611   re_free (dfa->inveclosures);
 612   re_free (dfa->nodes);
 613
 614   if (dfa->state_table)
 615     for (i = 0; i <= dfa->state_hash_mask; ++i)
 616       {
 617         struct re_state_table_entry *entry = dfa->state_table + i;
 618         for (j = 0; j < entry->num; ++j)
 619           {
 620             re_dfastate_t *state = entry->array[j];
 621             free_state (state);
 622           }
 623         re_free (entry->array);
 624       }
 625   re_free (dfa->state_table);
 626 #ifdef RE_ENABLE_I18N
 627   if (dfa->sb_char != utf8_sb_map)
 628     re_free (dfa->sb_char);
 629 #endif
 630   re_free (dfa->subexp_map);
 631 #ifdef DEBUG
 632   re_free (dfa->re_str);
 633 #endif
 634
 635   re_free (dfa);
 636 }
 637
 638
 639 /* Free dynamically allocated space used by PREG.  */
 640
 641 void
 642 regfree (preg)
 643     regex_t *preg;
 644 {
 645   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 646   if (BE (dfa != NULL, 1))
 647     free_dfa_content (dfa);
 648   preg->buffer = NULL;
 649   preg->allocated = 0;
 650
 651   re_free (preg->fastmap);
 652   preg->fastmap = NULL;
 653
 654   re_free (preg->translate);
 655   preg->translate = NULL;
 656 }
 657 #ifdef _LIBC
 658 weak_alias (__regfree, regfree)
 659 #endif
 660 \f
 661 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 662    them unless specifically requested.  */
 663
 664 #if defined _REGEX_RE_COMP || defined _LIBC
 665
 666 /* BSD has one and only one pattern buffer.  */
 667 static struct re_pattern_buffer re_comp_buf;
 668
 669 char *
 670 # ifdef _LIBC
 671 /* Make these definitions weak in libc, so POSIX programs can redefine
 672    these names if they don't use our functions, and still use
 673    regcomp/regexec above without link errors.  */
 674 weak_function
 675 # endif
 676 re_comp (s)
 677      const char *s;
 678 {
 679   reg_errcode_t ret;
 680   char *fastmap;
 681
 682   if (!s)
 683     {
 684       if (!re_comp_buf.buffer)
 685         return gettext ("No previous regular expression");
 686       return 0;
 687     }
 688
 689   if (re_comp_buf.buffer)
 690     {
 691       fastmap = re_comp_buf.fastmap;
 692       re_comp_buf.fastmap = NULL;
 693       __regfree (&re_comp_buf);
 694       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 695       re_comp_buf.fastmap = fastmap;
 696     }
 697
 698   if (re_comp_buf.fastmap == NULL)
 699     {
 700       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 701       if (re_comp_buf.fastmap == NULL)
 702         return (char *) gettext (__re_error_msgid
 703                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 704     }
 705
 706   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 707      don't need to initialize the pattern buffer fields which affect it.  */
 708
 709   /* Match anchors at newlines.  */
 710   re_comp_buf.newline_anchor = 1;
 711
 712   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 713
 714   if (!ret)
 715     return NULL;
 716
 717   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 718   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 719 }
 720
 721 #ifdef _LIBC
 722 libc_freeres_fn (free_mem)
 723 {
 724   __regfree (&re_comp_buf);
 725 }
 726 #endif
 727
 728 #endif /* _REGEX_RE_COMP */
 729 \f
 730 /* Internal entry point.
 731    Compile the regular expression PATTERN, whose length is LENGTH.
 732    SYNTAX indicate regular expression's syntax.  */
 733
 734 static reg_errcode_t
 735 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 736                      reg_syntax_t syntax)
 737 {
 738   reg_errcode_t err = REG_NOERROR;
 739   re_dfa_t *dfa;
 740   re_string_t regexp;
 741
 742   /* Initialize the pattern buffer.  */
 743   preg->fastmap_accurate = 0;
 744   preg->syntax = syntax;
 745   preg->not_bol = preg->not_eol = 0;
 746   preg->used = 0;
 747   preg->re_nsub = 0;
 748   preg->can_be_null = 0;
 749   preg->regs_allocated = REGS_UNALLOCATED;
 750
 751   /* Initialize the dfa.  */
 752   dfa = (re_dfa_t *) preg->buffer;
 753   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 754     {
 755       /* If zero allocated, but buffer is non-null, try to realloc
 756          enough space.  This loses if buffer's address is bogus, but
 757          that is the user's responsibility.  If ->buffer is NULL this
 758          is a simple allocation.  */
 759       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 760       if (dfa == NULL)
 761         return REG_ESPACE;
 762       preg->allocated = sizeof (re_dfa_t);
 763       preg->buffer = (unsigned char *) dfa;
 764     }
 765   preg->used = sizeof (re_dfa_t);
 766
 767   err = init_dfa (dfa, length);
 768   if (BE (err != REG_NOERROR, 0))
 769     {
 770       free_dfa_content (dfa);
 771       preg->buffer = NULL;
 772       preg->allocated = 0;
 773       return err;
 774     }
 775 #ifdef DEBUG
 776   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 777   dfa->re_str = re_malloc (char, length + 1);
 778   strncpy (dfa->re_str, pattern, length + 1);
 779 #endif
 780
 781   __libc_lock_init (dfa->lock);
 782
 783   err = re_string_construct (&regexp, pattern, length, preg->translate,
 784                              syntax & RE_ICASE, dfa);
 785   if (BE (err != REG_NOERROR, 0))
 786     {
 787     re_compile_internal_free_return:
 788       free_workarea_compile (preg);
 789       re_string_destruct (&regexp);
 790       free_dfa_content (dfa);
 791       preg->buffer = NULL;
 792       preg->allocated = 0;
 793       return err;
 794     }
 795
 796   /* Parse the regular expression, and build a structure tree.  */
 797   preg->re_nsub = 0;
 798   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 799   if (BE (dfa->str_tree == NULL, 0))
 800     goto re_compile_internal_free_return;
 801
 802   /* Analyze the tree and create the nfa.  */
 803   err = analyze (preg);
 804   if (BE (err != REG_NOERROR, 0))
 805     goto re_compile_internal_free_return;
 806
 807 #ifdef RE_ENABLE_I18N
 808   /* If possible, do searching in single byte encoding to speed things up.  */
 809   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 810     optimize_utf8 (dfa);
 811 #endif
 812
 813   /* Then create the initial state of the dfa.  */
 814   err = create_initial_state (dfa);
 815
 816   /* Release work areas.  */
 817   free_workarea_compile (preg);
 818   re_string_destruct (&regexp);
 819
 820   if (BE (err != REG_NOERROR, 0))
 821     {
 822       free_dfa_content (dfa);
 823       preg->buffer = NULL;
 824       preg->allocated = 0;
 825     }
 826
 827   return err;
 828 }
 829
 830 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 831    as the initial length of some arrays.  */
 832
 833 static reg_errcode_t
 834 init_dfa (re_dfa_t *dfa, size_t pat_len)
 835 {
 836   unsigned int table_size;
 837 #ifndef _LIBC
 838   char *codeset_name;
 839 #endif
 840
 841   memset (dfa, '\0', sizeof (re_dfa_t));
 842
 843   /* Force allocation of str_tree_storage the first time.  */
 844   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 845
 846   /* Avoid overflows.  */
 847   if (pat_len == SIZE_MAX)
 848     return REG_ESPACE;
 849
 850   dfa->nodes_alloc = pat_len + 1;
 851   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 852
 853   /*  table_size = 2 ^ ceil(log pat_len) */
 854   for (table_size = 1; ; table_size <<= 1)
 855     if (table_size > pat_len)
 856       break;
 857
 858   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 859   dfa->state_hash_mask = table_size - 1;
 860
 861   dfa->mb_cur_max = MB_CUR_MAX;
 862 #ifdef _LIBC
 863   if (dfa->mb_cur_max == 6
 864       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 865     dfa->is_utf8 = 1;
 866   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 867                        != 0);
 868 #else
 869 # ifdef HAVE_LANGINFO_CODESET
 870   codeset_name = nl_langinfo (CODESET);
 871 # else
 872   codeset_name = getenv ("LC_ALL");
 873   if (codeset_name == NULL || codeset_name[0] == '\0')
 874     codeset_name = getenv ("LC_CTYPE");
 875   if (codeset_name == NULL || codeset_name[0] == '\0')
 876     codeset_name = getenv ("LANG");
 877   if (codeset_name == NULL)
 878     codeset_name = "";
 879   else if (strchr (codeset_name, '.') !=  NULL)
 880     codeset_name = strchr (codeset_name, '.') + 1;
 881 # endif
 882
 883   if (strcasecmp (codeset_name, "UTF-8") == 0
 884       || strcasecmp (codeset_name, "UTF8") == 0)
 885     dfa->is_utf8 = 1;
 886
 887   /* We check exhaustively in the loop below if this charset is a
 888      superset of ASCII.  */
 889   dfa->map_notascii = 0;
 890 #endif
 891
 892 #ifdef RE_ENABLE_I18N
 893   if (dfa->mb_cur_max > 1)
 894     {
 895       if (dfa->is_utf8)
 896         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 897       else
 898         {
 899           int i, j, ch;
 900
 901           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 902           if (BE (dfa->sb_char == NULL, 0))
 903             return REG_ESPACE;
 904
 905           /* Set the bits corresponding to single byte chars.  */
 906           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 907             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 908               {
 909                 wint_t wch = __btowc (ch);
 910                 if (wch != WEOF)
 911                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 912 # ifndef _LIBC
 913                 if (isascii (ch) && wch != ch)
 914                   dfa->map_notascii = 1;
 915 # endif
 916               }
 917         }
 918     }
 919 #endif
 920
 921   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 922     return REG_ESPACE;
 923   return REG_NOERROR;
 924 }
 925
 926 /* Initialize WORD_CHAR table, which indicate which character is
 927    "word".  In this case "word" means that it is the word construction
 928    character used by some operators like "\<", "\>", etc.  */
 929
 930 static void
 931 internal_function
 932 init_word_char (re_dfa_t *dfa)
 933 {
 934   dfa->word_ops_used = 1;
 935   int i = 0;
 936   int ch = 0;
 937   if (BE (dfa->map_notascii == 0, 1))
 938     {
 939       if (sizeof (dfa->word_char[0]) == 8)
 940         {
 941           /* The extra temporaries here avoid "implicitly truncated"
 942              warnings in the case when this is dead code, i.e. 32-bit.  */
 943           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
 944           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
 945           dfa->word_char[0] = wc0;
 946           dfa->word_char[1] = wc1;
 947           i = 2;
 948         }
 949       else if (sizeof (dfa->word_char[0]) == 4)
 950         {
 951           dfa->word_char[0] = UINT32_C (0x00000000);
 952           dfa->word_char[1] = UINT32_C (0x03ff0000);
 953           dfa->word_char[2] = UINT32_C (0x87fffffe);
 954           dfa->word_char[3] = UINT32_C (0x07fffffe);
 955           i = 4;
 956         }
 957       else
 958         abort ();
 959       ch = 128;
 960
 961       if (BE (dfa->is_utf8, 1))
 962         {
 963           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 964           return;
 965         }
 966     }
 967
 968   for (; i < BITSET_WORDS; ++i)
 969     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 970       if (isalnum (ch) || ch == '_')
 971         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 972 }
 973
 974 /* Free the work area which are only used while compiling.  */
 975
 976 static void
 977 free_workarea_compile (regex_t *preg)
 978 {
 979   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 980   bin_tree_storage_t *storage, *next;
 981   for (storage = dfa->str_tree_storage; storage; storage = next)
 982     {
 983       next = storage->next;
 984       re_free (storage);
 985     }
 986   dfa->str_tree_storage = NULL;
 987   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 988   dfa->str_tree = NULL;
 989   re_free (dfa->org_indices);
 990   dfa->org_indices = NULL;
 991 }
 992
 993 /* Create initial states for all contexts.  */
 994
 995 static reg_errcode_t
 996 create_initial_state (re_dfa_t *dfa)
 997 {
 998   int first, i;
 999   reg_errcode_t err;
1000   re_node_set init_nodes;
1001
1002   /* Initial states have the epsilon closure of the node which is
1003      the first node of the regular expression.  */
1004   first = dfa->str_tree->first->node_idx;
1005   dfa->init_node = first;
1006   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1007   if (BE (err != REG_NOERROR, 0))
1008     return err;
1009
1010   /* The back-references which are in initial states can epsilon transit,
1011      since in this case all of the subexpressions can be null.
1012      Then we add epsilon closures of the nodes which are the next nodes of
1013      the back-references.  */
1014   if (dfa->nbackref > 0)
1015     for (i = 0; i < init_nodes.nelem; ++i)
1016       {
1017         int node_idx = init_nodes.elems[i];
1018         re_token_type_t type = dfa->nodes[node_idx].type;
1019
1020         int clexp_idx;
1021         if (type != OP_BACK_REF)
1022           continue;
1023         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1024           {
1025             re_token_t *clexp_node;
1026             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1027             if (clexp_node->type == OP_CLOSE_SUBEXP
1028                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1029               break;
1030           }
1031         if (clexp_idx == init_nodes.nelem)
1032           continue;
1033
1034         if (type == OP_BACK_REF)
1035           {
1036             int dest_idx = dfa->edests[node_idx].elems[0];
1037             if (!re_node_set_contains (&init_nodes, dest_idx))
1038               {
1039                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1040                                                        dfa->eclosures
1041                                                        + dest_idx);
1042                 if (err != REG_NOERROR)
1043                   return err;
1044                 i = 0;
1045               }
1046           }
1047       }
1048
1049   /* It must be the first time to invoke acquire_state.  */
1050   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051   /* We don't check ERR here, since the initial state must not be NULL.  */
1052   if (BE (dfa->init_state == NULL, 0))
1053     return err;
1054   if (dfa->init_state->has_constraint)
1055     {
1056       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057                                                        CONTEXT_WORD);
1058       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059                                                      CONTEXT_NEWLINE);
1060       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061                                                          &init_nodes,
1062                                                          CONTEXT_NEWLINE
1063                                                          | CONTEXT_BEGBUF);
1064       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1065               || dfa->init_state_begbuf == NULL, 0))
1066         return err;
1067     }
1068   else
1069     dfa->init_state_word = dfa->init_state_nl
1070       = dfa->init_state_begbuf = dfa->init_state;
1071
1072   re_node_set_free (&init_nodes);
1073   return REG_NOERROR;
1074 }
1075 \f
1076 #ifdef RE_ENABLE_I18N
1077 /* If it is possible to do searching in single byte encoding instead of UTF-8
1078    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1079    DFA nodes where needed.  */
1080
1081 static void
1082 optimize_utf8 (re_dfa_t *dfa)
1083 {
1084   int node, i, mb_chars = 0, has_period = 0;
1085
1086   for (node = 0; node < dfa->nodes_len; ++node)
1087     switch (dfa->nodes[node].type)
1088       {
1089       case CHARACTER:
1090         if (dfa->nodes[node].opr.c >= 0x80)
1091           mb_chars = 1;
1092         break;
1093       case ANCHOR:
1094         switch (dfa->nodes[node].opr.ctx_type)
1095           {
1096           case LINE_FIRST:
1097           case LINE_LAST:
1098           case BUF_FIRST:
1099           case BUF_LAST:
1100             break;
1101           default:
1102             /* Word anchors etc. cannot be handled.  It's okay to test
1103                opr.ctx_type since constraints (for all DFA nodes) are
1104                created by ORing one or more opr.ctx_type values.  */
1105             return;
1106           }
1107         break;
1108       case OP_PERIOD:
1109         has_period = 1;
1110         break;
1111       case OP_BACK_REF:
1112       case OP_ALT:
1113       case END_OF_RE:
1114       case OP_DUP_ASTERISK:
1115       case OP_OPEN_SUBEXP:
1116       case OP_CLOSE_SUBEXP:
1117         break;
1118       case COMPLEX_BRACKET:
1119         return;
1120       case SIMPLE_BRACKET:
1121         /* Just double check.  The non-ASCII range starts at 0x80.  */
1122         assert (0x80 % BITSET_WORD_BITS == 0);
1123         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1124           if (dfa->nodes[node].opr.sbcset[i])
1125             return;
1126         break;
1127       default:
1128         abort ();
1129       }
1130
1131   if (mb_chars || has_period)
1132     for (node = 0; node < dfa->nodes_len; ++node)
1133       {
1134         if (dfa->nodes[node].type == CHARACTER
1135             && dfa->nodes[node].opr.c >= 0x80)
1136           dfa->nodes[node].mb_partial = 0;
1137         else if (dfa->nodes[node].type == OP_PERIOD)
1138           dfa->nodes[node].type = OP_UTF8_PERIOD;
1139       }
1140
1141   /* The search can be in single byte locale.  */
1142   dfa->mb_cur_max = 1;
1143   dfa->is_utf8 = 0;
1144   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1145 }
1146 #endif
1147 \f
1148 /* Analyze the structure tree, and calculate "first", "next", "edest",
1149    "eclosure", and "inveclosure".  */
1150
1151 static reg_errcode_t
1152 analyze (regex_t *preg)
1153 {
1154   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1155   reg_errcode_t ret;
1156
1157   /* Allocate arrays.  */
1158   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1159   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1160   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1161   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1162   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1163           || dfa->eclosures == NULL, 0))
1164     return REG_ESPACE;
1165
1166   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1167   if (dfa->subexp_map != NULL)
1168     {
1169       int i;
1170       for (i = 0; i < preg->re_nsub; i++)
1171         dfa->subexp_map[i] = i;
1172       preorder (dfa->str_tree, optimize_subexps, dfa);
1173       for (i = 0; i < preg->re_nsub; i++)
1174         if (dfa->subexp_map[i] != i)
1175           break;
1176       if (i == preg->re_nsub)
1177         {
1178           free (dfa->subexp_map);
1179           dfa->subexp_map = NULL;
1180         }
1181     }
1182
1183   ret = postorder (dfa->str_tree, lower_subexps, preg);
1184   if (BE (ret != REG_NOERROR, 0))
1185     return ret;
1186   ret = postorder (dfa->str_tree, calc_first, dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189   preorder (dfa->str_tree, calc_next, dfa);
1190   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1191   if (BE (ret != REG_NOERROR, 0))
1192     return ret;
1193   ret = calc_eclosure (dfa);
1194   if (BE (ret != REG_NOERROR, 0))
1195     return ret;
1196
1197   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1198      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1199   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1200       || dfa->nbackref)
1201     {
1202       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1203       if (BE (dfa->inveclosures == NULL, 0))
1204         return REG_ESPACE;
1205       ret = calc_inveclosure (dfa);
1206     }
1207
1208   return ret;
1209 }
1210
1211 /* Our parse trees are very unbalanced, so we cannot use a stack to
1212    implement parse tree visits.  Instead, we use parent pointers and
1213    some hairy code in these two functions.  */
1214 static reg_errcode_t
1215 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1216            void *extra)
1217 {
1218   bin_tree_t *node, *prev;
1219
1220   for (node = root; ; )
1221     {
1222       /* Descend down the tree, preferably to the left (or to the right
1223          if that's the only child).  */
1224       while (node->left || node->right)
1225         if (node->left)
1226           node = node->left;
1227         else
1228           node = node->right;
1229
1230       do
1231         {
1232           reg_errcode_t err = fn (extra, node);
1233           if (BE (err != REG_NOERROR, 0))
1234             return err;
1235           if (node->parent == NULL)
1236             return REG_NOERROR;
1237           prev = node;
1238           node = node->parent;
1239         }
1240       /* Go up while we have a node that is reached from the right.  */
1241       while (node->right == prev || node->right == NULL);
1242       node = node->right;
1243     }
1244 }
1245
1246 static reg_errcode_t
1247 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248           void *extra)
1249 {
1250   bin_tree_t *node;
1251
1252   for (node = root; ; )
1253     {
1254       reg_errcode_t err = fn (extra, node);
1255       if (BE (err != REG_NOERROR, 0))
1256         return err;
1257
1258       /* Go to the left node, or up and to the right.  */
1259       if (node->left)
1260         node = node->left;
1261       else
1262         {
1263           bin_tree_t *prev = NULL;
1264           while (node->right == prev || node->right == NULL)
1265             {
1266               prev = node;
1267               node = node->parent;
1268               if (!node)
1269                 return REG_NOERROR;
1270             }
1271           node = node->right;
1272         }
1273     }
1274 }
1275
1276 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1277    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1278    backreferences as well.  Requires a preorder visit.  */
1279 static reg_errcode_t
1280 optimize_subexps (void *extra, bin_tree_t *node)
1281 {
1282   re_dfa_t *dfa = (re_dfa_t *) extra;
1283
1284   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1285     {
1286       int idx = node->token.opr.idx;
1287       node->token.opr.idx = dfa->subexp_map[idx];
1288       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1289     }
1290
1291   else if (node->token.type == SUBEXP
1292            && node->left && node->left->token.type == SUBEXP)
1293     {
1294       int other_idx = node->left->token.opr.idx;
1295
1296       node->left = node->left->left;
1297       if (node->left)
1298         node->left->parent = node;
1299
1300       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1301       if (other_idx < BITSET_WORD_BITS)
1302           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1303     }
1304
1305   return REG_NOERROR;
1306 }
1307
1308 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1309    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1310 static reg_errcode_t
1311 lower_subexps (void *extra, bin_tree_t *node)
1312 {
1313   regex_t *preg = (regex_t *) extra;
1314   reg_errcode_t err = REG_NOERROR;
1315
1316   if (node->left && node->left->token.type == SUBEXP)
1317     {
1318       node->left = lower_subexp (&err, preg, node->left);
1319       if (node->left)
1320         node->left->parent = node;
1321     }
1322   if (node->right && node->right->token.type == SUBEXP)
1323     {
1324       node->right = lower_subexp (&err, preg, node->right);
1325       if (node->right)
1326         node->right->parent = node;
1327     }
1328
1329   return err;
1330 }
1331
1332 static bin_tree_t *
1333 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1334 {
1335   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1336   bin_tree_t *body = node->left;
1337   bin_tree_t *op, *cls, *tree1, *tree;
1338
1339   if (preg->no_sub
1340       /* We do not optimize empty subexpressions, because otherwise we may
1341          have bad CONCAT nodes with NULL children.  This is obviously not
1342          very common, so we do not lose much.  An example that triggers
1343          this case is the sed "script" /\(\)/x.  */
1344       && node->left != NULL
1345       && (node->token.opr.idx >= BITSET_WORD_BITS
1346           || !(dfa->used_bkref_map
1347                & ((bitset_word_t) 1 << node->token.opr.idx))))
1348     return node->left;
1349
1350   /* Convert the SUBEXP node to the concatenation of an
1351      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1352   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1353   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1354   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1355   tree = create_tree (dfa, op, tree1, CONCAT);
1356   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1357     {
1358       *err = REG_ESPACE;
1359       return NULL;
1360     }
1361
1362   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1363   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1364   return tree;
1365 }
1366
1367 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1368    nodes.  Requires a postorder visit.  */
1369 static reg_errcode_t
1370 calc_first (void *extra, bin_tree_t *node)
1371 {
1372   re_dfa_t *dfa = (re_dfa_t *) extra;
1373   if (node->token.type == CONCAT)
1374     {
1375       node->first = node->left->first;
1376       node->node_idx = node->left->node_idx;
1377     }
1378   else
1379     {
1380       node->first = node;
1381       node->node_idx = re_dfa_add_node (dfa, node->token);
1382       if (BE (node->node_idx == -1, 0))
1383         return REG_ESPACE;
1384       if (node->token.type == ANCHOR)
1385         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1386     }
1387   return REG_NOERROR;
1388 }
1389
1390 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1391 static reg_errcode_t
1392 calc_next (void *extra, bin_tree_t *node)
1393 {
1394   switch (node->token.type)
1395     {
1396     case OP_DUP_ASTERISK:
1397       node->left->next = node;
1398       break;
1399     case CONCAT:
1400       node->left->next = node->right->first;
1401       node->right->next = node->next;
1402       break;
1403     default:
1404       if (node->left)
1405         node->left->next = node->next;
1406       if (node->right)
1407         node->right->next = node->next;
1408       break;
1409     }
1410   return REG_NOERROR;
1411 }
1412
1413 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1414 static reg_errcode_t
1415 link_nfa_nodes (void *extra, bin_tree_t *node)
1416 {
1417   re_dfa_t *dfa = (re_dfa_t *) extra;
1418   int idx = node->node_idx;
1419   reg_errcode_t err = REG_NOERROR;
1420
1421   switch (node->token.type)
1422     {
1423     case CONCAT:
1424       break;
1425
1426     case END_OF_RE:
1427       assert (node->next == NULL);
1428       break;
1429
1430     case OP_DUP_ASTERISK:
1431     case OP_ALT:
1432       {
1433         int left, right;
1434         dfa->has_plural_match = 1;
1435         if (node->left != NULL)
1436           left = node->left->first->node_idx;
1437         else
1438           left = node->next->node_idx;
1439         if (node->right != NULL)
1440           right = node->right->first->node_idx;
1441         else
1442           right = node->next->node_idx;
1443         assert (left > -1);
1444         assert (right > -1);
1445         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1446       }
1447       break;
1448
1449     case ANCHOR:
1450     case OP_OPEN_SUBEXP:
1451     case OP_CLOSE_SUBEXP:
1452       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1453       break;
1454
1455     case OP_BACK_REF:
1456       dfa->nexts[idx] = node->next->node_idx;
1457       if (node->token.type == OP_BACK_REF)
1458         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1459       break;
1460
1461     default:
1462       assert (!IS_EPSILON_NODE (node->token.type));
1463       dfa->nexts[idx] = node->next->node_idx;
1464       break;
1465     }
1466
1467   return err;
1468 }
1469
1470 /* Duplicate the epsilon closure of the node ROOT_NODE.
1471    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1472    to their own constraint.  */
1473
1474 static reg_errcode_t
1475 internal_function
1476 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1477                         int root_node, unsigned int init_constraint)
1478 {
1479   int org_node, clone_node, ret;
1480   unsigned int constraint = init_constraint;
1481   for (org_node = top_org_node, clone_node = top_clone_node;;)
1482     {
1483       int org_dest, clone_dest;
1484       if (dfa->nodes[org_node].type == OP_BACK_REF)
1485         {
1486           /* If the back reference epsilon-transit, its destination must
1487              also have the constraint.  Then duplicate the epsilon closure
1488              of the destination of the back reference, and store it in
1489              edests of the back reference.  */
1490           org_dest = dfa->nexts[org_node];
1491           re_node_set_empty (dfa->edests + clone_node);
1492           clone_dest = duplicate_node (dfa, org_dest, constraint);
1493           if (BE (clone_dest == -1, 0))
1494             return REG_ESPACE;
1495           dfa->nexts[clone_node] = dfa->nexts[org_node];
1496           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497           if (BE (ret < 0, 0))
1498             return REG_ESPACE;
1499         }
1500       else if (dfa->edests[org_node].nelem == 0)
1501         {
1502           /* In case of the node can't epsilon-transit, don't duplicate the
1503              destination and store the original destination as the
1504              destination of the node.  */
1505           dfa->nexts[clone_node] = dfa->nexts[org_node];
1506           break;
1507         }
1508       else if (dfa->edests[org_node].nelem == 1)
1509         {
1510           /* In case of the node can epsilon-transit, and it has only one
1511              destination.  */
1512           org_dest = dfa->edests[org_node].elems[0];
1513           re_node_set_empty (dfa->edests + clone_node);
1514           /* If the node is root_node itself, it means the epsilon closure
1515              has a loop.   Then tie it to the destination of the root_node.  */
1516           if (org_node == root_node && clone_node != org_node)
1517             {
1518               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1519               if (BE (ret < 0, 0))
1520                 return REG_ESPACE;
1521               break;
1522             }
1523           /* In case the node has another constraint, append it.  */
1524           constraint |= dfa->nodes[org_node].constraint;
1525           clone_dest = duplicate_node (dfa, org_dest, constraint);
1526           if (BE (clone_dest == -1, 0))
1527             return REG_ESPACE;
1528           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529           if (BE (ret < 0, 0))
1530             return REG_ESPACE;
1531         }
1532       else /* dfa->edests[org_node].nelem == 2 */
1533         {
1534           /* In case of the node can epsilon-transit, and it has two
1535              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1536           org_dest = dfa->edests[org_node].elems[0];
1537           re_node_set_empty (dfa->edests + clone_node);
1538           /* Search for a duplicated node which satisfies the constraint.  */
1539           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1540           if (clone_dest == -1)
1541             {
1542               /* There is no such duplicated node, create a new one.  */
1543               reg_errcode_t err;
1544               clone_dest = duplicate_node (dfa, org_dest, constraint);
1545               if (BE (clone_dest == -1, 0))
1546                 return REG_ESPACE;
1547               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548               if (BE (ret < 0, 0))
1549                 return REG_ESPACE;
1550               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1551                                             root_node, constraint);
1552               if (BE (err != REG_NOERROR, 0))
1553                 return err;
1554             }
1555           else
1556             {
1557               /* There is a duplicated node which satisfies the constraint,
1558                  use it to avoid infinite loop.  */
1559               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560               if (BE (ret < 0, 0))
1561                 return REG_ESPACE;
1562             }
1563
1564           org_dest = dfa->edests[org_node].elems[1];
1565           clone_dest = duplicate_node (dfa, org_dest, constraint);
1566           if (BE (clone_dest == -1, 0))
1567             return REG_ESPACE;
1568           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1569           if (BE (ret < 0, 0))
1570             return REG_ESPACE;
1571         }
1572       org_node = org_dest;
1573       clone_node = clone_dest;
1574     }
1575   return REG_NOERROR;
1576 }
1577
1578 /* Search for a node which is duplicated from the node ORG_NODE, and
1579    satisfies the constraint CONSTRAINT.  */
1580
1581 static int
1582 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1583                         unsigned int constraint)
1584 {
1585   int idx;
1586   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1587     {
1588       if (org_node == dfa->org_indices[idx]
1589           && constraint == dfa->nodes[idx].constraint)
1590         return idx; /* Found.  */
1591     }
1592   return -1; /* Not found.  */
1593 }
1594
1595 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1596    Return the index of the new node, or -1 if insufficient storage is
1597    available.  */
1598
1599 static int
1600 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1601 {
1602   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1603   if (BE (dup_idx != -1, 1))
1604     {
1605       dfa->nodes[dup_idx].constraint = constraint;
1606       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1607       dfa->nodes[dup_idx].duplicated = 1;
1608
1609       /* Store the index of the original node.  */
1610       dfa->org_indices[dup_idx] = org_idx;
1611     }
1612   return dup_idx;
1613 }
1614
1615 static reg_errcode_t
1616 calc_inveclosure (re_dfa_t *dfa)
1617 {
1618   int src, idx, ret;
1619   for (idx = 0; idx < dfa->nodes_len; ++idx)
1620     re_node_set_init_empty (dfa->inveclosures + idx);
1621
1622   for (src = 0; src < dfa->nodes_len; ++src)
1623     {
1624       int *elems = dfa->eclosures[src].elems;
1625       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1626         {
1627           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1628           if (BE (ret == -1, 0))
1629             return REG_ESPACE;
1630         }
1631     }
1632
1633   return REG_NOERROR;
1634 }
1635
1636 /* Calculate "eclosure" for all the node in DFA.  */
1637
1638 static reg_errcode_t
1639 calc_eclosure (re_dfa_t *dfa)
1640 {
1641   int node_idx, incomplete;
1642 #ifdef DEBUG
1643   assert (dfa->nodes_len > 0);
1644 #endif
1645   incomplete = 0;
1646   /* For each nodes, calculate epsilon closure.  */
1647   for (node_idx = 0; ; ++node_idx)
1648     {
1649       reg_errcode_t err;
1650       re_node_set eclosure_elem;
1651       if (node_idx == dfa->nodes_len)
1652         {
1653           if (!incomplete)
1654             break;
1655           incomplete = 0;
1656           node_idx = 0;
1657         }
1658
1659 #ifdef DEBUG
1660       assert (dfa->eclosures[node_idx].nelem != -1);
1661 #endif
1662
1663       /* If we have already calculated, skip it.  */
1664       if (dfa->eclosures[node_idx].nelem != 0)
1665         continue;
1666       /* Calculate epsilon closure of 'node_idx'.  */
1667       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1668       if (BE (err != REG_NOERROR, 0))
1669         return err;
1670
1671       if (dfa->eclosures[node_idx].nelem == 0)
1672         {
1673           incomplete = 1;
1674           re_node_set_free (&eclosure_elem);
1675         }
1676     }
1677   return REG_NOERROR;
1678 }
1679
1680 /* Calculate epsilon closure of NODE.  */
1681
1682 static reg_errcode_t
1683 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1684 {
1685   reg_errcode_t err;
1686   int i;
1687   re_node_set eclosure;
1688   int ret;
1689   int incomplete = 0;
1690   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691   if (BE (err != REG_NOERROR, 0))
1692     return err;
1693
1694   /* This indicates that we are calculating this node now.
1695      We reference this value to avoid infinite loop.  */
1696   dfa->eclosures[node].nelem = -1;
1697
1698   /* If the current node has constraints, duplicate all nodes
1699      since they must inherit the constraints.  */
1700   if (dfa->nodes[node].constraint
1701       && dfa->edests[node].nelem
1702       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703     {
1704       err = duplicate_node_closure (dfa, node, node, node,
1705                                     dfa->nodes[node].constraint);
1706       if (BE (err != REG_NOERROR, 0))
1707         return err;
1708     }
1709
1710   /* Expand each epsilon destination nodes.  */
1711   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1712     for (i = 0; i < dfa->edests[node].nelem; ++i)
1713       {
1714         re_node_set eclosure_elem;
1715         int edest = dfa->edests[node].elems[i];
1716         /* If calculating the epsilon closure of `edest' is in progress,
1717            return intermediate result.  */
1718         if (dfa->eclosures[edest].nelem == -1)
1719           {
1720             incomplete = 1;
1721             continue;
1722           }
1723         /* If we haven't calculated the epsilon closure of `edest' yet,
1724            calculate now. Otherwise use calculated epsilon closure.  */
1725         if (dfa->eclosures[edest].nelem == 0)
1726           {
1727             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1728             if (BE (err != REG_NOERROR, 0))
1729               return err;
1730           }
1731         else
1732           eclosure_elem = dfa->eclosures[edest];
1733         /* Merge the epsilon closure of 'edest'.  */
1734         err = re_node_set_merge (&eclosure, &eclosure_elem);
1735         if (BE (err != REG_NOERROR, 0))
1736           return err;
1737         /* If the epsilon closure of 'edest' is incomplete,
1738            the epsilon closure of this node is also incomplete.  */
1739         if (dfa->eclosures[edest].nelem == 0)
1740           {
1741             incomplete = 1;
1742             re_node_set_free (&eclosure_elem);
1743           }
1744       }
1745
1746   /* An epsilon closure includes itself.  */
1747   ret = re_node_set_insert (&eclosure, node);
1748   if (BE (ret < 0, 0))
1749     return REG_ESPACE;
1750   if (incomplete && !root)
1751     dfa->eclosures[node].nelem = 0;
1752   else
1753     dfa->eclosures[node] = eclosure;
1754   *new_set = eclosure;
1755   return REG_NOERROR;
1756 }
1757 \f
1758 /* Functions for token which are used in the parser.  */
1759
1760 /* Fetch a token from INPUT.
1761    We must not use this function inside bracket expressions.  */
1762
1763 static void
1764 internal_function
1765 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766 {
1767   re_string_skip_bytes (input, peek_token (result, input, syntax));
1768 }
1769
1770 /* Peek a token from INPUT, and return the length of the token.
1771    We must not use this function inside bracket expressions.  */
1772
1773 static int
1774 internal_function
1775 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1776 {
1777   unsigned char c;
1778
1779   if (re_string_eoi (input))
1780     {
1781       token->type = END_OF_RE;
1782       return 0;
1783     }
1784
1785   c = re_string_peek_byte (input, 0);
1786   token->opr.c = c;
1787
1788   token->word_char = 0;
1789 #ifdef RE_ENABLE_I18N
1790   token->mb_partial = 0;
1791   if (input->mb_cur_max > 1 &&
1792       !re_string_first_byte (input, re_string_cur_idx (input)))
1793     {
1794       token->type = CHARACTER;
1795       token->mb_partial = 1;
1796       return 1;
1797     }
1798 #endif
1799   if (c == '\\')
1800     {
1801       unsigned char c2;
1802       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803         {
1804           token->type = BACK_SLASH;
1805           return 1;
1806         }
1807
1808       c2 = re_string_peek_byte_case (input, 1);
1809       token->opr.c = c2;
1810       token->type = CHARACTER;
1811 #ifdef RE_ENABLE_I18N
1812       if (input->mb_cur_max > 1)
1813         {
1814           wint_t wc = re_string_wchar_at (input,
1815                                           re_string_cur_idx (input) + 1);
1816           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817         }
1818       else
1819 #endif
1820         token->word_char = IS_WORD_CHAR (c2) != 0;
1821
1822       switch (c2)
1823         {
1824         case '|':
1825           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1826             token->type = OP_ALT;
1827           break;
1828         case '1': case '2': case '3': case '4': case '5':
1829         case '6': case '7': case '8': case '9':
1830           if (!(syntax & RE_NO_BK_REFS))
1831             {
1832               token->type = OP_BACK_REF;
1833               token->opr.idx = c2 - '1';
1834             }
1835           break;
1836         case '<':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             {
1839               token->type = ANCHOR;
1840               token->opr.ctx_type = WORD_FIRST;
1841             }
1842           break;
1843         case '>':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = WORD_LAST;
1848             }
1849           break;
1850         case 'b':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = WORD_DELIM;
1855             }
1856           break;
1857         case 'B':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = NOT_WORD_DELIM;
1862             }
1863           break;
1864         case 'w':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             token->type = OP_WORD;
1867           break;
1868         case 'W':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             token->type = OP_NOTWORD;
1871           break;
1872         case 's':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             token->type = OP_SPACE;
1875           break;
1876         case 'S':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             token->type = OP_NOTSPACE;
1879           break;
1880         case '`':
1881           if (!(syntax & RE_NO_GNU_OPS))
1882             {
1883               token->type = ANCHOR;
1884               token->opr.ctx_type = BUF_FIRST;
1885             }
1886           break;
1887         case '\'':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             {
1890               token->type = ANCHOR;
1891               token->opr.ctx_type = BUF_LAST;
1892             }
1893           break;
1894         case '(':
1895           if (!(syntax & RE_NO_BK_PARENS))
1896             token->type = OP_OPEN_SUBEXP;
1897           break;
1898         case ')':
1899           if (!(syntax & RE_NO_BK_PARENS))
1900             token->type = OP_CLOSE_SUBEXP;
1901           break;
1902         case '+':
1903           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904             token->type = OP_DUP_PLUS;
1905           break;
1906         case '?':
1907           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908             token->type = OP_DUP_QUESTION;
1909           break;
1910         case '{':
1911           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912             token->type = OP_OPEN_DUP_NUM;
1913           break;
1914         case '}':
1915           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916             token->type = OP_CLOSE_DUP_NUM;
1917           break;
1918         default:
1919           break;
1920         }
1921       return 2;
1922     }
1923
1924   token->type = CHARACTER;
1925 #ifdef RE_ENABLE_I18N
1926   if (input->mb_cur_max > 1)
1927     {
1928       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1929       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930     }
1931   else
1932 #endif
1933     token->word_char = IS_WORD_CHAR (token->opr.c);
1934
1935   switch (c)
1936     {
1937     case '\n':
1938       if (syntax & RE_NEWLINE_ALT)
1939         token->type = OP_ALT;
1940       break;
1941     case '|':
1942       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1943         token->type = OP_ALT;
1944       break;
1945     case '*':
1946       token->type = OP_DUP_ASTERISK;
1947       break;
1948     case '+':
1949       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950         token->type = OP_DUP_PLUS;
1951       break;
1952     case '?':
1953       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954         token->type = OP_DUP_QUESTION;
1955       break;
1956     case '{':
1957       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958         token->type = OP_OPEN_DUP_NUM;
1959       break;
1960     case '}':
1961       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962         token->type = OP_CLOSE_DUP_NUM;
1963       break;
1964     case '(':
1965       if (syntax & RE_NO_BK_PARENS)
1966         token->type = OP_OPEN_SUBEXP;
1967       break;
1968     case ')':
1969       if (syntax & RE_NO_BK_PARENS)
1970         token->type = OP_CLOSE_SUBEXP;
1971       break;
1972     case '[':
1973       token->type = OP_OPEN_BRACKET;
1974       break;
1975     case '.':
1976       token->type = OP_PERIOD;
1977       break;
1978     case '^':
1979       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1980           re_string_cur_idx (input) != 0)
1981         {
1982           char prev = re_string_peek_byte (input, -1);
1983           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1984             break;
1985         }
1986       token->type = ANCHOR;
1987       token->opr.ctx_type = LINE_FIRST;
1988       break;
1989     case '$':
1990       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1991           re_string_cur_idx (input) + 1 != re_string_length (input))
1992         {
1993           re_token_t next;
1994           re_string_skip_bytes (input, 1);
1995           peek_token (&next, input, syntax);
1996           re_string_skip_bytes (input, -1);
1997           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998             break;
1999         }
2000       token->type = ANCHOR;
2001       token->opr.ctx_type = LINE_LAST;
2002       break;
2003     default:
2004       break;
2005     }
2006   return 1;
2007 }
2008
2009 /* Peek a token from INPUT, and return the length of the token.
2010    We must not use this function out of bracket expressions.  */
2011
2012 static int
2013 internal_function
2014 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2015 {
2016   unsigned char c;
2017   if (re_string_eoi (input))
2018     {
2019       token->type = END_OF_RE;
2020       return 0;
2021     }
2022   c = re_string_peek_byte (input, 0);
2023   token->opr.c = c;
2024
2025 #ifdef RE_ENABLE_I18N
2026   if (input->mb_cur_max > 1 &&
2027       !re_string_first_byte (input, re_string_cur_idx (input)))
2028     {
2029       token->type = CHARACTER;
2030       return 1;
2031     }
2032 #endif /* RE_ENABLE_I18N */
2033
2034   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2035       && re_string_cur_idx (input) + 1 < re_string_length (input))
2036     {
2037       /* In this case, '\' escape a character.  */
2038       unsigned char c2;
2039       re_string_skip_bytes (input, 1);
2040       c2 = re_string_peek_byte (input, 0);
2041       token->opr.c = c2;
2042       token->type = CHARACTER;
2043       return 1;
2044     }
2045   if (c == '[') /* '[' is a special char in a bracket exps.  */
2046     {
2047       unsigned char c2;
2048       int token_len;
2049       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2050         c2 = re_string_peek_byte (input, 1);
2051       else
2052         c2 = 0;
2053       token->opr.c = c2;
2054       token_len = 2;
2055       switch (c2)
2056         {
2057         case '.':
2058           token->type = OP_OPEN_COLL_ELEM;
2059           break;
2060         case '=':
2061           token->type = OP_OPEN_EQUIV_CLASS;
2062           break;
2063         case ':':
2064           if (syntax & RE_CHAR_CLASSES)
2065             {
2066               token->type = OP_OPEN_CHAR_CLASS;
2067               break;
2068             }
2069           /* else fall through.  */
2070         default:
2071           token->type = CHARACTER;
2072           token->opr.c = c;
2073           token_len = 1;
2074           break;
2075         }
2076       return token_len;
2077     }
2078   switch (c)
2079     {
2080     case '-':
2081       token->type = OP_CHARSET_RANGE;
2082       break;
2083     case ']':
2084       token->type = OP_CLOSE_BRACKET;
2085       break;
2086     case '^':
2087       token->type = OP_NON_MATCH_LIST;
2088       break;
2089     default:
2090       token->type = CHARACTER;
2091     }
2092   return 1;
2093 }
2094 \f
2095 /* Functions for parser.  */
2096
2097 /* Entry point of the parser.
2098    Parse the regular expression REGEXP and return the structure tree.
2099    If an error occurs, ERR is set by error code, and return NULL.
2100    This function build the following tree, from regular expression <reg_exp>:
2101            CAT
2102            / \
2103           /   \
2104    <reg_exp>  EOR
2105
2106    CAT means concatenation.
2107    EOR means end of regular expression.  */
2108
2109 static bin_tree_t *
2110 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2111        reg_errcode_t *err)
2112 {
2113   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2114   bin_tree_t *tree, *eor, *root;
2115   re_token_t current_token;
2116   dfa->syntax = syntax;
2117   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2118   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2119   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120     return NULL;
2121   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122   if (tree != NULL)
2123     root = create_tree (dfa, tree, eor, CONCAT);
2124   else
2125     root = eor;
2126   if (BE (eor == NULL || root == NULL, 0))
2127     {
2128       *err = REG_ESPACE;
2129       return NULL;
2130     }
2131   return root;
2132 }
2133
2134 /* This function build the following tree, from regular expression
2135    <branch1>|<branch2>:
2136            ALT
2137            / \
2138           /   \
2139    <branch1> <branch2>
2140
2141    ALT means alternative, which represents the operator '|'.  */
2142
2143 static bin_tree_t *
2144 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2145                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2146 {
2147   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2148   bin_tree_t *tree, *branch = NULL;
2149   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2150   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2151     return NULL;
2152
2153   while (token->type == OP_ALT)
2154     {
2155       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2156       if (token->type != OP_ALT && token->type != END_OF_RE
2157           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2158         {
2159           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2160           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2161             {
2162               if (tree != NULL)
2163                 postorder (tree, free_tree, NULL);
2164               return NULL;
2165             }
2166         }
2167       else
2168         branch = NULL;
2169       tree = create_tree (dfa, tree, branch, OP_ALT);
2170       if (BE (tree == NULL, 0))
2171         {
2172           *err = REG_ESPACE;
2173           return NULL;
2174         }
2175     }
2176   return tree;
2177 }
2178
2179 /* This function build the following tree, from regular expression
2180    <exp1><exp2>:
2181         CAT
2182         / \
2183        /   \
2184    <exp1> <exp2>
2185
2186    CAT means concatenation.  */
2187
2188 static bin_tree_t *
2189 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2191 {
2192   bin_tree_t *tree, *exp;
2193   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196     return NULL;
2197
2198   while (token->type != OP_ALT && token->type != END_OF_RE
2199          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200     {
2201       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2203         {
2204           if (tree != NULL)
2205             postorder (tree, free_tree, NULL);
2206           return NULL;
2207         }
2208       if (tree != NULL && exp != NULL)
2209         {
2210           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2211           if (newtree == NULL)
2212             {
2213               postorder (exp, free_tree, NULL);
2214               postorder (tree, free_tree, NULL);
2215               *err = REG_ESPACE;
2216               return NULL;
2217             }
2218           tree = newtree;
2219         }
2220       else if (tree == NULL)
2221         tree = exp;
2222       /* Otherwise exp == NULL, we don't need to create new tree.  */
2223     }
2224   return tree;
2225 }
2226
2227 /* This function build the following tree, from regular expression a*:
2228          *
2229          |
2230          a
2231 */
2232
2233 static bin_tree_t *
2234 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2235                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2236 {
2237   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2238   bin_tree_t *tree;
2239   switch (token->type)
2240     {
2241     case CHARACTER:
2242       tree = create_token_tree (dfa, NULL, NULL, token);
2243       if (BE (tree == NULL, 0))
2244         {
2245           *err = REG_ESPACE;
2246           return NULL;
2247         }
2248 #ifdef RE_ENABLE_I18N
2249       if (dfa->mb_cur_max > 1)
2250         {
2251           while (!re_string_eoi (regexp)
2252                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2253             {
2254               bin_tree_t *mbc_remain;
2255               fetch_token (token, regexp, syntax);
2256               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2257               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2258               if (BE (mbc_remain == NULL || tree == NULL, 0))
2259                 {
2260                   *err = REG_ESPACE;
2261                   return NULL;
2262                 }
2263             }
2264         }
2265 #endif
2266       break;
2267     case OP_OPEN_SUBEXP:
2268       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2269       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270         return NULL;
2271       break;
2272     case OP_OPEN_BRACKET:
2273       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2274       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2275         return NULL;
2276       break;
2277     case OP_BACK_REF:
2278       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2279         {
2280           *err = REG_ESUBREG;
2281           return NULL;
2282         }
2283       dfa->used_bkref_map |= 1 << token->opr.idx;
2284       tree = create_token_tree (dfa, NULL, NULL, token);
2285       if (BE (tree == NULL, 0))
2286         {
2287           *err = REG_ESPACE;
2288           return NULL;
2289         }
2290       ++dfa->nbackref;
2291       dfa->has_mb_node = 1;
2292       break;
2293     case OP_OPEN_DUP_NUM:
2294       if (syntax & RE_CONTEXT_INVALID_DUP)
2295         {
2296           *err = REG_BADRPT;
2297           return NULL;
2298         }
2299       /* FALLTHROUGH */
2300     case OP_DUP_ASTERISK:
2301     case OP_DUP_PLUS:
2302     case OP_DUP_QUESTION:
2303       if (syntax & RE_CONTEXT_INVALID_OPS)
2304         {
2305           *err = REG_BADRPT;
2306           return NULL;
2307         }
2308       else if (syntax & RE_CONTEXT_INDEP_OPS)
2309         {
2310           fetch_token (token, regexp, syntax);
2311           return parse_expression (regexp, preg, token, syntax, nest, err);
2312         }
2313       /* else fall through  */
2314     case OP_CLOSE_SUBEXP:
2315       if ((token->type == OP_CLOSE_SUBEXP) &&
2316           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2317         {
2318           *err = REG_ERPAREN;
2319           return NULL;
2320         }
2321       /* else fall through  */
2322     case OP_CLOSE_DUP_NUM:
2323       /* We treat it as a normal character.  */
2324
2325       /* Then we can these characters as normal characters.  */
2326       token->type = CHARACTER;
2327       /* mb_partial and word_char bits should be initialized already
2328          by peek_token.  */
2329       tree = create_token_tree (dfa, NULL, NULL, token);
2330       if (BE (tree == NULL, 0))
2331         {
2332           *err = REG_ESPACE;
2333           return NULL;
2334         }
2335       break;
2336     case ANCHOR:
2337       if ((token->opr.ctx_type
2338            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2339           && dfa->word_ops_used == 0)
2340         init_word_char (dfa);
2341       if (token->opr.ctx_type == WORD_DELIM
2342           || token->opr.ctx_type == NOT_WORD_DELIM)
2343         {
2344           bin_tree_t *tree_first, *tree_last;
2345           if (token->opr.ctx_type == WORD_DELIM)
2346             {
2347               token->opr.ctx_type = WORD_FIRST;
2348               tree_first = create_token_tree (dfa, NULL, NULL, token);
2349               token->opr.ctx_type = WORD_LAST;
2350             }
2351           else
2352             {
2353               token->opr.ctx_type = INSIDE_WORD;
2354               tree_first = create_token_tree (dfa, NULL, NULL, token);
2355               token->opr.ctx_type = INSIDE_NOTWORD;
2356             }
2357           tree_last = create_token_tree (dfa, NULL, NULL, token);
2358           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2359           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2360             {
2361               *err = REG_ESPACE;
2362               return NULL;
2363             }
2364         }
2365       else
2366         {
2367           tree = create_token_tree (dfa, NULL, NULL, token);
2368           if (BE (tree == NULL, 0))
2369             {
2370               *err = REG_ESPACE;
2371               return NULL;
2372             }
2373         }
2374       /* We must return here, since ANCHORs can't be followed
2375          by repetition operators.
2376          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2377              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2378       fetch_token (token, regexp, syntax);
2379       return tree;
2380     case OP_PERIOD:
2381       tree = create_token_tree (dfa, NULL, NULL, token);
2382       if (BE (tree == NULL, 0))
2383         {
2384           *err = REG_ESPACE;
2385           return NULL;
2386         }
2387       if (dfa->mb_cur_max > 1)
2388         dfa->has_mb_node = 1;
2389       break;
2390     case OP_WORD:
2391     case OP_NOTWORD:
2392       tree = build_charclass_op (dfa, regexp->trans,
2393                                  (const unsigned char *) "alnum",
2394                                  (const unsigned char *) "_",
2395                                  token->type == OP_NOTWORD, err);
2396       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2397         return NULL;
2398       break;
2399     case OP_SPACE:
2400     case OP_NOTSPACE:
2401       tree = build_charclass_op (dfa, regexp->trans,
2402                                  (const unsigned char *) "space",
2403                                  (const unsigned char *) "",
2404                                  token->type == OP_NOTSPACE, err);
2405       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2406         return NULL;
2407       break;
2408     case OP_ALT:
2409     case END_OF_RE:
2410       return NULL;
2411     case BACK_SLASH:
2412       *err = REG_EESCAPE;
2413       return NULL;
2414     default:
2415       /* Must not happen?  */
2416 #ifdef DEBUG
2417       assert (0);
2418 #endif
2419       return NULL;
2420     }
2421   fetch_token (token, regexp, syntax);
2422
2423   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2424          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2425     {
2426       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2427       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2428         {
2429           if (tree != NULL)
2430             postorder (tree, free_tree, NULL);
2431           return NULL;
2432         }
2433       tree = dup_tree;
2434       /* In BRE consecutive duplications are not allowed.  */
2435       if ((syntax & RE_CONTEXT_INVALID_DUP)
2436           && (token->type == OP_DUP_ASTERISK
2437               || token->type == OP_OPEN_DUP_NUM))
2438         {
2439           if (tree != NULL)
2440             postorder (tree, free_tree, NULL);
2441           *err = REG_BADRPT;
2442           return NULL;
2443         }
2444     }
2445
2446   return tree;
2447 }
2448
2449 /* This function build the following tree, from regular expression
2450    (<reg_exp>):
2451          SUBEXP
2452             |
2453         <reg_exp>
2454 */
2455
2456 static bin_tree_t *
2457 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2458                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2459 {
2460   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2461   bin_tree_t *tree;
2462   size_t cur_nsub;
2463   cur_nsub = preg->re_nsub++;
2464
2465   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2466
2467   /* The subexpression may be a null string.  */
2468   if (token->type == OP_CLOSE_SUBEXP)
2469     tree = NULL;
2470   else
2471     {
2472       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2473       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2474         {
2475           if (tree != NULL)
2476             postorder (tree, free_tree, NULL);
2477           *err = REG_EPAREN;
2478         }
2479       if (BE (*err != REG_NOERROR, 0))
2480         return NULL;
2481     }
2482
2483   if (cur_nsub <= '9' - '1')
2484     dfa->completed_bkref_map |= 1 << cur_nsub;
2485
2486   tree = create_tree (dfa, tree, NULL, SUBEXP);
2487   if (BE (tree == NULL, 0))
2488     {
2489       *err = REG_ESPACE;
2490       return NULL;
2491     }
2492   tree->token.opr.idx = cur_nsub;
2493   return tree;
2494 }
2495
2496 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2497
2498 static bin_tree_t *
2499 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2500               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2501 {
2502   bin_tree_t *tree = NULL, *old_tree = NULL;
2503   int i, start, end, start_idx = re_string_cur_idx (regexp);
2504   re_token_t start_token = *token;
2505
2506   if (token->type == OP_OPEN_DUP_NUM)
2507     {
2508       end = 0;
2509       start = fetch_number (regexp, token, syntax);
2510       if (start == -1)
2511         {
2512           if (token->type == CHARACTER && token->opr.c == ',')
2513             start = 0; /* We treat "{,m}" as "{0,m}".  */
2514           else
2515             {
2516               *err = REG_BADBR; /* <re>{} is invalid.  */
2517               return NULL;
2518             }
2519         }
2520       if (BE (start != -2, 1))
2521         {
2522           /* We treat "{n}" as "{n,n}".  */
2523           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2524                  : ((token->type == CHARACTER && token->opr.c == ',')
2525                     ? fetch_number (regexp, token, syntax) : -2));
2526         }
2527       if (BE (start == -2 || end == -2, 0))
2528         {
2529           /* Invalid sequence.  */
2530           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2531             {
2532               if (token->type == END_OF_RE)
2533                 *err = REG_EBRACE;
2534               else
2535                 *err = REG_BADBR;
2536
2537               return NULL;
2538             }
2539
2540           /* If the syntax bit is set, rollback.  */
2541           re_string_set_index (regexp, start_idx);
2542           *token = start_token;
2543           token->type = CHARACTER;
2544           /* mb_partial and word_char bits should be already initialized by
2545              peek_token.  */
2546           return elem;
2547         }
2548
2549       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2550         {
2551           /* First number greater than second.  */
2552           *err = REG_BADBR;
2553           return NULL;
2554         }
2555     }
2556   else
2557     {
2558       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2559       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2560     }
2561
2562   fetch_token (token, regexp, syntax);
2563
2564   if (BE (elem == NULL, 0))
2565     return NULL;
2566   if (BE (start == 0 && end == 0, 0))
2567     {
2568       postorder (elem, free_tree, NULL);
2569       return NULL;
2570     }
2571
2572   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2573   if (BE (start > 0, 0))
2574     {
2575       tree = elem;
2576       for (i = 2; i <= start; ++i)
2577         {
2578           elem = duplicate_tree (elem, dfa);
2579           tree = create_tree (dfa, tree, elem, CONCAT);
2580           if (BE (elem == NULL || tree == NULL, 0))
2581             goto parse_dup_op_espace;
2582         }
2583
2584       if (start == end)
2585         return tree;
2586
2587       /* Duplicate ELEM before it is marked optional.  */
2588       elem = duplicate_tree (elem, dfa);
2589       if (BE (elem == NULL, 0))
2590         goto parse_dup_op_espace;
2591       old_tree = tree;
2592     }
2593   else
2594     old_tree = NULL;
2595
2596   if (elem->token.type == SUBEXP)
2597     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2598
2599   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2600   if (BE (tree == NULL, 0))
2601     goto parse_dup_op_espace;
2602
2603   /* This loop is actually executed only when end != -1,
2604      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2605      already created the start+1-th copy.  */
2606   for (i = start + 2; i <= end; ++i)
2607     {
2608       elem = duplicate_tree (elem, dfa);
2609       tree = create_tree (dfa, tree, elem, CONCAT);
2610       if (BE (elem == NULL || tree == NULL, 0))
2611         goto parse_dup_op_espace;
2612
2613       tree = create_tree (dfa, tree, NULL, OP_ALT);
2614       if (BE (tree == NULL, 0))
2615         goto parse_dup_op_espace;
2616     }
2617
2618   if (old_tree)
2619     tree = create_tree (dfa, old_tree, tree, CONCAT);
2620
2621   return tree;
2622
2623  parse_dup_op_espace:
2624   *err = REG_ESPACE;
2625   return NULL;
2626 }
2627
2628 /* Size of the names for collating symbol/equivalence_class/character_class.
2629    I'm not sure, but maybe enough.  */
2630 #define BRACKET_NAME_BUF_SIZE 32
2631
2632 #ifndef _LIBC
2633   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2634      Build the range expression which starts from START_ELEM, and ends
2635      at END_ELEM.  The result are written to MBCSET and SBCSET.
2636      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2637      mbcset->range_ends, is a pointer argument since we may
2638      update it.  */
2639
2640 static reg_errcode_t
2641 internal_function
2642 # ifdef RE_ENABLE_I18N
2643 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2644                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2645 # else /* not RE_ENABLE_I18N */
2646 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2647                  bracket_elem_t *end_elem)
2648 # endif /* not RE_ENABLE_I18N */
2649 {
2650   unsigned int start_ch, end_ch;
2651   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2652   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2653           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2654           0))
2655     return REG_ERANGE;
2656
2657   /* We can handle no multi character collating elements without libc
2658      support.  */
2659   if (BE ((start_elem->type == COLL_SYM
2660            && strlen ((char *) start_elem->opr.name) > 1)
2661           || (end_elem->type == COLL_SYM
2662               && strlen ((char *) end_elem->opr.name) > 1), 0))
2663     return REG_ECOLLATE;
2664
2665 # ifdef RE_ENABLE_I18N
2666   {
2667     wchar_t wc;
2668     wint_t start_wc;
2669     wint_t end_wc;
2670     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2671
2672     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2673                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2674                    : 0));
2675     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2676               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2677                  : 0));
2678     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679                 ? __btowc (start_ch) : start_elem->opr.wch);
2680     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681               ? __btowc (end_ch) : end_elem->opr.wch);
2682     if (start_wc == WEOF || end_wc == WEOF)
2683       return REG_ECOLLATE;
2684     cmp_buf[0] = start_wc;
2685     cmp_buf[4] = end_wc;
2686     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687       return REG_ERANGE;
2688
2689     /* Got valid collation sequence values, add them as a new entry.
2690        However, for !_LIBC we have no collation elements: if the
2691        character set is single byte, the single byte character set
2692        that we build below suffices.  parse_bracket_exp passes
2693        no MBCSET if dfa->mb_cur_max == 1.  */
2694     if (mbcset)
2695       {
2696         /* Check the space of the arrays.  */
2697         if (BE (*range_alloc == mbcset->nranges, 0))
2698           {
2699             /* There is not enough space, need realloc.  */
2700             wchar_t *new_array_start, *new_array_end;
2701             int new_nranges;
2702
2703             /* +1 in case of mbcset->nranges is 0.  */
2704             new_nranges = 2 * mbcset->nranges + 1;
2705             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706                are NULL if *range_alloc == 0.  */
2707             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708                                           new_nranges);
2709             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710                                         new_nranges);
2711
2712             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713               return REG_ESPACE;
2714
2715             mbcset->range_starts = new_array_start;
2716             mbcset->range_ends = new_array_end;
2717             *range_alloc = new_nranges;
2718           }
2719
2720         mbcset->range_starts[mbcset->nranges] = start_wc;
2721         mbcset->range_ends[mbcset->nranges++] = end_wc;
2722       }
2723
2724     /* Build the table for single byte characters.  */
2725     for (wc = 0; wc < SBC_MAX; ++wc)
2726       {
2727         cmp_buf[2] = wc;
2728         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730           bitset_set (sbcset, wc);
2731       }
2732   }
2733 # else /* not RE_ENABLE_I18N */
2734   {
2735     unsigned int ch;
2736     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738                    : 0));
2739     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741                  : 0));
2742     if (start_ch > end_ch)
2743       return REG_ERANGE;
2744     /* Build the table for single byte characters.  */
2745     for (ch = 0; ch < SBC_MAX; ++ch)
2746       if (start_ch <= ch  && ch <= end_ch)
2747         bitset_set (sbcset, ch);
2748   }
2749 # endif /* not RE_ENABLE_I18N */
2750   return REG_NOERROR;
2751 }
2752 #endif /* not _LIBC */
2753
2754 #ifndef _LIBC
2755 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756    Build the collating element which is represented by NAME.
2757    The result are written to MBCSET and SBCSET.
2758    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759    pointer argument since we may update it.  */
2760
2761 static reg_errcode_t
2762 internal_function
2763 # ifdef RE_ENABLE_I18N
2764 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765                         int *coll_sym_alloc, const unsigned char *name)
2766 # else /* not RE_ENABLE_I18N */
2767 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768 # endif /* not RE_ENABLE_I18N */
2769 {
2770   size_t name_len = strlen ((const char *) name);
2771   if (BE (name_len != 1, 0))
2772     return REG_ECOLLATE;
2773   else
2774     {
2775       bitset_set (sbcset, name[0]);
2776       return REG_NOERROR;
2777     }
2778 }
2779 #endif /* not _LIBC */
2780
2781 /* This function parse bracket expression like "[abc]", "[a-c]",
2782    "[[.a-a.]]" etc.  */
2783
2784 static bin_tree_t *
2785 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786                    reg_syntax_t syntax, reg_errcode_t *err)
2787 {
2788 #ifdef _LIBC
2789   const unsigned char *collseqmb;
2790   const char *collseqwc;
2791   uint32_t nrules;
2792   int32_t table_size;
2793   const int32_t *symb_table;
2794   const unsigned char *extra;
2795
2796   /* Local function for parse_bracket_exp used in _LIBC environment.
2797      Seek the collating symbol entry corresponding to NAME.
2798      Return the index of the symbol in the SYMB_TABLE,
2799      or -1 if not found.  */
2800
2801   auto inline int32_t
2802   __attribute__ ((always_inline))
2803   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2804     {
2805       int32_t elem;
2806
2807       for (elem = 0; elem < table_size; elem++)
2808         if (symb_table[2 * elem] != 0)
2809           {
2810             int32_t idx = symb_table[2 * elem + 1];
2811             /* Skip the name of collating element name.  */
2812             idx += 1 + extra[idx];
2813             if (/* Compare the length of the name.  */
2814                 name_len == extra[idx]
2815                 /* Compare the name.  */
2816                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2817               /* Yep, this is the entry.  */
2818               return elem;
2819           }
2820       return -1;
2821     }
2822
2823   /* Local function for parse_bracket_exp used in _LIBC environment.
2824      Look up the collation sequence value of BR_ELEM.
2825      Return the value if succeeded, UINT_MAX otherwise.  */
2826
2827   auto inline unsigned int
2828   __attribute__ ((always_inline))
2829   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2830     {
2831       if (br_elem->type == SB_CHAR)
2832         {
2833           /*
2834           if (MB_CUR_MAX == 1)
2835           */
2836           if (nrules == 0)
2837             return collseqmb[br_elem->opr.ch];
2838           else
2839             {
2840               wint_t wc = __btowc (br_elem->opr.ch);
2841               return __collseq_table_lookup (collseqwc, wc);
2842             }
2843         }
2844       else if (br_elem->type == MB_CHAR)
2845         {
2846           if (nrules != 0)
2847             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2848         }
2849       else if (br_elem->type == COLL_SYM)
2850         {
2851           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2852           if (nrules != 0)
2853             {
2854               int32_t elem, idx;
2855               elem = seek_collating_symbol_entry (br_elem->opr.name,
2856                                                   sym_name_len);
2857               if (elem != -1)
2858                 {
2859                   /* We found the entry.  */
2860                   idx = symb_table[2 * elem + 1];
2861                   /* Skip the name of collating element name.  */
2862                   idx += 1 + extra[idx];
2863                   /* Skip the byte sequence of the collating element.  */
2864                   idx += 1 + extra[idx];
2865                   /* Adjust for the alignment.  */
2866                   idx = (idx + 3) & ~3;
2867                   /* Skip the multibyte collation sequence value.  */
2868                   idx += sizeof (unsigned int);
2869                   /* Skip the wide char sequence of the collating element.  */
2870                   idx += sizeof (unsigned int) *
2871                     (1 + *(unsigned int *) (extra + idx));
2872                   /* Return the collation sequence value.  */
2873                   return *(unsigned int *) (extra + idx);
2874                 }
2875               else if (sym_name_len == 1)
2876                 {
2877                   /* No valid character.  Match it as a single byte
2878                      character.  */
2879                   return collseqmb[br_elem->opr.name[0]];
2880                 }
2881             }
2882           else if (sym_name_len == 1)
2883             return collseqmb[br_elem->opr.name[0]];
2884         }
2885       return UINT_MAX;
2886     }
2887
2888   /* Local function for parse_bracket_exp used in _LIBC environment.
2889      Build the range expression which starts from START_ELEM, and ends
2890      at END_ELEM.  The result are written to MBCSET and SBCSET.
2891      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892      mbcset->range_ends, is a pointer argument since we may
2893      update it.  */
2894
2895   auto inline reg_errcode_t
2896   __attribute__ ((always_inline))
2897   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2898                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2899     {
2900       unsigned int ch;
2901       uint32_t start_collseq;
2902       uint32_t end_collseq;
2903
2904       /* Equivalence Classes and Character Classes can't be a range
2905          start/end.  */
2906       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2907               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2908               0))
2909         return REG_ERANGE;
2910
2911       start_collseq = lookup_collation_sequence_value (start_elem);
2912       end_collseq = lookup_collation_sequence_value (end_elem);
2913       /* Check start/end collation sequence values.  */
2914       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2915         return REG_ECOLLATE;
2916       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2917         return REG_ERANGE;
2918
2919       /* Got valid collation sequence values, add them as a new entry.
2920          However, if we have no collation elements, and the character set
2921          is single byte, the single byte character set that we
2922          build below suffices. */
2923       if (nrules > 0 || dfa->mb_cur_max > 1)
2924         {
2925           /* Check the space of the arrays.  */
2926           if (BE (*range_alloc == mbcset->nranges, 0))
2927             {
2928               /* There is not enough space, need realloc.  */
2929               uint32_t *new_array_start;
2930               uint32_t *new_array_end;
2931               int new_nranges;
2932
2933               /* +1 in case of mbcset->nranges is 0.  */
2934               new_nranges = 2 * mbcset->nranges + 1;
2935               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2936                                             new_nranges);
2937               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2938                                           new_nranges);
2939
2940               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2941                 return REG_ESPACE;
2942
2943               mbcset->range_starts = new_array_start;
2944               mbcset->range_ends = new_array_end;
2945               *range_alloc = new_nranges;
2946             }
2947
2948           mbcset->range_starts[mbcset->nranges] = start_collseq;
2949           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2950         }
2951
2952       /* Build the table for single byte characters.  */
2953       for (ch = 0; ch < SBC_MAX; ch++)
2954         {
2955           uint32_t ch_collseq;
2956           /*
2957           if (MB_CUR_MAX == 1)
2958           */
2959           if (nrules == 0)
2960             ch_collseq = collseqmb[ch];
2961           else
2962             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2963           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2964             bitset_set (sbcset, ch);
2965         }
2966       return REG_NOERROR;
2967     }
2968
2969   /* Local function for parse_bracket_exp used in _LIBC environment.
2970      Build the collating element which is represented by NAME.
2971      The result are written to MBCSET and SBCSET.
2972      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2973      pointer argument since we may update it.  */
2974
2975   auto inline reg_errcode_t
2976   __attribute__ ((always_inline))
2977   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2978                           int *coll_sym_alloc, const unsigned char *name)
2979     {
2980       int32_t elem, idx;
2981       size_t name_len = strlen ((const char *) name);
2982       if (nrules != 0)
2983         {
2984           elem = seek_collating_symbol_entry (name, name_len);
2985           if (elem != -1)
2986             {
2987               /* We found the entry.  */
2988               idx = symb_table[2 * elem + 1];
2989               /* Skip the name of collating element name.  */
2990               idx += 1 + extra[idx];
2991             }
2992           else if (name_len == 1)
2993             {
2994               /* No valid character, treat it as a normal
2995                  character.  */
2996               bitset_set (sbcset, name[0]);
2997               return REG_NOERROR;
2998             }
2999           else
3000             return REG_ECOLLATE;
3001
3002           /* Got valid collation sequence, add it as a new entry.  */
3003           /* Check the space of the arrays.  */
3004           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3005             {
3006               /* Not enough, realloc it.  */
3007               /* +1 in case of mbcset->ncoll_syms is 0.  */
3008               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3009               /* Use realloc since mbcset->coll_syms is NULL
3010                  if *alloc == 0.  */
3011               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3012                                                    new_coll_sym_alloc);
3013               if (BE (new_coll_syms == NULL, 0))
3014                 return REG_ESPACE;
3015               mbcset->coll_syms = new_coll_syms;
3016               *coll_sym_alloc = new_coll_sym_alloc;
3017             }
3018           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3019           return REG_NOERROR;
3020         }
3021       else
3022         {
3023           if (BE (name_len != 1, 0))
3024             return REG_ECOLLATE;
3025           else
3026             {
3027               bitset_set (sbcset, name[0]);
3028               return REG_NOERROR;
3029             }
3030         }
3031     }
3032 #endif
3033
3034   re_token_t br_token;
3035   re_bitset_ptr_t sbcset;
3036 #ifdef RE_ENABLE_I18N
3037   re_charset_t *mbcset;
3038   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3039   int equiv_class_alloc = 0, char_class_alloc = 0;
3040 #endif /* not RE_ENABLE_I18N */
3041   int non_match = 0;
3042   bin_tree_t *work_tree;
3043   int token_len;
3044   int first_round = 1;
3045 #ifdef _LIBC
3046   collseqmb = (const unsigned char *)
3047     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3048   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3049   if (nrules)
3050     {
3051       /*
3052       if (MB_CUR_MAX > 1)
3053       */
3054       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3055       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3056       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3057                                                   _NL_COLLATE_SYMB_TABLEMB);
3058       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3059                                                    _NL_COLLATE_SYMB_EXTRAMB);
3060     }
3061 #endif
3062   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3063 #ifdef RE_ENABLE_I18N
3064   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3065 #endif /* RE_ENABLE_I18N */
3066 #ifdef RE_ENABLE_I18N
3067   if (BE (sbcset == NULL || mbcset == NULL, 0))
3068 #else
3069   if (BE (sbcset == NULL, 0))
3070 #endif /* RE_ENABLE_I18N */
3071     {
3072       re_free (sbcset);
3073 #ifdef RE_ENABLE_I18N
3074       re_free (mbcset);
3075 #endif
3076       *err = REG_ESPACE;
3077       return NULL;
3078     }
3079
3080   token_len = peek_token_bracket (token, regexp, syntax);
3081   if (BE (token->type == END_OF_RE, 0))
3082     {
3083       *err = REG_BADPAT;
3084       goto parse_bracket_exp_free_return;
3085     }
3086   if (token->type == OP_NON_MATCH_LIST)
3087     {
3088 #ifdef RE_ENABLE_I18N
3089       mbcset->non_match = 1;
3090 #endif /* not RE_ENABLE_I18N */
3091       non_match = 1;
3092       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3093         bitset_set (sbcset, '\n');
3094       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3095       token_len = peek_token_bracket (token, regexp, syntax);
3096       if (BE (token->type == END_OF_RE, 0))
3097         {
3098           *err = REG_BADPAT;
3099           goto parse_bracket_exp_free_return;
3100         }
3101     }
3102
3103   /* We treat the first ']' as a normal character.  */
3104   if (token->type == OP_CLOSE_BRACKET)
3105     token->type = CHARACTER;
3106
3107   while (1)
3108     {
3109       bracket_elem_t start_elem, end_elem;
3110       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3111       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3112       reg_errcode_t ret;
3113       int token_len2 = 0, is_range_exp = 0;
3114       re_token_t token2;
3115
3116       start_elem.opr.name = start_name_buf;
3117       start_elem.type = COLL_SYM;
3118       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3119                                    syntax, first_round);
3120       if (BE (ret != REG_NOERROR, 0))
3121         {
3122           *err = ret;
3123           goto parse_bracket_exp_free_return;
3124         }
3125       first_round = 0;
3126
3127       /* Get information about the next token.  We need it in any case.  */
3128       token_len = peek_token_bracket (token, regexp, syntax);
3129
3130       /* Do not check for ranges if we know they are not allowed.  */
3131       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3132         {
3133           if (BE (token->type == END_OF_RE, 0))
3134             {
3135               *err = REG_EBRACK;
3136               goto parse_bracket_exp_free_return;
3137             }
3138           if (token->type == OP_CHARSET_RANGE)
3139             {
3140               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3141               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3142               if (BE (token2.type == END_OF_RE, 0))
3143                 {
3144                   *err = REG_EBRACK;
3145                   goto parse_bracket_exp_free_return;
3146                 }
3147               if (token2.type == OP_CLOSE_BRACKET)
3148                 {
3149                   /* We treat the last '-' as a normal character.  */
3150                   re_string_skip_bytes (regexp, -token_len);
3151                   token->type = CHARACTER;
3152                 }
3153               else
3154                 is_range_exp = 1;
3155             }
3156         }
3157
3158       if (is_range_exp == 1)
3159         {
3160           end_elem.opr.name = end_name_buf;
3161           end_elem.type = COLL_SYM;
3162           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3163                                        dfa, syntax, 1);
3164           if (BE (ret != REG_NOERROR, 0))
3165             {
3166               *err = ret;
3167               goto parse_bracket_exp_free_return;
3168             }
3169
3170           token_len = peek_token_bracket (token, regexp, syntax);
3171
3172 #ifdef _LIBC
3173           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3174                                   &start_elem, &end_elem);
3175 #else
3176 # ifdef RE_ENABLE_I18N
3177           *err = build_range_exp (sbcset,
3178                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3179                                   &range_alloc, &start_elem, &end_elem);
3180 # else
3181           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3182 # endif
3183 #endif /* RE_ENABLE_I18N */
3184           if (BE (*err != REG_NOERROR, 0))
3185             goto parse_bracket_exp_free_return;
3186         }
3187       else
3188         {
3189           switch (start_elem.type)
3190             {
3191             case SB_CHAR:
3192               bitset_set (sbcset, start_elem.opr.ch);
3193               break;
3194 #ifdef RE_ENABLE_I18N
3195             case MB_CHAR:
3196               /* Check whether the array has enough space.  */
3197               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3198                 {
3199                   wchar_t *new_mbchars;
3200                   /* Not enough, realloc it.  */
3201                   /* +1 in case of mbcset->nmbchars is 0.  */
3202                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3203                   /* Use realloc since array is NULL if *alloc == 0.  */
3204                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3205                                             mbchar_alloc);
3206                   if (BE (new_mbchars == NULL, 0))
3207                     goto parse_bracket_exp_espace;
3208                   mbcset->mbchars = new_mbchars;
3209                 }
3210               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3211               break;
3212 #endif /* RE_ENABLE_I18N */
3213             case EQUIV_CLASS:
3214               *err = build_equiv_class (sbcset,
3215 #ifdef RE_ENABLE_I18N
3216                                         mbcset, &equiv_class_alloc,
3217 #endif /* RE_ENABLE_I18N */
3218                                         start_elem.opr.name);
3219               if (BE (*err != REG_NOERROR, 0))
3220                 goto parse_bracket_exp_free_return;
3221               break;
3222             case COLL_SYM:
3223               *err = build_collating_symbol (sbcset,
3224 #ifdef RE_ENABLE_I18N
3225                                              mbcset, &coll_sym_alloc,
3226 #endif /* RE_ENABLE_I18N */
3227                                              start_elem.opr.name);
3228               if (BE (*err != REG_NOERROR, 0))
3229                 goto parse_bracket_exp_free_return;
3230               break;
3231             case CHAR_CLASS:
3232               *err = build_charclass (regexp->trans, sbcset,
3233 #ifdef RE_ENABLE_I18N
3234                                       mbcset, &char_class_alloc,
3235 #endif /* RE_ENABLE_I18N */
3236                                       start_elem.opr.name, syntax);
3237               if (BE (*err != REG_NOERROR, 0))
3238                goto parse_bracket_exp_free_return;
3239               break;
3240             default:
3241               assert (0);
3242               break;
3243             }
3244         }
3245       if (BE (token->type == END_OF_RE, 0))
3246         {
3247           *err = REG_EBRACK;
3248           goto parse_bracket_exp_free_return;
3249         }
3250       if (token->type == OP_CLOSE_BRACKET)
3251         break;
3252     }
3253
3254   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3255
3256   /* If it is non-matching list.  */
3257   if (non_match)
3258     bitset_not (sbcset);
3259
3260 #ifdef RE_ENABLE_I18N
3261   /* Ensure only single byte characters are set.  */
3262   if (dfa->mb_cur_max > 1)
3263     bitset_mask (sbcset, dfa->sb_char);
3264
3265   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3266       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3267                                                      || mbcset->non_match)))
3268     {
3269       bin_tree_t *mbc_tree;
3270       int sbc_idx;
3271       /* Build a tree for complex bracket.  */
3272       dfa->has_mb_node = 1;
3273       br_token.type = COMPLEX_BRACKET;
3274       br_token.opr.mbcset = mbcset;
3275       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3276       if (BE (mbc_tree == NULL, 0))
3277         goto parse_bracket_exp_espace;
3278       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3279         if (sbcset[sbc_idx])
3280           break;
3281       /* If there are no bits set in sbcset, there is no point
3282          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3283       if (sbc_idx < BITSET_WORDS)
3284         {
3285           /* Build a tree for simple bracket.  */
3286           br_token.type = SIMPLE_BRACKET;
3287           br_token.opr.sbcset = sbcset;
3288           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3289           if (BE (work_tree == NULL, 0))
3290             goto parse_bracket_exp_espace;
3291
3292           /* Then join them by ALT node.  */
3293           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3294           if (BE (work_tree == NULL, 0))
3295             goto parse_bracket_exp_espace;
3296         }
3297       else
3298         {
3299           re_free (sbcset);
3300           work_tree = mbc_tree;
3301         }
3302     }
3303   else
3304 #endif /* not RE_ENABLE_I18N */
3305     {
3306 #ifdef RE_ENABLE_I18N
3307       free_charset (mbcset);
3308 #endif
3309       /* Build a tree for simple bracket.  */
3310       br_token.type = SIMPLE_BRACKET;
3311       br_token.opr.sbcset = sbcset;
3312       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3313       if (BE (work_tree == NULL, 0))
3314         goto parse_bracket_exp_espace;
3315     }
3316   return work_tree;
3317
3318  parse_bracket_exp_espace:
3319   *err = REG_ESPACE;
3320  parse_bracket_exp_free_return:
3321   re_free (sbcset);
3322 #ifdef RE_ENABLE_I18N
3323   free_charset (mbcset);
3324 #endif /* RE_ENABLE_I18N */
3325   return NULL;
3326 }
3327
3328 /* Parse an element in the bracket expression.  */
3329
3330 static reg_errcode_t
3331 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3332                        re_token_t *token, int token_len, re_dfa_t *dfa,
3333                        reg_syntax_t syntax, int accept_hyphen)
3334 {
3335 #ifdef RE_ENABLE_I18N
3336   int cur_char_size;
3337   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3338   if (cur_char_size > 1)
3339     {
3340       elem->type = MB_CHAR;
3341       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3342       re_string_skip_bytes (regexp, cur_char_size);
3343       return REG_NOERROR;
3344     }
3345 #endif /* RE_ENABLE_I18N */
3346   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3347   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3348       || token->type == OP_OPEN_EQUIV_CLASS)
3349     return parse_bracket_symbol (elem, regexp, token);
3350   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3351     {
3352       /* A '-' must only appear as anything but a range indicator before
3353          the closing bracket.  Everything else is an error.  */
3354       re_token_t token2;
3355       (void) peek_token_bracket (&token2, regexp, syntax);
3356       if (token2.type != OP_CLOSE_BRACKET)
3357         /* The actual error value is not standardized since this whole
3358            case is undefined.  But ERANGE makes good sense.  */
3359         return REG_ERANGE;
3360     }
3361   elem->type = SB_CHAR;
3362   elem->opr.ch = token->opr.c;
3363   return REG_NOERROR;
3364 }
3365
3366 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3367    such as [:<character_class>:], [.<collating_element>.], and
3368    [=<equivalent_class>=].  */
3369
3370 static reg_errcode_t
3371 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3372                       re_token_t *token)
3373 {
3374   unsigned char ch, delim = token->opr.c;
3375   int i = 0;
3376   if (re_string_eoi(regexp))
3377     return REG_EBRACK;
3378   for (;; ++i)
3379     {
3380       if (i >= BRACKET_NAME_BUF_SIZE)
3381         return REG_EBRACK;
3382       if (token->type == OP_OPEN_CHAR_CLASS)
3383         ch = re_string_fetch_byte_case (regexp);
3384       else
3385         ch = re_string_fetch_byte (regexp);
3386       if (re_string_eoi(regexp))
3387         return REG_EBRACK;
3388       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3389         break;
3390       elem->opr.name[i] = ch;
3391     }
3392   re_string_skip_bytes (regexp, 1);
3393   elem->opr.name[i] = '\0';
3394   switch (token->type)
3395     {
3396     case OP_OPEN_COLL_ELEM:
3397       elem->type = COLL_SYM;
3398       break;
3399     case OP_OPEN_EQUIV_CLASS:
3400       elem->type = EQUIV_CLASS;
3401       break;
3402     case OP_OPEN_CHAR_CLASS:
3403       elem->type = CHAR_CLASS;
3404       break;
3405     default:
3406       break;
3407     }
3408   return REG_NOERROR;
3409 }
3410
3411   /* Helper function for parse_bracket_exp.
3412      Build the equivalence class which is represented by NAME.
3413      The result are written to MBCSET and SBCSET.
3414      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3415      is a pointer argument since we may update it.  */
3416
3417 static reg_errcode_t
3418 #ifdef RE_ENABLE_I18N
3419 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3420                    int *equiv_class_alloc, const unsigned char *name)
3421 #else /* not RE_ENABLE_I18N */
3422 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3423 #endif /* not RE_ENABLE_I18N */
3424 {
3425 #ifdef _LIBC
3426   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3427   if (nrules != 0)
3428     {
3429       const int32_t *table, *indirect;
3430       const unsigned char *weights, *extra, *cp;
3431       unsigned char char_buf[2];
3432       int32_t idx1, idx2;
3433       unsigned int ch;
3434       size_t len;
3435       /* Calculate the index for equivalence class.  */
3436       cp = name;
3437       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3438       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3439                                                _NL_COLLATE_WEIGHTMB);
3440       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3441                                                    _NL_COLLATE_EXTRAMB);
3442       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3443                                                 _NL_COLLATE_INDIRECTMB);
3444       idx1 = findidx (table, indirect, extra, &cp, -1);
3445       if (BE (idx1 == 0 || *cp != '\0', 0))
3446         /* This isn't a valid character.  */
3447         return REG_ECOLLATE;
3448
3449       /* Build single byte matcing table for this equivalence class.  */
3450       len = weights[idx1 & 0xffffff];
3451       for (ch = 0; ch < SBC_MAX; ++ch)
3452         {
3453           char_buf[0] = ch;
3454           cp = char_buf;
3455           idx2 = findidx (table, indirect, extra, &cp, 1);
3456 /*
3457           idx2 = table[ch];
3458 */
3459           if (idx2 == 0)
3460             /* This isn't a valid character.  */
3461             continue;
3462           /* Compare only if the length matches and the collation rule
3463              index is the same.  */
3464           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3465             {
3466               int cnt = 0;
3467
3468               while (cnt <= len &&
3469                      weights[(idx1 & 0xffffff) + 1 + cnt]
3470                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3471                 ++cnt;
3472
3473               if (cnt > len)
3474                 bitset_set (sbcset, ch);
3475             }
3476         }
3477       /* Check whether the array has enough space.  */
3478       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3479         {
3480           /* Not enough, realloc it.  */
3481           /* +1 in case of mbcset->nequiv_classes is 0.  */
3482           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3483           /* Use realloc since the array is NULL if *alloc == 0.  */
3484           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3485                                                    int32_t,
3486                                                    new_equiv_class_alloc);
3487           if (BE (new_equiv_classes == NULL, 0))
3488             return REG_ESPACE;
3489           mbcset->equiv_classes = new_equiv_classes;
3490           *equiv_class_alloc = new_equiv_class_alloc;
3491         }
3492       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3493     }
3494   else
3495 #endif /* _LIBC */
3496     {
3497       if (BE (strlen ((const char *) name) != 1, 0))
3498         return REG_ECOLLATE;
3499       bitset_set (sbcset, *name);
3500     }
3501   return REG_NOERROR;
3502 }
3503
3504   /* Helper function for parse_bracket_exp.
3505      Build the character class which is represented by NAME.
3506      The result are written to MBCSET and SBCSET.
3507      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3508      is a pointer argument since we may update it.  */
3509
3510 static reg_errcode_t
3511 #ifdef RE_ENABLE_I18N
3512 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3513                  re_charset_t *mbcset, int *char_class_alloc,
3514                  const unsigned char *class_name, reg_syntax_t syntax)
3515 #else /* not RE_ENABLE_I18N */
3516 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3517                  const unsigned char *class_name, reg_syntax_t syntax)
3518 #endif /* not RE_ENABLE_I18N */
3519 {
3520   int i;
3521   const char *name = (const char *) class_name;
3522
3523   /* In case of REG_ICASE "upper" and "lower" match the both of
3524      upper and lower cases.  */
3525   if ((syntax & RE_ICASE)
3526       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3527     name = "alpha";
3528
3529 #ifdef RE_ENABLE_I18N
3530   /* Check the space of the arrays.  */
3531   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3532     {
3533       /* Not enough, realloc it.  */
3534       /* +1 in case of mbcset->nchar_classes is 0.  */
3535       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3536       /* Use realloc since array is NULL if *alloc == 0.  */
3537       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3538                                                new_char_class_alloc);
3539       if (BE (new_char_classes == NULL, 0))
3540         return REG_ESPACE;
3541       mbcset->char_classes = new_char_classes;
3542       *char_class_alloc = new_char_class_alloc;
3543     }
3544   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3545 #endif /* RE_ENABLE_I18N */
3546
3547 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3548   do {                                          \
3549     if (BE (trans != NULL, 0))                  \
3550       {                                         \
3551         for (i = 0; i < SBC_MAX; ++i)           \
3552           if (ctype_func (i))                   \
3553             bitset_set (sbcset, trans[i]);      \
3554       }                                         \
3555     else                                        \
3556       {                                         \
3557         for (i = 0; i < SBC_MAX; ++i)           \
3558           if (ctype_func (i))                   \
3559             bitset_set (sbcset, i);             \
3560       }                                         \
3561   } while (0)
3562
3563   if (strcmp (name, "alnum") == 0)
3564     BUILD_CHARCLASS_LOOP (isalnum);
3565   else if (strcmp (name, "cntrl") == 0)
3566     BUILD_CHARCLASS_LOOP (iscntrl);
3567   else if (strcmp (name, "lower") == 0)
3568     BUILD_CHARCLASS_LOOP (islower);
3569   else if (strcmp (name, "space") == 0)
3570     BUILD_CHARCLASS_LOOP (isspace);
3571   else if (strcmp (name, "alpha") == 0)
3572     BUILD_CHARCLASS_LOOP (isalpha);
3573   else if (strcmp (name, "digit") == 0)
3574     BUILD_CHARCLASS_LOOP (isdigit);
3575   else if (strcmp (name, "print") == 0)
3576     BUILD_CHARCLASS_LOOP (isprint);
3577   else if (strcmp (name, "upper") == 0)
3578     BUILD_CHARCLASS_LOOP (isupper);
3579   else if (strcmp (name, "blank") == 0)
3580     BUILD_CHARCLASS_LOOP (isblank);
3581   else if (strcmp (name, "graph") == 0)
3582     BUILD_CHARCLASS_LOOP (isgraph);
3583   else if (strcmp (name, "punct") == 0)
3584     BUILD_CHARCLASS_LOOP (ispunct);
3585   else if (strcmp (name, "xdigit") == 0)
3586     BUILD_CHARCLASS_LOOP (isxdigit);
3587   else
3588     return REG_ECTYPE;
3589
3590   return REG_NOERROR;
3591 }
3592
3593 static bin_tree_t *
3594 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3595                     const unsigned char *class_name,
3596                     const unsigned char *extra, int non_match,
3597                     reg_errcode_t *err)
3598 {
3599   re_bitset_ptr_t sbcset;
3600 #ifdef RE_ENABLE_I18N
3601   re_charset_t *mbcset;
3602   int alloc = 0;
3603 #endif /* not RE_ENABLE_I18N */
3604   reg_errcode_t ret;
3605   re_token_t br_token;
3606   bin_tree_t *tree;
3607
3608   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3609 #ifdef RE_ENABLE_I18N
3610   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3611 #endif /* RE_ENABLE_I18N */
3612
3613 #ifdef RE_ENABLE_I18N
3614   if (BE (sbcset == NULL || mbcset == NULL, 0))
3615 #else /* not RE_ENABLE_I18N */
3616   if (BE (sbcset == NULL, 0))
3617 #endif /* not RE_ENABLE_I18N */
3618     {
3619       *err = REG_ESPACE;
3620       return NULL;
3621     }
3622
3623   if (non_match)
3624     {
3625 #ifdef RE_ENABLE_I18N
3626       mbcset->non_match = 1;
3627 #endif /* not RE_ENABLE_I18N */
3628     }
3629
3630   /* We don't care the syntax in this case.  */
3631   ret = build_charclass (trans, sbcset,
3632 #ifdef RE_ENABLE_I18N
3633                          mbcset, &alloc,
3634 #endif /* RE_ENABLE_I18N */
3635                          class_name, 0);
3636
3637   if (BE (ret != REG_NOERROR, 0))
3638     {
3639       re_free (sbcset);
3640 #ifdef RE_ENABLE_I18N
3641       free_charset (mbcset);
3642 #endif /* RE_ENABLE_I18N */
3643       *err = ret;
3644       return NULL;
3645     }
3646   /* \w match '_' also.  */
3647   for (; *extra; extra++)
3648     bitset_set (sbcset, *extra);
3649
3650   /* If it is non-matching list.  */
3651   if (non_match)
3652     bitset_not (sbcset);
3653
3654 #ifdef RE_ENABLE_I18N
3655   /* Ensure only single byte characters are set.  */
3656   if (dfa->mb_cur_max > 1)
3657     bitset_mask (sbcset, dfa->sb_char);
3658 #endif
3659
3660   /* Build a tree for simple bracket.  */
3661   br_token.type = SIMPLE_BRACKET;
3662   br_token.opr.sbcset = sbcset;
3663   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3664   if (BE (tree == NULL, 0))
3665     goto build_word_op_espace;
3666
3667 #ifdef RE_ENABLE_I18N
3668   if (dfa->mb_cur_max > 1)
3669     {
3670       bin_tree_t *mbc_tree;
3671       /* Build a tree for complex bracket.  */
3672       br_token.type = COMPLEX_BRACKET;
3673       br_token.opr.mbcset = mbcset;
3674       dfa->has_mb_node = 1;
3675       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3676       if (BE (mbc_tree == NULL, 0))
3677         goto build_word_op_espace;
3678       /* Then join them by ALT node.  */
3679       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3680       if (BE (mbc_tree != NULL, 1))
3681         return tree;
3682     }
3683   else
3684     {
3685       free_charset (mbcset);
3686       return tree;
3687     }
3688 #else /* not RE_ENABLE_I18N */
3689   return tree;
3690 #endif /* not RE_ENABLE_I18N */
3691
3692  build_word_op_espace:
3693   re_free (sbcset);
3694 #ifdef RE_ENABLE_I18N
3695   free_charset (mbcset);
3696 #endif /* RE_ENABLE_I18N */
3697   *err = REG_ESPACE;
3698   return NULL;
3699 }
3700
3701 /* This is intended for the expressions like "a{1,3}".
3702    Fetch a number from `input', and return the number.
3703    Return -1, if the number field is empty like "{,1}".
3704    Return -2, If an error is occured.  */
3705
3706 static int
3707 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3708 {
3709   int num = -1;
3710   unsigned char c;
3711   while (1)
3712     {
3713       fetch_token (token, input, syntax);
3714       c = token->opr.c;
3715       if (BE (token->type == END_OF_RE, 0))
3716         return -2;
3717       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3718         break;
3719       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3720              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3721       num = (num > RE_DUP_MAX) ? -2 : num;
3722     }
3723   return num;
3724 }
3725 \f
3726 #ifdef RE_ENABLE_I18N
3727 static void
3728 free_charset (re_charset_t *cset)
3729 {
3730   re_free (cset->mbchars);
3731 # ifdef _LIBC
3732   re_free (cset->coll_syms);
3733   re_free (cset->equiv_classes);
3734   re_free (cset->range_starts);
3735   re_free (cset->range_ends);
3736 # endif
3737   re_free (cset->char_classes);
3738   re_free (cset);
3739 }
3740 #endif /* RE_ENABLE_I18N */
3741 \f
3742 /* Functions for binary tree operation.  */
3743
3744 /* Create a tree node.  */
3745
3746 static bin_tree_t *
3747 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3748              re_token_type_t type)
3749 {
3750   re_token_t t;
3751   t.type = type;
3752   return create_token_tree (dfa, left, right, &t);
3753 }
3754
3755 static bin_tree_t *
3756 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3757                    const re_token_t *token)
3758 {
3759   bin_tree_t *tree;
3760   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3761     {
3762       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3763
3764       if (storage == NULL)
3765         return NULL;
3766       storage->next = dfa->str_tree_storage;
3767       dfa->str_tree_storage = storage;
3768       dfa->str_tree_storage_idx = 0;
3769     }
3770   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3771
3772   tree->parent = NULL;
3773   tree->left = left;
3774   tree->right = right;
3775   tree->token = *token;
3776   tree->token.duplicated = 0;
3777   tree->token.opt_subexp = 0;
3778   tree->first = NULL;
3779   tree->next = NULL;
3780   tree->node_idx = -1;
3781
3782   if (left != NULL)
3783     left->parent = tree;
3784   if (right != NULL)
3785     right->parent = tree;
3786   return tree;
3787 }
3788
3789 /* Mark the tree SRC as an optional subexpression.
3790    To be called from preorder or postorder.  */
3791
3792 static reg_errcode_t
3793 mark_opt_subexp (void *extra, bin_tree_t *node)
3794 {
3795   int idx = (int) (long) extra;
3796   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3797     node->token.opt_subexp = 1;
3798
3799   return REG_NOERROR;
3800 }
3801
3802 /* Free the allocated memory inside NODE. */
3803
3804 static void
3805 free_token (re_token_t *node)
3806 {
3807 #ifdef RE_ENABLE_I18N
3808   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3809     free_charset (node->opr.mbcset);
3810   else
3811 #endif /* RE_ENABLE_I18N */
3812     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3813       re_free (node->opr.sbcset);
3814 }
3815
3816 /* Worker function for tree walking.  Free the allocated memory inside NODE
3817    and its children. */
3818
3819 static reg_errcode_t
3820 free_tree (void *extra, bin_tree_t *node)
3821 {
3822   free_token (&node->token);
3823   return REG_NOERROR;
3824 }
3825
3826
3827 /* Duplicate the node SRC, and return new node.  This is a preorder
3828    visit similar to the one implemented by the generic visitor, but
3829    we need more infrastructure to maintain two parallel trees --- so,
3830    it's easier to duplicate.  */
3831
3832 static bin_tree_t *
3833 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3834 {
3835   const bin_tree_t *node;
3836   bin_tree_t *dup_root;
3837   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3838
3839   for (node = root; ; )
3840     {
3841       /* Create a new tree and link it back to the current parent.  */
3842       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3843       if (*p_new == NULL)
3844         return NULL;
3845       (*p_new)->parent = dup_node;
3846       (*p_new)->token.duplicated = 1;
3847       dup_node = *p_new;
3848
3849       /* Go to the left node, or up and to the right.  */
3850       if (node->left)
3851         {
3852           node = node->left;
3853           p_new = &dup_node->left;
3854         }
3855       else
3856         {
3857           const bin_tree_t *prev = NULL;
3858           while (node->right == prev || node->right == NULL)
3859             {
3860               prev = node;
3861               node = node->parent;
3862               dup_node = dup_node->parent;
3863               if (!node)
3864                 return dup_root;
3865             }
3866           node = node->right;
3867           p_new = &dup_node->right;
3868         }
3869     }
3870 }