posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 const char *
 213 re_compile_pattern (pattern, length, bufp)
 214     const char *pattern;
 215     size_t length;
 216     struct re_pattern_buffer *bufp;
 217 {
 218   reg_errcode_t ret;
 219
 220   /* And GNU code determines whether or not to get register information
 221      by passing null for the REGS argument to re_match, etc., not by
 222      setting no_sub, unless RE_NO_SUB is set.  */
 223   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 224
 225   /* Match anchors at newline.  */
 226   bufp->newline_anchor = 1;
 227
 228   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 229
 230   if (!ret)
 231     return NULL;
 232   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 233 }
 234 #ifdef _LIBC
 235 weak_alias (__re_compile_pattern, re_compile_pattern)
 236 #endif
 237
 238 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 239    also be assigned to arbitrarily: each pattern buffer stores its own
 240    syntax, so it can be changed between regex compilations.  */
 241 /* This has no initializer because initialized variables in Emacs
 242    become read-only after dumping.  */
 243 reg_syntax_t re_syntax_options;
 244
 245
 246 /* Specify the precise syntax of regexps for compilation.  This provides
 247    for compatibility for various utilities which historically have
 248    different, incompatible syntaxes.
 249
 250    The argument SYNTAX is a bit mask comprised of the various bits
 251    defined in regex.h.  We return the old syntax.  */
 252
 253 reg_syntax_t
 254 re_set_syntax (syntax)
 255     reg_syntax_t syntax;
 256 {
 257   reg_syntax_t ret = re_syntax_options;
 258
 259   re_syntax_options = syntax;
 260   return ret;
 261 }
 262 #ifdef _LIBC
 263 weak_alias (__re_set_syntax, re_set_syntax)
 264 #endif
 265
 266 int
 267 re_compile_fastmap (bufp)
 268     struct re_pattern_buffer *bufp;
 269 {
 270   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute ((always_inline))
 290 re_set_fastmap (char *fastmap, int icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 305   int node_cnt;
 306   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       int node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 319               wchar_t wc;
 320               mbstate_t state;
 321
 322               p = buf;
 323               *p++ = dfa->nodes[node].opr.c;
 324               while (++node < dfa->nodes_len
 325                      && dfa->nodes[node].type == CHARACTER
 326                      && dfa->nodes[node].mb_partial)
 327                 *p++ = dfa->nodes[node].opr.c;
 328               memset (&state, '\0', sizeof (state));
 329               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 330                              &state) == p - buf
 331                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 332                       != (size_t) -1))
 333                 re_set_fastmap (fastmap, 0, buf[0]);
 334             }
 335 #endif
 336         }
 337       else if (type == SIMPLE_BRACKET)
 338         {
 339           int i, ch;
 340           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 341             {
 342               int j;
 343               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 344               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 345                 if (w & ((bitset_word_t) 1 << j))
 346                   re_set_fastmap (fastmap, icase, ch);
 347             }
 348         }
 349 #ifdef RE_ENABLE_I18N
 350       else if (type == COMPLEX_BRACKET)
 351         {
 352           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 353           int i;
 354
 355 # ifdef _LIBC
 356           /* See if we have to try all bytes which start multiple collation
 357              elements.
 358              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 359                   collation element, and don't catch 'b' since 'b' is
 360                   the only collation element which starts from 'b' (and
 361                   it is caught by SIMPLE_BRACKET).  */
 362               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 363                   && (cset->ncoll_syms || cset->nranges))
 364                 {
 365                   const int32_t *table = (const int32_t *)
 366                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 367                   for (i = 0; i < SBC_MAX; ++i)
 368                     if (table[i] < 0)
 369                       re_set_fastmap (fastmap, icase, i);
 370                 }
 371 # endif /* _LIBC */
 372
 373           /* See if we have to start the match at all multibyte characters,
 374              i.e. where we would not find an invalid sequence.  This only
 375              applies to multibyte character sets; for single byte character
 376              sets, the SIMPLE_BRACKET again suffices.  */
 377           if (dfa->mb_cur_max > 1
 378               && (cset->nchar_classes || cset->non_match || cset->nranges
 379 # ifdef _LIBC
 380                   || cset->nequiv_classes
 381 # endif /* _LIBC */
 382                  ))
 383             {
 384               unsigned char c = 0;
 385               do
 386                 {
 387                   mbstate_t mbs;
 388                   memset (&mbs, 0, sizeof (mbs));
 389                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 390                     re_set_fastmap (fastmap, false, (int) c);
 391                 }
 392               while (++c != 0);
 393             }
 394
 395           else
 396             {
 397               /* ... Else catch all bytes which can start the mbchars.  */
 398               for (i = 0; i < cset->nmbchars; ++i)
 399                 {
 400                   char buf[256];
 401                   mbstate_t state;
 402                   memset (&state, '\0', sizeof (state));
 403                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 404                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 405                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 406                     {
 407                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 408                           != (size_t) -1)
 409                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 410                     }
 411                 }
 412             }
 413         }
 414 #endif /* RE_ENABLE_I18N */
 415       else if (type == OP_PERIOD
 416 #ifdef RE_ENABLE_I18N
 417                || type == OP_UTF8_PERIOD
 418 #endif /* RE_ENABLE_I18N */
 419                || type == END_OF_RE)
 420         {
 421           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 422           if (type == END_OF_RE)
 423             bufp->can_be_null = 1;
 424           return;
 425         }
 426     }
 427 }
 428 \f
 429 /* Entry point for POSIX code.  */
 430 /* regcomp takes a regular expression as a string and compiles it.
 431
 432    PREG is a regex_t *.  We do not expect any fields to be initialized,
 433    since POSIX says we shouldn't.  Thus, we set
 434
 435      `buffer' to the compiled pattern;
 436      `used' to the length of the compiled pattern;
 437      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 438        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 439        RE_SYNTAX_POSIX_BASIC;
 440      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 441      `fastmap' to an allocated space for the fastmap;
 442      `fastmap_accurate' to zero;
 443      `re_nsub' to the number of subexpressions in PATTERN.
 444
 445    PATTERN is the address of the pattern string.
 446
 447    CFLAGS is a series of bits which affect compilation.
 448
 449      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 450      use POSIX basic syntax.
 451
 452      If REG_NEWLINE is set, then . and [^...] don't match newline.
 453      Also, regexec will try a match beginning after every newline.
 454
 455      If REG_ICASE is set, then we considers upper- and lowercase
 456      versions of letters to be equivalent when matching.
 457
 458      If REG_NOSUB is set, then when PREG is passed to regexec, that
 459      routine will report only success or failure, and nothing about the
 460      registers.
 461
 462    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 463    the return codes and their meanings.)  */
 464
 465 int
 466 regcomp (preg, pattern, cflags)
 467     regex_t *__restrict preg;
 468     const char *__restrict pattern;
 469     int cflags;
 470 {
 471   reg_errcode_t ret;
 472   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 473                          : RE_SYNTAX_POSIX_BASIC);
 474
 475   preg->buffer = NULL;
 476   preg->allocated = 0;
 477   preg->used = 0;
 478
 479   /* Try to allocate space for the fastmap.  */
 480   preg->fastmap = re_malloc (char, SBC_MAX);
 481   if (BE (preg->fastmap == NULL, 0))
 482     return REG_ESPACE;
 483
 484   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 485
 486   /* If REG_NEWLINE is set, newlines are treated differently.  */
 487   if (cflags & REG_NEWLINE)
 488     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 489       syntax &= ~RE_DOT_NEWLINE;
 490       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 491       /* It also changes the matching behavior.  */
 492       preg->newline_anchor = 1;
 493     }
 494   else
 495     preg->newline_anchor = 0;
 496   preg->no_sub = !!(cflags & REG_NOSUB);
 497   preg->translate = NULL;
 498
 499   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 500
 501   /* POSIX doesn't distinguish between an unmatched open-group and an
 502      unmatched close-group: both are REG_EPAREN.  */
 503   if (ret == REG_ERPAREN)
 504     ret = REG_EPAREN;
 505
 506   /* We have already checked preg->fastmap != NULL.  */
 507   if (BE (ret == REG_NOERROR, 1))
 508     /* Compute the fastmap now, since regexec cannot modify the pattern
 509        buffer.  This function never fails in this implementation.  */
 510     (void) re_compile_fastmap (preg);
 511   else
 512     {
 513       /* Some error occurred while compiling the expression.  */
 514       re_free (preg->fastmap);
 515       preg->fastmap = NULL;
 516     }
 517
 518   return (int) ret;
 519 }
 520 #ifdef _LIBC
 521 weak_alias (__regcomp, regcomp)
 522 #endif
 523
 524 /* Returns a message corresponding to an error code, ERRCODE, returned
 525    from either regcomp or regexec.   We don't use PREG here.  */
 526
 527 size_t
 528 regerror (errcode, preg, errbuf, errbuf_size)
 529     int errcode;
 530     const regex_t *__restrict preg;
 531     char *__restrict errbuf;
 532     size_t errbuf_size;
 533 {
 534   const char *msg;
 535   size_t msg_size;
 536
 537   if (BE (errcode < 0
 538           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 539                                / sizeof (__re_error_msgid_idx[0])), 0))
 540     /* Only error codes returned by the rest of the code should be passed
 541        to this routine.  If we are given anything else, or if other regex
 542        code generates an invalid error code, then the program has a bug.
 543        Dump core so we can fix it.  */
 544     abort ();
 545
 546   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 547
 548   msg_size = strlen (msg) + 1; /* Includes the null.  */
 549
 550   if (BE (errbuf_size != 0, 1))
 551     {
 552       if (BE (msg_size > errbuf_size, 0))
 553         {
 554 #if defined HAVE_MEMPCPY || defined _LIBC
 555           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 556 #else
 557           memcpy (errbuf, msg, errbuf_size - 1);
 558           errbuf[errbuf_size - 1] = 0;
 559 #endif
 560         }
 561       else
 562         memcpy (errbuf, msg, msg_size);
 563     }
 564
 565   return msg_size;
 566 }
 567 #ifdef _LIBC
 568 weak_alias (__regerror, regerror)
 569 #endif
 570
 571
 572 #ifdef RE_ENABLE_I18N
 573 /* This static array is used for the map to single-byte characters when
 574    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 575    it the same all the time.  UTF-8 is the preferred encoding so this is
 576    a worthwhile optimization.  */
 577 static const bitset_t utf8_sb_map =
 578 {
 579   /* Set the first 128 bits.  */
 580   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 581 };
 582 #endif
 583
 584
 585 static void
 586 free_dfa_content (re_dfa_t *dfa)
 587 {
 588   int i, j;
 589
 590   if (dfa->nodes)
 591     for (i = 0; i < dfa->nodes_len; ++i)
 592       free_token (dfa->nodes + i);
 593   re_free (dfa->nexts);
 594   for (i = 0; i < dfa->nodes_len; ++i)
 595     {
 596       if (dfa->eclosures != NULL)
 597         re_node_set_free (dfa->eclosures + i);
 598       if (dfa->inveclosures != NULL)
 599         re_node_set_free (dfa->inveclosures + i);
 600       if (dfa->edests != NULL)
 601         re_node_set_free (dfa->edests + i);
 602     }
 603   re_free (dfa->edests);
 604   re_free (dfa->eclosures);
 605   re_free (dfa->inveclosures);
 606   re_free (dfa->nodes);
 607
 608   if (dfa->state_table)
 609     for (i = 0; i <= dfa->state_hash_mask; ++i)
 610       {
 611         struct re_state_table_entry *entry = dfa->state_table + i;
 612         for (j = 0; j < entry->num; ++j)
 613           {
 614             re_dfastate_t *state = entry->array[j];
 615             free_state (state);
 616           }
 617         re_free (entry->array);
 618       }
 619   re_free (dfa->state_table);
 620 #ifdef RE_ENABLE_I18N
 621   if (dfa->sb_char != utf8_sb_map)
 622     re_free (dfa->sb_char);
 623 #endif
 624   re_free (dfa->subexp_map);
 625 #ifdef DEBUG
 626   re_free (dfa->re_str);
 627 #endif
 628
 629   re_free (dfa);
 630 }
 631
 632
 633 /* Free dynamically allocated space used by PREG.  */
 634
 635 void
 636 regfree (preg)
 637     regex_t *preg;
 638 {
 639   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 640   if (BE (dfa != NULL, 1))
 641     free_dfa_content (dfa);
 642   preg->buffer = NULL;
 643   preg->allocated = 0;
 644
 645   re_free (preg->fastmap);
 646   preg->fastmap = NULL;
 647
 648   re_free (preg->translate);
 649   preg->translate = NULL;
 650 }
 651 #ifdef _LIBC
 652 weak_alias (__regfree, regfree)
 653 #endif
 654 \f
 655 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 656    them unless specifically requested.  */
 657
 658 #if defined _REGEX_RE_COMP || defined _LIBC
 659
 660 /* BSD has one and only one pattern buffer.  */
 661 static struct re_pattern_buffer re_comp_buf;
 662
 663 char *
 664 # ifdef _LIBC
 665 /* Make these definitions weak in libc, so POSIX programs can redefine
 666    these names if they don't use our functions, and still use
 667    regcomp/regexec above without link errors.  */
 668 weak_function
 669 # endif
 670 re_comp (s)
 671      const char *s;
 672 {
 673   reg_errcode_t ret;
 674   char *fastmap;
 675
 676   if (!s)
 677     {
 678       if (!re_comp_buf.buffer)
 679         return gettext ("No previous regular expression");
 680       return 0;
 681     }
 682
 683   if (re_comp_buf.buffer)
 684     {
 685       fastmap = re_comp_buf.fastmap;
 686       re_comp_buf.fastmap = NULL;
 687       __regfree (&re_comp_buf);
 688       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 689       re_comp_buf.fastmap = fastmap;
 690     }
 691
 692   if (re_comp_buf.fastmap == NULL)
 693     {
 694       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 695       if (re_comp_buf.fastmap == NULL)
 696         return (char *) gettext (__re_error_msgid
 697                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 698     }
 699
 700   /* Since `re_exec' always passes NULL for the `regs' argument, we
 701      don't need to initialize the pattern buffer fields which affect it.  */
 702
 703   /* Match anchors at newlines.  */
 704   re_comp_buf.newline_anchor = 1;
 705
 706   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 707
 708   if (!ret)
 709     return NULL;
 710
 711   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 712   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 713 }
 714
 715 #ifdef _LIBC
 716 libc_freeres_fn (free_mem)
 717 {
 718   __regfree (&re_comp_buf);
 719 }
 720 #endif
 721
 722 #endif /* _REGEX_RE_COMP */
 723 \f
 724 /* Internal entry point.
 725    Compile the regular expression PATTERN, whose length is LENGTH.
 726    SYNTAX indicate regular expression's syntax.  */
 727
 728 static reg_errcode_t
 729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 730                      reg_syntax_t syntax)
 731 {
 732   reg_errcode_t err = REG_NOERROR;
 733   re_dfa_t *dfa;
 734   re_string_t regexp;
 735
 736   /* Initialize the pattern buffer.  */
 737   preg->fastmap_accurate = 0;
 738   preg->syntax = syntax;
 739   preg->not_bol = preg->not_eol = 0;
 740   preg->used = 0;
 741   preg->re_nsub = 0;
 742   preg->can_be_null = 0;
 743   preg->regs_allocated = REGS_UNALLOCATED;
 744
 745   /* Initialize the dfa.  */
 746   dfa = (re_dfa_t *) preg->buffer;
 747   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 748     {
 749       /* If zero allocated, but buffer is non-null, try to realloc
 750          enough space.  This loses if buffer's address is bogus, but
 751          that is the user's responsibility.  If ->buffer is NULL this
 752          is a simple allocation.  */
 753       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 754       if (dfa == NULL)
 755         return REG_ESPACE;
 756       preg->allocated = sizeof (re_dfa_t);
 757       preg->buffer = (unsigned char *) dfa;
 758     }
 759   preg->used = sizeof (re_dfa_t);
 760
 761   err = init_dfa (dfa, length);
 762   if (BE (err != REG_NOERROR, 0))
 763     {
 764       free_dfa_content (dfa);
 765       preg->buffer = NULL;
 766       preg->allocated = 0;
 767       return err;
 768     }
 769 #ifdef DEBUG
 770   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 771   dfa->re_str = re_malloc (char, length + 1);
 772   strncpy (dfa->re_str, pattern, length + 1);
 773 #endif
 774
 775   __libc_lock_init (dfa->lock);
 776
 777   err = re_string_construct (&regexp, pattern, length, preg->translate,
 778                              syntax & RE_ICASE, dfa);
 779   if (BE (err != REG_NOERROR, 0))
 780     {
 781     re_compile_internal_free_return:
 782       free_workarea_compile (preg);
 783       re_string_destruct (&regexp);
 784       free_dfa_content (dfa);
 785       preg->buffer = NULL;
 786       preg->allocated = 0;
 787       return err;
 788     }
 789
 790   /* Parse the regular expression, and build a structure tree.  */
 791   preg->re_nsub = 0;
 792   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 793   if (BE (dfa->str_tree == NULL, 0))
 794     goto re_compile_internal_free_return;
 795
 796   /* Analyze the tree and create the nfa.  */
 797   err = analyze (preg);
 798   if (BE (err != REG_NOERROR, 0))
 799     goto re_compile_internal_free_return;
 800
 801 #ifdef RE_ENABLE_I18N
 802   /* If possible, do searching in single byte encoding to speed things up.  */
 803   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 804     optimize_utf8 (dfa);
 805 #endif
 806
 807   /* Then create the initial state of the dfa.  */
 808   err = create_initial_state (dfa);
 809
 810   /* Release work areas.  */
 811   free_workarea_compile (preg);
 812   re_string_destruct (&regexp);
 813
 814   if (BE (err != REG_NOERROR, 0))
 815     {
 816       free_dfa_content (dfa);
 817       preg->buffer = NULL;
 818       preg->allocated = 0;
 819     }
 820
 821   return err;
 822 }
 823
 824 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 825    as the initial length of some arrays.  */
 826
 827 static reg_errcode_t
 828 init_dfa (re_dfa_t *dfa, size_t pat_len)
 829 {
 830   unsigned int table_size;
 831 #ifndef _LIBC
 832   char *codeset_name;
 833 #endif
 834
 835   memset (dfa, '\0', sizeof (re_dfa_t));
 836
 837   /* Force allocation of str_tree_storage the first time.  */
 838   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 839
 840   /* Avoid overflows.  */
 841   if (pat_len == SIZE_MAX)
 842     return REG_ESPACE;
 843
 844   dfa->nodes_alloc = pat_len + 1;
 845   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 846
 847   /*  table_size = 2 ^ ceil(log pat_len) */
 848   for (table_size = 1; ; table_size <<= 1)
 849     if (table_size > pat_len)
 850       break;
 851
 852   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 853   dfa->state_hash_mask = table_size - 1;
 854
 855   dfa->mb_cur_max = MB_CUR_MAX;
 856 #ifdef _LIBC
 857   if (dfa->mb_cur_max == 6
 858       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 859     dfa->is_utf8 = 1;
 860   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 861                        != 0);
 862 #else
 863 # ifdef HAVE_LANGINFO_CODESET
 864   codeset_name = nl_langinfo (CODESET);
 865 # else
 866   codeset_name = getenv ("LC_ALL");
 867   if (codeset_name == NULL || codeset_name[0] == '\0')
 868     codeset_name = getenv ("LC_CTYPE");
 869   if (codeset_name == NULL || codeset_name[0] == '\0')
 870     codeset_name = getenv ("LANG");
 871   if (codeset_name == NULL)
 872     codeset_name = "";
 873   else if (strchr (codeset_name, '.') !=  NULL)
 874     codeset_name = strchr (codeset_name, '.') + 1;
 875 # endif
 876
 877   if (strcasecmp (codeset_name, "UTF-8") == 0
 878       || strcasecmp (codeset_name, "UTF8") == 0)
 879     dfa->is_utf8 = 1;
 880
 881   /* We check exhaustively in the loop below if this charset is a
 882      superset of ASCII.  */
 883   dfa->map_notascii = 0;
 884 #endif
 885
 886 #ifdef RE_ENABLE_I18N
 887   if (dfa->mb_cur_max > 1)
 888     {
 889       if (dfa->is_utf8)
 890         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 891       else
 892         {
 893           int i, j, ch;
 894
 895           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 896           if (BE (dfa->sb_char == NULL, 0))
 897             return REG_ESPACE;
 898
 899           /* Set the bits corresponding to single byte chars.  */
 900           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 901             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 902               {
 903                 wint_t wch = __btowc (ch);
 904                 if (wch != WEOF)
 905                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 906 # ifndef _LIBC
 907                 if (isascii (ch) && wch != ch)
 908                   dfa->map_notascii = 1;
 909 # endif
 910               }
 911         }
 912     }
 913 #endif
 914
 915   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 916     return REG_ESPACE;
 917   return REG_NOERROR;
 918 }
 919
 920 /* Initialize WORD_CHAR table, which indicate which character is
 921    "word".  In this case "word" means that it is the word construction
 922    character used by some operators like "\<", "\>", etc.  */
 923
 924 static void
 925 internal_function
 926 init_word_char (re_dfa_t *dfa)
 927 {
 928   dfa->word_ops_used = 1;
 929   int i = 0;
 930   int ch = 0;
 931   if (BE (dfa->map_notascii == 0, 1))
 932     {
 933       if (sizeof (dfa->word_char[0]) == 8)
 934         {
 935           /* The extra temporaries here avoid "implicitly truncated"
 936              warnings in the case when this is dead code, i.e. 32-bit.  */
 937           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
 938           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
 939           dfa->word_char[0] = wc0;
 940           dfa->word_char[1] = wc1;
 941           i = 2;
 942         }
 943       else if (sizeof (dfa->word_char[0]) == 4)
 944         {
 945           dfa->word_char[0] = UINT32_C (0x00000000);
 946           dfa->word_char[1] = UINT32_C (0x03ff0000);
 947           dfa->word_char[2] = UINT32_C (0x87fffffe);
 948           dfa->word_char[3] = UINT32_C (0x07fffffe);
 949           i = 4;
 950         }
 951       else
 952         abort ();
 953       ch = 128;
 954
 955       if (BE (dfa->is_utf8, 1))
 956         {
 957           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 958           return;
 959         }
 960     }
 961
 962   for (; i < BITSET_WORDS; ++i)
 963     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 964       if (isalnum (ch) || ch == '_')
 965         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 966 }
 967
 968 /* Free the work area which are only used while compiling.  */
 969
 970 static void
 971 free_workarea_compile (regex_t *preg)
 972 {
 973   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 974   bin_tree_storage_t *storage, *next;
 975   for (storage = dfa->str_tree_storage; storage; storage = next)
 976     {
 977       next = storage->next;
 978       re_free (storage);
 979     }
 980   dfa->str_tree_storage = NULL;
 981   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 982   dfa->str_tree = NULL;
 983   re_free (dfa->org_indices);
 984   dfa->org_indices = NULL;
 985 }
 986
 987 /* Create initial states for all contexts.  */
 988
 989 static reg_errcode_t
 990 create_initial_state (re_dfa_t *dfa)
 991 {
 992   int first, i;
 993   reg_errcode_t err;
 994   re_node_set init_nodes;
 995
 996   /* Initial states have the epsilon closure of the node which is
 997      the first node of the regular expression.  */
 998   first = dfa->str_tree->first->node_idx;
 999   dfa->init_node = first;
1000   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1001   if (BE (err != REG_NOERROR, 0))
1002     return err;
1003
1004   /* The back-references which are in initial states can epsilon transit,
1005      since in this case all of the subexpressions can be null.
1006      Then we add epsilon closures of the nodes which are the next nodes of
1007      the back-references.  */
1008   if (dfa->nbackref > 0)
1009     for (i = 0; i < init_nodes.nelem; ++i)
1010       {
1011         int node_idx = init_nodes.elems[i];
1012         re_token_type_t type = dfa->nodes[node_idx].type;
1013
1014         int clexp_idx;
1015         if (type != OP_BACK_REF)
1016           continue;
1017         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1018           {
1019             re_token_t *clexp_node;
1020             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1021             if (clexp_node->type == OP_CLOSE_SUBEXP
1022                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1023               break;
1024           }
1025         if (clexp_idx == init_nodes.nelem)
1026           continue;
1027
1028         if (type == OP_BACK_REF)
1029           {
1030             int dest_idx = dfa->edests[node_idx].elems[0];
1031             if (!re_node_set_contains (&init_nodes, dest_idx))
1032               {
1033                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1034                                                        dfa->eclosures
1035                                                        + dest_idx);
1036                 if (err != REG_NOERROR)
1037                   return err;
1038                 i = 0;
1039               }
1040           }
1041       }
1042
1043   /* It must be the first time to invoke acquire_state.  */
1044   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1045   /* We don't check ERR here, since the initial state must not be NULL.  */
1046   if (BE (dfa->init_state == NULL, 0))
1047     return err;
1048   if (dfa->init_state->has_constraint)
1049     {
1050       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1051                                                        CONTEXT_WORD);
1052       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1053                                                      CONTEXT_NEWLINE);
1054       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1055                                                          &init_nodes,
1056                                                          CONTEXT_NEWLINE
1057                                                          | CONTEXT_BEGBUF);
1058       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1059               || dfa->init_state_begbuf == NULL, 0))
1060         return err;
1061     }
1062   else
1063     dfa->init_state_word = dfa->init_state_nl
1064       = dfa->init_state_begbuf = dfa->init_state;
1065
1066   re_node_set_free (&init_nodes);
1067   return REG_NOERROR;
1068 }
1069 \f
1070 #ifdef RE_ENABLE_I18N
1071 /* If it is possible to do searching in single byte encoding instead of UTF-8
1072    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073    DFA nodes where needed.  */
1074
1075 static void
1076 optimize_utf8 (re_dfa_t *dfa)
1077 {
1078   int node, i, mb_chars = 0, has_period = 0;
1079
1080   for (node = 0; node < dfa->nodes_len; ++node)
1081     switch (dfa->nodes[node].type)
1082       {
1083       case CHARACTER:
1084         if (dfa->nodes[node].opr.c >= 0x80)
1085           mb_chars = 1;
1086         break;
1087       case ANCHOR:
1088         switch (dfa->nodes[node].opr.ctx_type)
1089           {
1090           case LINE_FIRST:
1091           case LINE_LAST:
1092           case BUF_FIRST:
1093           case BUF_LAST:
1094             break;
1095           default:
1096             /* Word anchors etc. cannot be handled.  It's okay to test
1097                opr.ctx_type since constraints (for all DFA nodes) are
1098                created by ORing one or more opr.ctx_type values.  */
1099             return;
1100           }
1101         break;
1102       case OP_PERIOD:
1103         has_period = 1;
1104         break;
1105       case OP_BACK_REF:
1106       case OP_ALT:
1107       case END_OF_RE:
1108       case OP_DUP_ASTERISK:
1109       case OP_OPEN_SUBEXP:
1110       case OP_CLOSE_SUBEXP:
1111         break;
1112       case COMPLEX_BRACKET:
1113         return;
1114       case SIMPLE_BRACKET:
1115         /* Just double check.  The non-ASCII range starts at 0x80.  */
1116         assert (0x80 % BITSET_WORD_BITS == 0);
1117         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1118           if (dfa->nodes[node].opr.sbcset[i])
1119             return;
1120         break;
1121       default:
1122         abort ();
1123       }
1124
1125   if (mb_chars || has_period)
1126     for (node = 0; node < dfa->nodes_len; ++node)
1127       {
1128         if (dfa->nodes[node].type == CHARACTER
1129             && dfa->nodes[node].opr.c >= 0x80)
1130           dfa->nodes[node].mb_partial = 0;
1131         else if (dfa->nodes[node].type == OP_PERIOD)
1132           dfa->nodes[node].type = OP_UTF8_PERIOD;
1133       }
1134
1135   /* The search can be in single byte locale.  */
1136   dfa->mb_cur_max = 1;
1137   dfa->is_utf8 = 0;
1138   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1139 }
1140 #endif
1141 \f
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143    "eclosure", and "inveclosure".  */
1144
1145 static reg_errcode_t
1146 analyze (regex_t *preg)
1147 {
1148   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149   reg_errcode_t ret;
1150
1151   /* Allocate arrays.  */
1152   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1153   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1154   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157           || dfa->eclosures == NULL, 0))
1158     return REG_ESPACE;
1159
1160   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1161   if (dfa->subexp_map != NULL)
1162     {
1163       int i;
1164       for (i = 0; i < preg->re_nsub; i++)
1165         dfa->subexp_map[i] = i;
1166       preorder (dfa->str_tree, optimize_subexps, dfa);
1167       for (i = 0; i < preg->re_nsub; i++)
1168         if (dfa->subexp_map[i] != i)
1169           break;
1170       if (i == preg->re_nsub)
1171         {
1172           free (dfa->subexp_map);
1173           dfa->subexp_map = NULL;
1174         }
1175     }
1176
1177   ret = postorder (dfa->str_tree, lower_subexps, preg);
1178   if (BE (ret != REG_NOERROR, 0))
1179     return ret;
1180   ret = postorder (dfa->str_tree, calc_first, dfa);
1181   if (BE (ret != REG_NOERROR, 0))
1182     return ret;
1183   preorder (dfa->str_tree, calc_next, dfa);
1184   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185   if (BE (ret != REG_NOERROR, 0))
1186     return ret;
1187   ret = calc_eclosure (dfa);
1188   if (BE (ret != REG_NOERROR, 0))
1189     return ret;
1190
1191   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1193   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194       || dfa->nbackref)
1195     {
1196       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197       if (BE (dfa->inveclosures == NULL, 0))
1198         return REG_ESPACE;
1199       ret = calc_inveclosure (dfa);
1200     }
1201
1202   return ret;
1203 }
1204
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206    implement parse tree visits.  Instead, we use parent pointers and
1207    some hairy code in these two functions.  */
1208 static reg_errcode_t
1209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210            void *extra)
1211 {
1212   bin_tree_t *node, *prev;
1213
1214   for (node = root; ; )
1215     {
1216       /* Descend down the tree, preferably to the left (or to the right
1217          if that's the only child).  */
1218       while (node->left || node->right)
1219         if (node->left)
1220           node = node->left;
1221         else
1222           node = node->right;
1223
1224       do
1225         {
1226           reg_errcode_t err = fn (extra, node);
1227           if (BE (err != REG_NOERROR, 0))
1228             return err;
1229           if (node->parent == NULL)
1230             return REG_NOERROR;
1231           prev = node;
1232           node = node->parent;
1233         }
1234       /* Go up while we have a node that is reached from the right.  */
1235       while (node->right == prev || node->right == NULL);
1236       node = node->right;
1237     }
1238 }
1239
1240 static reg_errcode_t
1241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242           void *extra)
1243 {
1244   bin_tree_t *node;
1245
1246   for (node = root; ; )
1247     {
1248       reg_errcode_t err = fn (extra, node);
1249       if (BE (err != REG_NOERROR, 0))
1250         return err;
1251
1252       /* Go to the left node, or up and to the right.  */
1253       if (node->left)
1254         node = node->left;
1255       else
1256         {
1257           bin_tree_t *prev = NULL;
1258           while (node->right == prev || node->right == NULL)
1259             {
1260               prev = node;
1261               node = node->parent;
1262               if (!node)
1263                 return REG_NOERROR;
1264             }
1265           node = node->right;
1266         }
1267     }
1268 }
1269
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1272    backreferences as well.  Requires a preorder visit.  */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra, bin_tree_t *node)
1275 {
1276   re_dfa_t *dfa = (re_dfa_t *) extra;
1277
1278   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279     {
1280       int idx = node->token.opr.idx;
1281       node->token.opr.idx = dfa->subexp_map[idx];
1282       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283     }
1284
1285   else if (node->token.type == SUBEXP
1286            && node->left && node->left->token.type == SUBEXP)
1287     {
1288       int other_idx = node->left->token.opr.idx;
1289
1290       node->left = node->left->left;
1291       if (node->left)
1292         node->left->parent = node;
1293
1294       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295       if (other_idx < BITSET_WORD_BITS)
1296           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1297     }
1298
1299   return REG_NOERROR;
1300 }
1301
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1304 static reg_errcode_t
1305 lower_subexps (void *extra, bin_tree_t *node)
1306 {
1307   regex_t *preg = (regex_t *) extra;
1308   reg_errcode_t err = REG_NOERROR;
1309
1310   if (node->left && node->left->token.type == SUBEXP)
1311     {
1312       node->left = lower_subexp (&err, preg, node->left);
1313       if (node->left)
1314         node->left->parent = node;
1315     }
1316   if (node->right && node->right->token.type == SUBEXP)
1317     {
1318       node->right = lower_subexp (&err, preg, node->right);
1319       if (node->right)
1320         node->right->parent = node;
1321     }
1322
1323   return err;
1324 }
1325
1326 static bin_tree_t *
1327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328 {
1329   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330   bin_tree_t *body = node->left;
1331   bin_tree_t *op, *cls, *tree1, *tree;
1332
1333   if (preg->no_sub
1334       /* We do not optimize empty subexpressions, because otherwise we may
1335          have bad CONCAT nodes with NULL children.  This is obviously not
1336          very common, so we do not lose much.  An example that triggers
1337          this case is the sed "script" /\(\)/x.  */
1338       && node->left != NULL
1339       && (node->token.opr.idx >= BITSET_WORD_BITS
1340           || !(dfa->used_bkref_map
1341                & ((bitset_word_t) 1 << node->token.opr.idx))))
1342     return node->left;
1343
1344   /* Convert the SUBEXP node to the concatenation of an
1345      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1346   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349   tree = create_tree (dfa, op, tree1, CONCAT);
1350   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351     {
1352       *err = REG_ESPACE;
1353       return NULL;
1354     }
1355
1356   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358   return tree;
1359 }
1360
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362    nodes.  Requires a postorder visit.  */
1363 static reg_errcode_t
1364 calc_first (void *extra, bin_tree_t *node)
1365 {
1366   re_dfa_t *dfa = (re_dfa_t *) extra;
1367   if (node->token.type == CONCAT)
1368     {
1369       node->first = node->left->first;
1370       node->node_idx = node->left->node_idx;
1371     }
1372   else
1373     {
1374       node->first = node;
1375       node->node_idx = re_dfa_add_node (dfa, node->token);
1376       if (BE (node->node_idx == -1, 0))
1377         return REG_ESPACE;
1378       if (node->token.type == ANCHOR)
1379         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380     }
1381   return REG_NOERROR;
1382 }
1383
1384 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1385 static reg_errcode_t
1386 calc_next (void *extra, bin_tree_t *node)
1387 {
1388   switch (node->token.type)
1389     {
1390     case OP_DUP_ASTERISK:
1391       node->left->next = node;
1392       break;
1393     case CONCAT:
1394       node->left->next = node->right->first;
1395       node->right->next = node->next;
1396       break;
1397     default:
1398       if (node->left)
1399         node->left->next = node->next;
1400       if (node->right)
1401         node->right->next = node->next;
1402       break;
1403     }
1404   return REG_NOERROR;
1405 }
1406
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra, bin_tree_t *node)
1410 {
1411   re_dfa_t *dfa = (re_dfa_t *) extra;
1412   int idx = node->node_idx;
1413   reg_errcode_t err = REG_NOERROR;
1414
1415   switch (node->token.type)
1416     {
1417     case CONCAT:
1418       break;
1419
1420     case END_OF_RE:
1421       assert (node->next == NULL);
1422       break;
1423
1424     case OP_DUP_ASTERISK:
1425     case OP_ALT:
1426       {
1427         int left, right;
1428         dfa->has_plural_match = 1;
1429         if (node->left != NULL)
1430           left = node->left->first->node_idx;
1431         else
1432           left = node->next->node_idx;
1433         if (node->right != NULL)
1434           right = node->right->first->node_idx;
1435         else
1436           right = node->next->node_idx;
1437         assert (left > -1);
1438         assert (right > -1);
1439         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440       }
1441       break;
1442
1443     case ANCHOR:
1444     case OP_OPEN_SUBEXP:
1445     case OP_CLOSE_SUBEXP:
1446       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447       break;
1448
1449     case OP_BACK_REF:
1450       dfa->nexts[idx] = node->next->node_idx;
1451       if (node->token.type == OP_BACK_REF)
1452         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453       break;
1454
1455     default:
1456       assert (!IS_EPSILON_NODE (node->token.type));
1457       dfa->nexts[idx] = node->next->node_idx;
1458       break;
1459     }
1460
1461   return err;
1462 }
1463
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466    to their own constraint.  */
1467
1468 static reg_errcode_t
1469 internal_function
1470 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1471                         int root_node, unsigned int init_constraint)
1472 {
1473   int org_node, clone_node, ret;
1474   unsigned int constraint = init_constraint;
1475   for (org_node = top_org_node, clone_node = top_clone_node;;)
1476     {
1477       int org_dest, clone_dest;
1478       if (dfa->nodes[org_node].type == OP_BACK_REF)
1479         {
1480           /* If the back reference epsilon-transit, its destination must
1481              also have the constraint.  Then duplicate the epsilon closure
1482              of the destination of the back reference, and store it in
1483              edests of the back reference.  */
1484           org_dest = dfa->nexts[org_node];
1485           re_node_set_empty (dfa->edests + clone_node);
1486           clone_dest = duplicate_node (dfa, org_dest, constraint);
1487           if (BE (clone_dest == -1, 0))
1488             return REG_ESPACE;
1489           dfa->nexts[clone_node] = dfa->nexts[org_node];
1490           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491           if (BE (ret < 0, 0))
1492             return REG_ESPACE;
1493         }
1494       else if (dfa->edests[org_node].nelem == 0)
1495         {
1496           /* In case of the node can't epsilon-transit, don't duplicate the
1497              destination and store the original destination as the
1498              destination of the node.  */
1499           dfa->nexts[clone_node] = dfa->nexts[org_node];
1500           break;
1501         }
1502       else if (dfa->edests[org_node].nelem == 1)
1503         {
1504           /* In case of the node can epsilon-transit, and it has only one
1505              destination.  */
1506           org_dest = dfa->edests[org_node].elems[0];
1507           re_node_set_empty (dfa->edests + clone_node);
1508           /* If the node is root_node itself, it means the epsilon clsoure
1509              has a loop.   Then tie it to the destination of the root_node.  */
1510           if (org_node == root_node && clone_node != org_node)
1511             {
1512               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513               if (BE (ret < 0, 0))
1514                 return REG_ESPACE;
1515               break;
1516             }
1517           /* In case of the node has another constraint, add it.  */
1518           constraint |= dfa->nodes[org_node].constraint;
1519           clone_dest = duplicate_node (dfa, org_dest, constraint);
1520           if (BE (clone_dest == -1, 0))
1521             return REG_ESPACE;
1522           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523           if (BE (ret < 0, 0))
1524             return REG_ESPACE;
1525         }
1526       else /* dfa->edests[org_node].nelem == 2 */
1527         {
1528           /* In case of the node can epsilon-transit, and it has two
1529              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1530           org_dest = dfa->edests[org_node].elems[0];
1531           re_node_set_empty (dfa->edests + clone_node);
1532           /* Search for a duplicated node which satisfies the constraint.  */
1533           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534           if (clone_dest == -1)
1535             {
1536               /* There is no such duplicated node, create a new one.  */
1537               reg_errcode_t err;
1538               clone_dest = duplicate_node (dfa, org_dest, constraint);
1539               if (BE (clone_dest == -1, 0))
1540                 return REG_ESPACE;
1541               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542               if (BE (ret < 0, 0))
1543                 return REG_ESPACE;
1544               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545                                             root_node, constraint);
1546               if (BE (err != REG_NOERROR, 0))
1547                 return err;
1548             }
1549           else
1550             {
1551               /* There is a duplicated node which satisfies the constraint,
1552                  use it to avoid infinite loop.  */
1553               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554               if (BE (ret < 0, 0))
1555                 return REG_ESPACE;
1556             }
1557
1558           org_dest = dfa->edests[org_node].elems[1];
1559           clone_dest = duplicate_node (dfa, org_dest, constraint);
1560           if (BE (clone_dest == -1, 0))
1561             return REG_ESPACE;
1562           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563           if (BE (ret < 0, 0))
1564             return REG_ESPACE;
1565         }
1566       org_node = org_dest;
1567       clone_node = clone_dest;
1568     }
1569   return REG_NOERROR;
1570 }
1571
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573    satisfies the constraint CONSTRAINT.  */
1574
1575 static int
1576 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1577                         unsigned int constraint)
1578 {
1579   int idx;
1580   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581     {
1582       if (org_node == dfa->org_indices[idx]
1583           && constraint == dfa->nodes[idx].constraint)
1584         return idx; /* Found.  */
1585     }
1586   return -1; /* Not found.  */
1587 }
1588
1589 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590    Return the index of the new node, or -1 if insufficient storage is
1591    available.  */
1592
1593 static int
1594 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1595 {
1596   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597   if (BE (dup_idx != -1, 1))
1598     {
1599       dfa->nodes[dup_idx].constraint = constraint;
1600       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601       dfa->nodes[dup_idx].duplicated = 1;
1602
1603       /* Store the index of the original node.  */
1604       dfa->org_indices[dup_idx] = org_idx;
1605     }
1606   return dup_idx;
1607 }
1608
1609 static reg_errcode_t
1610 calc_inveclosure (re_dfa_t *dfa)
1611 {
1612   int src, idx, ret;
1613   for (idx = 0; idx < dfa->nodes_len; ++idx)
1614     re_node_set_init_empty (dfa->inveclosures + idx);
1615
1616   for (src = 0; src < dfa->nodes_len; ++src)
1617     {
1618       int *elems = dfa->eclosures[src].elems;
1619       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1620         {
1621           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622           if (BE (ret == -1, 0))
1623             return REG_ESPACE;
1624         }
1625     }
1626
1627   return REG_NOERROR;
1628 }
1629
1630 /* Calculate "eclosure" for all the node in DFA.  */
1631
1632 static reg_errcode_t
1633 calc_eclosure (re_dfa_t *dfa)
1634 {
1635   int node_idx, incomplete;
1636 #ifdef DEBUG
1637   assert (dfa->nodes_len > 0);
1638 #endif
1639   incomplete = 0;
1640   /* For each nodes, calculate epsilon closure.  */
1641   for (node_idx = 0; ; ++node_idx)
1642     {
1643       reg_errcode_t err;
1644       re_node_set eclosure_elem;
1645       if (node_idx == dfa->nodes_len)
1646         {
1647           if (!incomplete)
1648             break;
1649           incomplete = 0;
1650           node_idx = 0;
1651         }
1652
1653 #ifdef DEBUG
1654       assert (dfa->eclosures[node_idx].nelem != -1);
1655 #endif
1656
1657       /* If we have already calculated, skip it.  */
1658       if (dfa->eclosures[node_idx].nelem != 0)
1659         continue;
1660       /* Calculate epsilon closure of `node_idx'.  */
1661       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1662       if (BE (err != REG_NOERROR, 0))
1663         return err;
1664
1665       if (dfa->eclosures[node_idx].nelem == 0)
1666         {
1667           incomplete = 1;
1668           re_node_set_free (&eclosure_elem);
1669         }
1670     }
1671   return REG_NOERROR;
1672 }
1673
1674 /* Calculate epsilon closure of NODE.  */
1675
1676 static reg_errcode_t
1677 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1678 {
1679   reg_errcode_t err;
1680   int i;
1681   re_node_set eclosure;
1682   int ret;
1683   int incomplete = 0;
1684   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1685   if (BE (err != REG_NOERROR, 0))
1686     return err;
1687
1688   /* This indicates that we are calculating this node now.
1689      We reference this value to avoid infinite loop.  */
1690   dfa->eclosures[node].nelem = -1;
1691
1692   /* If the current node has constraints, duplicate all nodes
1693      since they must inherit the constraints.  */
1694   if (dfa->nodes[node].constraint
1695       && dfa->edests[node].nelem
1696       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1697     {
1698       err = duplicate_node_closure (dfa, node, node, node,
1699                                     dfa->nodes[node].constraint);
1700       if (BE (err != REG_NOERROR, 0))
1701         return err;
1702     }
1703
1704   /* Expand each epsilon destination nodes.  */
1705   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706     for (i = 0; i < dfa->edests[node].nelem; ++i)
1707       {
1708         re_node_set eclosure_elem;
1709         int edest = dfa->edests[node].elems[i];
1710         /* If calculating the epsilon closure of `edest' is in progress,
1711            return intermediate result.  */
1712         if (dfa->eclosures[edest].nelem == -1)
1713           {
1714             incomplete = 1;
1715             continue;
1716           }
1717         /* If we haven't calculated the epsilon closure of `edest' yet,
1718            calculate now. Otherwise use calculated epsilon closure.  */
1719         if (dfa->eclosures[edest].nelem == 0)
1720           {
1721             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1722             if (BE (err != REG_NOERROR, 0))
1723               return err;
1724           }
1725         else
1726           eclosure_elem = dfa->eclosures[edest];
1727         /* Merge the epsilon closure of `edest'.  */
1728         err = re_node_set_merge (&eclosure, &eclosure_elem);
1729         if (BE (err != REG_NOERROR, 0))
1730           return err;
1731         /* If the epsilon closure of `edest' is incomplete,
1732            the epsilon closure of this node is also incomplete.  */
1733         if (dfa->eclosures[edest].nelem == 0)
1734           {
1735             incomplete = 1;
1736             re_node_set_free (&eclosure_elem);
1737           }
1738       }
1739
1740   /* An epsilon closure includes itself.  */
1741   ret = re_node_set_insert (&eclosure, node);
1742   if (BE (ret < 0, 0))
1743     return REG_ESPACE;
1744   if (incomplete && !root)
1745     dfa->eclosures[node].nelem = 0;
1746   else
1747     dfa->eclosures[node] = eclosure;
1748   *new_set = eclosure;
1749   return REG_NOERROR;
1750 }
1751 \f
1752 /* Functions for token which are used in the parser.  */
1753
1754 /* Fetch a token from INPUT.
1755    We must not use this function inside bracket expressions.  */
1756
1757 static void
1758 internal_function
1759 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1760 {
1761   re_string_skip_bytes (input, peek_token (result, input, syntax));
1762 }
1763
1764 /* Peek a token from INPUT, and return the length of the token.
1765    We must not use this function inside bracket expressions.  */
1766
1767 static int
1768 internal_function
1769 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1770 {
1771   unsigned char c;
1772
1773   if (re_string_eoi (input))
1774     {
1775       token->type = END_OF_RE;
1776       return 0;
1777     }
1778
1779   c = re_string_peek_byte (input, 0);
1780   token->opr.c = c;
1781
1782   token->word_char = 0;
1783 #ifdef RE_ENABLE_I18N
1784   token->mb_partial = 0;
1785   if (input->mb_cur_max > 1 &&
1786       !re_string_first_byte (input, re_string_cur_idx (input)))
1787     {
1788       token->type = CHARACTER;
1789       token->mb_partial = 1;
1790       return 1;
1791     }
1792 #endif
1793   if (c == '\\')
1794     {
1795       unsigned char c2;
1796       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1797         {
1798           token->type = BACK_SLASH;
1799           return 1;
1800         }
1801
1802       c2 = re_string_peek_byte_case (input, 1);
1803       token->opr.c = c2;
1804       token->type = CHARACTER;
1805 #ifdef RE_ENABLE_I18N
1806       if (input->mb_cur_max > 1)
1807         {
1808           wint_t wc = re_string_wchar_at (input,
1809                                           re_string_cur_idx (input) + 1);
1810           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1811         }
1812       else
1813 #endif
1814         token->word_char = IS_WORD_CHAR (c2) != 0;
1815
1816       switch (c2)
1817         {
1818         case '|':
1819           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1820             token->type = OP_ALT;
1821           break;
1822         case '1': case '2': case '3': case '4': case '5':
1823         case '6': case '7': case '8': case '9':
1824           if (!(syntax & RE_NO_BK_REFS))
1825             {
1826               token->type = OP_BACK_REF;
1827               token->opr.idx = c2 - '1';
1828             }
1829           break;
1830         case '<':
1831           if (!(syntax & RE_NO_GNU_OPS))
1832             {
1833               token->type = ANCHOR;
1834               token->opr.ctx_type = WORD_FIRST;
1835             }
1836           break;
1837         case '>':
1838           if (!(syntax & RE_NO_GNU_OPS))
1839             {
1840               token->type = ANCHOR;
1841               token->opr.ctx_type = WORD_LAST;
1842             }
1843           break;
1844         case 'b':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = WORD_DELIM;
1849             }
1850           break;
1851         case 'B':
1852           if (!(syntax & RE_NO_GNU_OPS))
1853             {
1854               token->type = ANCHOR;
1855               token->opr.ctx_type = NOT_WORD_DELIM;
1856             }
1857           break;
1858         case 'w':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             token->type = OP_WORD;
1861           break;
1862         case 'W':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_NOTWORD;
1865           break;
1866         case 's':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             token->type = OP_SPACE;
1869           break;
1870         case 'S':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             token->type = OP_NOTSPACE;
1873           break;
1874         case '`':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             {
1877               token->type = ANCHOR;
1878               token->opr.ctx_type = BUF_FIRST;
1879             }
1880           break;
1881         case '\'':
1882           if (!(syntax & RE_NO_GNU_OPS))
1883             {
1884               token->type = ANCHOR;
1885               token->opr.ctx_type = BUF_LAST;
1886             }
1887           break;
1888         case '(':
1889           if (!(syntax & RE_NO_BK_PARENS))
1890             token->type = OP_OPEN_SUBEXP;
1891           break;
1892         case ')':
1893           if (!(syntax & RE_NO_BK_PARENS))
1894             token->type = OP_CLOSE_SUBEXP;
1895           break;
1896         case '+':
1897           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898             token->type = OP_DUP_PLUS;
1899           break;
1900         case '?':
1901           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902             token->type = OP_DUP_QUESTION;
1903           break;
1904         case '{':
1905           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906             token->type = OP_OPEN_DUP_NUM;
1907           break;
1908         case '}':
1909           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910             token->type = OP_CLOSE_DUP_NUM;
1911           break;
1912         default:
1913           break;
1914         }
1915       return 2;
1916     }
1917
1918   token->type = CHARACTER;
1919 #ifdef RE_ENABLE_I18N
1920   if (input->mb_cur_max > 1)
1921     {
1922       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1923       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1924     }
1925   else
1926 #endif
1927     token->word_char = IS_WORD_CHAR (token->opr.c);
1928
1929   switch (c)
1930     {
1931     case '\n':
1932       if (syntax & RE_NEWLINE_ALT)
1933         token->type = OP_ALT;
1934       break;
1935     case '|':
1936       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1937         token->type = OP_ALT;
1938       break;
1939     case '*':
1940       token->type = OP_DUP_ASTERISK;
1941       break;
1942     case '+':
1943       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944         token->type = OP_DUP_PLUS;
1945       break;
1946     case '?':
1947       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948         token->type = OP_DUP_QUESTION;
1949       break;
1950     case '{':
1951       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952         token->type = OP_OPEN_DUP_NUM;
1953       break;
1954     case '}':
1955       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956         token->type = OP_CLOSE_DUP_NUM;
1957       break;
1958     case '(':
1959       if (syntax & RE_NO_BK_PARENS)
1960         token->type = OP_OPEN_SUBEXP;
1961       break;
1962     case ')':
1963       if (syntax & RE_NO_BK_PARENS)
1964         token->type = OP_CLOSE_SUBEXP;
1965       break;
1966     case '[':
1967       token->type = OP_OPEN_BRACKET;
1968       break;
1969     case '.':
1970       token->type = OP_PERIOD;
1971       break;
1972     case '^':
1973       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1974           re_string_cur_idx (input) != 0)
1975         {
1976           char prev = re_string_peek_byte (input, -1);
1977           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1978             break;
1979         }
1980       token->type = ANCHOR;
1981       token->opr.ctx_type = LINE_FIRST;
1982       break;
1983     case '$':
1984       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1985           re_string_cur_idx (input) + 1 != re_string_length (input))
1986         {
1987           re_token_t next;
1988           re_string_skip_bytes (input, 1);
1989           peek_token (&next, input, syntax);
1990           re_string_skip_bytes (input, -1);
1991           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1992             break;
1993         }
1994       token->type = ANCHOR;
1995       token->opr.ctx_type = LINE_LAST;
1996       break;
1997     default:
1998       break;
1999     }
2000   return 1;
2001 }
2002
2003 /* Peek a token from INPUT, and return the length of the token.
2004    We must not use this function out of bracket expressions.  */
2005
2006 static int
2007 internal_function
2008 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2009 {
2010   unsigned char c;
2011   if (re_string_eoi (input))
2012     {
2013       token->type = END_OF_RE;
2014       return 0;
2015     }
2016   c = re_string_peek_byte (input, 0);
2017   token->opr.c = c;
2018
2019 #ifdef RE_ENABLE_I18N
2020   if (input->mb_cur_max > 1 &&
2021       !re_string_first_byte (input, re_string_cur_idx (input)))
2022     {
2023       token->type = CHARACTER;
2024       return 1;
2025     }
2026 #endif /* RE_ENABLE_I18N */
2027
2028   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2029       && re_string_cur_idx (input) + 1 < re_string_length (input))
2030     {
2031       /* In this case, '\' escape a character.  */
2032       unsigned char c2;
2033       re_string_skip_bytes (input, 1);
2034       c2 = re_string_peek_byte (input, 0);
2035       token->opr.c = c2;
2036       token->type = CHARACTER;
2037       return 1;
2038     }
2039   if (c == '[') /* '[' is a special char in a bracket exps.  */
2040     {
2041       unsigned char c2;
2042       int token_len;
2043       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2044         c2 = re_string_peek_byte (input, 1);
2045       else
2046         c2 = 0;
2047       token->opr.c = c2;
2048       token_len = 2;
2049       switch (c2)
2050         {
2051         case '.':
2052           token->type = OP_OPEN_COLL_ELEM;
2053           break;
2054         case '=':
2055           token->type = OP_OPEN_EQUIV_CLASS;
2056           break;
2057         case ':':
2058           if (syntax & RE_CHAR_CLASSES)
2059             {
2060               token->type = OP_OPEN_CHAR_CLASS;
2061               break;
2062             }
2063           /* else fall through.  */
2064         default:
2065           token->type = CHARACTER;
2066           token->opr.c = c;
2067           token_len = 1;
2068           break;
2069         }
2070       return token_len;
2071     }
2072   switch (c)
2073     {
2074     case '-':
2075       token->type = OP_CHARSET_RANGE;
2076       break;
2077     case ']':
2078       token->type = OP_CLOSE_BRACKET;
2079       break;
2080     case '^':
2081       token->type = OP_NON_MATCH_LIST;
2082       break;
2083     default:
2084       token->type = CHARACTER;
2085     }
2086   return 1;
2087 }
2088 \f
2089 /* Functions for parser.  */
2090
2091 /* Entry point of the parser.
2092    Parse the regular expression REGEXP and return the structure tree.
2093    If an error is occured, ERR is set by error code, and return NULL.
2094    This function build the following tree, from regular expression <reg_exp>:
2095            CAT
2096            / \
2097           /   \
2098    <reg_exp>  EOR
2099
2100    CAT means concatenation.
2101    EOR means end of regular expression.  */
2102
2103 static bin_tree_t *
2104 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2105        reg_errcode_t *err)
2106 {
2107   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2108   bin_tree_t *tree, *eor, *root;
2109   re_token_t current_token;
2110   dfa->syntax = syntax;
2111   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2112   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2113   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114     return NULL;
2115   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116   if (tree != NULL)
2117     root = create_tree (dfa, tree, eor, CONCAT);
2118   else
2119     root = eor;
2120   if (BE (eor == NULL || root == NULL, 0))
2121     {
2122       *err = REG_ESPACE;
2123       return NULL;
2124     }
2125   return root;
2126 }
2127
2128 /* This function build the following tree, from regular expression
2129    <branch1>|<branch2>:
2130            ALT
2131            / \
2132           /   \
2133    <branch1> <branch2>
2134
2135    ALT means alternative, which represents the operator `|'.  */
2136
2137 static bin_tree_t *
2138 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2140 {
2141   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2142   bin_tree_t *tree, *branch = NULL;
2143   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2144   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145     return NULL;
2146
2147   while (token->type == OP_ALT)
2148     {
2149       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2150       if (token->type != OP_ALT && token->type != END_OF_RE
2151           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152         {
2153           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2154           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155             return NULL;
2156         }
2157       else
2158         branch = NULL;
2159       tree = create_tree (dfa, tree, branch, OP_ALT);
2160       if (BE (tree == NULL, 0))
2161         {
2162           *err = REG_ESPACE;
2163           return NULL;
2164         }
2165     }
2166   return tree;
2167 }
2168
2169 /* This function build the following tree, from regular expression
2170    <exp1><exp2>:
2171         CAT
2172         / \
2173        /   \
2174    <exp1> <exp2>
2175
2176    CAT means concatenation.  */
2177
2178 static bin_tree_t *
2179 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2181 {
2182   bin_tree_t *tree, *exp;
2183   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2185   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186     return NULL;
2187
2188   while (token->type != OP_ALT && token->type != END_OF_RE
2189          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190     {
2191       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2192       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2193         {
2194           if (tree != NULL)
2195             postorder (tree, free_tree, NULL);
2196           return NULL;
2197         }
2198       if (tree != NULL && exp != NULL)
2199         {
2200           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2201           if (newtree == NULL)
2202             {
2203               postorder (exp, free_tree, NULL);
2204               postorder (tree, free_tree, NULL);
2205               *err = REG_ESPACE;
2206               return NULL;
2207             }
2208           tree = newtree;
2209         }
2210       else if (tree == NULL)
2211         tree = exp;
2212       /* Otherwise exp == NULL, we don't need to create new tree.  */
2213     }
2214   return tree;
2215 }
2216
2217 /* This function build the following tree, from regular expression a*:
2218          *
2219          |
2220          a
2221 */
2222
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226 {
2227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228   bin_tree_t *tree;
2229   switch (token->type)
2230     {
2231     case CHARACTER:
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234         {
2235           *err = REG_ESPACE;
2236           return NULL;
2237         }
2238 #ifdef RE_ENABLE_I18N
2239       if (dfa->mb_cur_max > 1)
2240         {
2241           while (!re_string_eoi (regexp)
2242                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243             {
2244               bin_tree_t *mbc_remain;
2245               fetch_token (token, regexp, syntax);
2246               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248               if (BE (mbc_remain == NULL || tree == NULL, 0))
2249                 {
2250                   *err = REG_ESPACE;
2251                   return NULL;
2252                 }
2253             }
2254         }
2255 #endif
2256       break;
2257     case OP_OPEN_SUBEXP:
2258       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260         return NULL;
2261       break;
2262     case OP_OPEN_BRACKET:
2263       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265         return NULL;
2266       break;
2267     case OP_BACK_REF:
2268       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269         {
2270           *err = REG_ESUBREG;
2271           return NULL;
2272         }
2273       dfa->used_bkref_map |= 1 << token->opr.idx;
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276         {
2277           *err = REG_ESPACE;
2278           return NULL;
2279         }
2280       ++dfa->nbackref;
2281       dfa->has_mb_node = 1;
2282       break;
2283     case OP_OPEN_DUP_NUM:
2284       if (syntax & RE_CONTEXT_INVALID_DUP)
2285         {
2286           *err = REG_BADRPT;
2287           return NULL;
2288         }
2289       /* FALLTHROUGH */
2290     case OP_DUP_ASTERISK:
2291     case OP_DUP_PLUS:
2292     case OP_DUP_QUESTION:
2293       if (syntax & RE_CONTEXT_INVALID_OPS)
2294         {
2295           *err = REG_BADRPT;
2296           return NULL;
2297         }
2298       else if (syntax & RE_CONTEXT_INDEP_OPS)
2299         {
2300           fetch_token (token, regexp, syntax);
2301           return parse_expression (regexp, preg, token, syntax, nest, err);
2302         }
2303       /* else fall through  */
2304     case OP_CLOSE_SUBEXP:
2305       if ((token->type == OP_CLOSE_SUBEXP) &&
2306           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307         {
2308           *err = REG_ERPAREN;
2309           return NULL;
2310         }
2311       /* else fall through  */
2312     case OP_CLOSE_DUP_NUM:
2313       /* We treat it as a normal character.  */
2314
2315       /* Then we can these characters as normal characters.  */
2316       token->type = CHARACTER;
2317       /* mb_partial and word_char bits should be initialized already
2318          by peek_token.  */
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321         {
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325       break;
2326     case ANCHOR:
2327       if ((token->opr.ctx_type
2328            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329           && dfa->word_ops_used == 0)
2330         init_word_char (dfa);
2331       if (token->opr.ctx_type == WORD_DELIM
2332           || token->opr.ctx_type == NOT_WORD_DELIM)
2333         {
2334           bin_tree_t *tree_first, *tree_last;
2335           if (token->opr.ctx_type == WORD_DELIM)
2336             {
2337               token->opr.ctx_type = WORD_FIRST;
2338               tree_first = create_token_tree (dfa, NULL, NULL, token);
2339               token->opr.ctx_type = WORD_LAST;
2340             }
2341           else
2342             {
2343               token->opr.ctx_type = INSIDE_WORD;
2344               tree_first = create_token_tree (dfa, NULL, NULL, token);
2345               token->opr.ctx_type = INSIDE_NOTWORD;
2346             }
2347           tree_last = create_token_tree (dfa, NULL, NULL, token);
2348           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350             {
2351               *err = REG_ESPACE;
2352               return NULL;
2353             }
2354         }
2355       else
2356         {
2357           tree = create_token_tree (dfa, NULL, NULL, token);
2358           if (BE (tree == NULL, 0))
2359             {
2360               *err = REG_ESPACE;
2361               return NULL;
2362             }
2363         }
2364       /* We must return here, since ANCHORs can't be followed
2365          by repetition operators.
2366          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368       fetch_token (token, regexp, syntax);
2369       return tree;
2370     case OP_PERIOD:
2371       tree = create_token_tree (dfa, NULL, NULL, token);
2372       if (BE (tree == NULL, 0))
2373         {
2374           *err = REG_ESPACE;
2375           return NULL;
2376         }
2377       if (dfa->mb_cur_max > 1)
2378         dfa->has_mb_node = 1;
2379       break;
2380     case OP_WORD:
2381     case OP_NOTWORD:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383                                  (const unsigned char *) "alnum",
2384                                  (const unsigned char *) "_",
2385                                  token->type == OP_NOTWORD, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387         return NULL;
2388       break;
2389     case OP_SPACE:
2390     case OP_NOTSPACE:
2391       tree = build_charclass_op (dfa, regexp->trans,
2392                                  (const unsigned char *) "space",
2393                                  (const unsigned char *) "",
2394                                  token->type == OP_NOTSPACE, err);
2395       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396         return NULL;
2397       break;
2398     case OP_ALT:
2399     case END_OF_RE:
2400       return NULL;
2401     case BACK_SLASH:
2402       *err = REG_EESCAPE;
2403       return NULL;
2404     default:
2405       /* Must not happen?  */
2406 #ifdef DEBUG
2407       assert (0);
2408 #endif
2409       return NULL;
2410     }
2411   fetch_token (token, regexp, syntax);
2412
2413   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415     {
2416       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418         return NULL;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421           && (token->type == OP_DUP_ASTERISK
2422               || token->type == OP_OPEN_DUP_NUM))
2423         {
2424           *err = REG_BADRPT;
2425           return NULL;
2426         }
2427     }
2428
2429   return tree;
2430 }
2431
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434          SUBEXP
2435             |
2436         <reg_exp>
2437 */
2438
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457         {
2458           if (tree != NULL)
2459             postorder (tree, free_tree, NULL);
2460           *err = REG_EPAREN;
2461         }
2462       if (BE (*err != REG_NOERROR, 0))
2463         return NULL;
2464     }
2465
2466   if (cur_nsub <= '9' - '1')
2467     dfa->completed_bkref_map |= 1 << cur_nsub;
2468
2469   tree = create_tree (dfa, tree, NULL, SUBEXP);
2470   if (BE (tree == NULL, 0))
2471     {
2472       *err = REG_ESPACE;
2473       return NULL;
2474     }
2475   tree->token.opr.idx = cur_nsub;
2476   return tree;
2477 }
2478
2479 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2480
2481 static bin_tree_t *
2482 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2484 {
2485   bin_tree_t *tree = NULL, *old_tree = NULL;
2486   int i, start, end, start_idx = re_string_cur_idx (regexp);
2487   re_token_t start_token = *token;
2488
2489   if (token->type == OP_OPEN_DUP_NUM)
2490     {
2491       end = 0;
2492       start = fetch_number (regexp, token, syntax);
2493       if (start == -1)
2494         {
2495           if (token->type == CHARACTER && token->opr.c == ',')
2496             start = 0; /* We treat "{,m}" as "{0,m}".  */
2497           else
2498             {
2499               *err = REG_BADBR; /* <re>{} is invalid.  */
2500               return NULL;
2501             }
2502         }
2503       if (BE (start != -2, 1))
2504         {
2505           /* We treat "{n}" as "{n,n}".  */
2506           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507                  : ((token->type == CHARACTER && token->opr.c == ',')
2508                     ? fetch_number (regexp, token, syntax) : -2));
2509         }
2510       if (BE (start == -2 || end == -2, 0))
2511         {
2512           /* Invalid sequence.  */
2513           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2514             {
2515               if (token->type == END_OF_RE)
2516                 *err = REG_EBRACE;
2517               else
2518                 *err = REG_BADBR;
2519
2520               return NULL;
2521             }
2522
2523           /* If the syntax bit is set, rollback.  */
2524           re_string_set_index (regexp, start_idx);
2525           *token = start_token;
2526           token->type = CHARACTER;
2527           /* mb_partial and word_char bits should be already initialized by
2528              peek_token.  */
2529           return elem;
2530         }
2531
2532       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2533         {
2534           /* First number greater than second.  */
2535           *err = REG_BADBR;
2536           return NULL;
2537         }
2538     }
2539   else
2540     {
2541       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2543     }
2544
2545   fetch_token (token, regexp, syntax);
2546
2547   if (BE (elem == NULL, 0))
2548     return NULL;
2549   if (BE (start == 0 && end == 0, 0))
2550     {
2551       postorder (elem, free_tree, NULL);
2552       return NULL;
2553     }
2554
2555   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2556   if (BE (start > 0, 0))
2557     {
2558       tree = elem;
2559       for (i = 2; i <= start; ++i)
2560         {
2561           elem = duplicate_tree (elem, dfa);
2562           tree = create_tree (dfa, tree, elem, CONCAT);
2563           if (BE (elem == NULL || tree == NULL, 0))
2564             goto parse_dup_op_espace;
2565         }
2566
2567       if (start == end)
2568         return tree;
2569
2570       /* Duplicate ELEM before it is marked optional.  */
2571       elem = duplicate_tree (elem, dfa);
2572       old_tree = tree;
2573     }
2574   else
2575     old_tree = NULL;
2576
2577   if (elem->token.type == SUBEXP)
2578     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2579
2580   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2581   if (BE (tree == NULL, 0))
2582     goto parse_dup_op_espace;
2583
2584   /* This loop is actually executed only when end != -1,
2585      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2586      already created the start+1-th copy.  */
2587   for (i = start + 2; i <= end; ++i)
2588     {
2589       elem = duplicate_tree (elem, dfa);
2590       tree = create_tree (dfa, tree, elem, CONCAT);
2591       if (BE (elem == NULL || tree == NULL, 0))
2592         goto parse_dup_op_espace;
2593
2594       tree = create_tree (dfa, tree, NULL, OP_ALT);
2595       if (BE (tree == NULL, 0))
2596         goto parse_dup_op_espace;
2597     }
2598
2599   if (old_tree)
2600     tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602   return tree;
2603
2604  parse_dup_op_espace:
2605   *err = REG_ESPACE;
2606   return NULL;
2607 }
2608
2609 /* Size of the names for collating symbol/equivalence_class/character_class.
2610    I'm not sure, but maybe enough.  */
2611 #define BRACKET_NAME_BUF_SIZE 32
2612
2613 #ifndef _LIBC
2614   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615      Build the range expression which starts from START_ELEM, and ends
2616      at END_ELEM.  The result are written to MBCSET and SBCSET.
2617      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618      mbcset->range_ends, is a pointer argument sinse we may
2619      update it.  */
2620
2621 static reg_errcode_t
2622 internal_function
2623 # ifdef RE_ENABLE_I18N
2624 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2625                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626 # else /* not RE_ENABLE_I18N */
2627 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628                  bracket_elem_t *end_elem)
2629 # endif /* not RE_ENABLE_I18N */
2630 {
2631   unsigned int start_ch, end_ch;
2632   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2633   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2635           0))
2636     return REG_ERANGE;
2637
2638   /* We can handle no multi character collating elements without libc
2639      support.  */
2640   if (BE ((start_elem->type == COLL_SYM
2641            && strlen ((char *) start_elem->opr.name) > 1)
2642           || (end_elem->type == COLL_SYM
2643               && strlen ((char *) end_elem->opr.name) > 1), 0))
2644     return REG_ECOLLATE;
2645
2646 # ifdef RE_ENABLE_I18N
2647   {
2648     wchar_t wc;
2649     wint_t start_wc;
2650     wint_t end_wc;
2651     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652
2653     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655                    : 0));
2656     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658                  : 0));
2659     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660                 ? __btowc (start_ch) : start_elem->opr.wch);
2661     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662               ? __btowc (end_ch) : end_elem->opr.wch);
2663     if (start_wc == WEOF || end_wc == WEOF)
2664       return REG_ECOLLATE;
2665     cmp_buf[0] = start_wc;
2666     cmp_buf[4] = end_wc;
2667     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668       return REG_ERANGE;
2669
2670     /* Got valid collation sequence values, add them as a new entry.
2671        However, for !_LIBC we have no collation elements: if the
2672        character set is single byte, the single byte character set
2673        that we build below suffices.  parse_bracket_exp passes
2674        no MBCSET if dfa->mb_cur_max == 1.  */
2675     if (mbcset)
2676       {
2677         /* Check the space of the arrays.  */
2678         if (BE (*range_alloc == mbcset->nranges, 0))
2679           {
2680             /* There is not enough space, need realloc.  */
2681             wchar_t *new_array_start, *new_array_end;
2682             int new_nranges;
2683
2684             /* +1 in case of mbcset->nranges is 0.  */
2685             new_nranges = 2 * mbcset->nranges + 1;
2686             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687                are NULL if *range_alloc == 0.  */
2688             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689                                           new_nranges);
2690             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691                                         new_nranges);
2692
2693             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694               return REG_ESPACE;
2695
2696             mbcset->range_starts = new_array_start;
2697             mbcset->range_ends = new_array_end;
2698             *range_alloc = new_nranges;
2699           }
2700
2701         mbcset->range_starts[mbcset->nranges] = start_wc;
2702         mbcset->range_ends[mbcset->nranges++] = end_wc;
2703       }
2704
2705     /* Build the table for single byte characters.  */
2706     for (wc = 0; wc < SBC_MAX; ++wc)
2707       {
2708         cmp_buf[2] = wc;
2709         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711           bitset_set (sbcset, wc);
2712       }
2713   }
2714 # else /* not RE_ENABLE_I18N */
2715   {
2716     unsigned int ch;
2717     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719                    : 0));
2720     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722                  : 0));
2723     if (start_ch > end_ch)
2724       return REG_ERANGE;
2725     /* Build the table for single byte characters.  */
2726     for (ch = 0; ch < SBC_MAX; ++ch)
2727       if (start_ch <= ch  && ch <= end_ch)
2728         bitset_set (sbcset, ch);
2729   }
2730 # endif /* not RE_ENABLE_I18N */
2731   return REG_NOERROR;
2732 }
2733 #endif /* not _LIBC */
2734
2735 #ifndef _LIBC
2736 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737    Build the collating element which is represented by NAME.
2738    The result are written to MBCSET and SBCSET.
2739    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740    pointer argument since we may update it.  */
2741
2742 static reg_errcode_t
2743 internal_function
2744 # ifdef RE_ENABLE_I18N
2745 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746                         int *coll_sym_alloc, const unsigned char *name)
2747 # else /* not RE_ENABLE_I18N */
2748 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749 # endif /* not RE_ENABLE_I18N */
2750 {
2751   size_t name_len = strlen ((const char *) name);
2752   if (BE (name_len != 1, 0))
2753     return REG_ECOLLATE;
2754   else
2755     {
2756       bitset_set (sbcset, name[0]);
2757       return REG_NOERROR;
2758     }
2759 }
2760 #endif /* not _LIBC */
2761
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2763    "[[.a-a.]]" etc.  */
2764
2765 static bin_tree_t *
2766 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767                    reg_syntax_t syntax, reg_errcode_t *err)
2768 {
2769 #ifdef _LIBC
2770   const unsigned char *collseqmb;
2771   const char *collseqwc;
2772   uint32_t nrules;
2773   int32_t table_size;
2774   const int32_t *symb_table;
2775   const unsigned char *extra;
2776
2777   /* Local function for parse_bracket_exp used in _LIBC environement.
2778      Seek the collating symbol entry correspondings to NAME.
2779      Return the index of the symbol in the SYMB_TABLE,
2780      or -1 if not found.  */
2781
2782   auto inline int32_t
2783   __attribute ((always_inline))
2784   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2785     {
2786       int32_t elem;
2787
2788       for (elem = 0; elem < table_size; elem++)
2789         if (symb_table[2 * elem] != 0)
2790           {
2791             int32_t idx = symb_table[2 * elem + 1];
2792             /* Skip the name of collating element name.  */
2793             idx += 1 + extra[idx];
2794             if (/* Compare the length of the name.  */
2795                 name_len == extra[idx]
2796                 /* Compare the name.  */
2797                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2798               /* Yep, this is the entry.  */
2799               return elem;
2800           }
2801       return -1;
2802     }
2803
2804   /* Local function for parse_bracket_exp used in _LIBC environment.
2805      Look up the collation sequence value of BR_ELEM.
2806      Return the value if succeeded, UINT_MAX otherwise.  */
2807
2808   auto inline unsigned int
2809   __attribute ((always_inline))
2810   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2811     {
2812       if (br_elem->type == SB_CHAR)
2813         {
2814           /*
2815           if (MB_CUR_MAX == 1)
2816           */
2817           if (nrules == 0)
2818             return collseqmb[br_elem->opr.ch];
2819           else
2820             {
2821               wint_t wc = __btowc (br_elem->opr.ch);
2822               return __collseq_table_lookup (collseqwc, wc);
2823             }
2824         }
2825       else if (br_elem->type == MB_CHAR)
2826         {
2827           if (nrules != 0)
2828             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829         }
2830       else if (br_elem->type == COLL_SYM)
2831         {
2832           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833           if (nrules != 0)
2834             {
2835               int32_t elem, idx;
2836               elem = seek_collating_symbol_entry (br_elem->opr.name,
2837                                                   sym_name_len);
2838               if (elem != -1)
2839                 {
2840                   /* We found the entry.  */
2841                   idx = symb_table[2 * elem + 1];
2842                   /* Skip the name of collating element name.  */
2843                   idx += 1 + extra[idx];
2844                   /* Skip the byte sequence of the collating element.  */
2845                   idx += 1 + extra[idx];
2846                   /* Adjust for the alignment.  */
2847                   idx = (idx + 3) & ~3;
2848                   /* Skip the multibyte collation sequence value.  */
2849                   idx += sizeof (unsigned int);
2850                   /* Skip the wide char sequence of the collating element.  */
2851                   idx += sizeof (unsigned int) *
2852                     (1 + *(unsigned int *) (extra + idx));
2853                   /* Return the collation sequence value.  */
2854                   return *(unsigned int *) (extra + idx);
2855                 }
2856               else if (sym_name_len == 1)
2857                 {
2858                   /* No valid character.  Match it as a single byte
2859                      character.  */
2860                   return collseqmb[br_elem->opr.name[0]];
2861                 }
2862             }
2863           else if (sym_name_len == 1)
2864             return collseqmb[br_elem->opr.name[0]];
2865         }
2866       return UINT_MAX;
2867     }
2868
2869   /* Local function for parse_bracket_exp used in _LIBC environement.
2870      Build the range expression which starts from START_ELEM, and ends
2871      at END_ELEM.  The result are written to MBCSET and SBCSET.
2872      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873      mbcset->range_ends, is a pointer argument sinse we may
2874      update it.  */
2875
2876   auto inline reg_errcode_t
2877   __attribute ((always_inline))
2878   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2879                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2880     {
2881       unsigned int ch;
2882       uint32_t start_collseq;
2883       uint32_t end_collseq;
2884
2885       /* Equivalence Classes and Character Classes can't be a range
2886          start/end.  */
2887       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2889               0))
2890         return REG_ERANGE;
2891
2892       start_collseq = lookup_collation_sequence_value (start_elem);
2893       end_collseq = lookup_collation_sequence_value (end_elem);
2894       /* Check start/end collation sequence values.  */
2895       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896         return REG_ECOLLATE;
2897       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898         return REG_ERANGE;
2899
2900       /* Got valid collation sequence values, add them as a new entry.
2901          However, if we have no collation elements, and the character set
2902          is single byte, the single byte character set that we
2903          build below suffices. */
2904       if (nrules > 0 || dfa->mb_cur_max > 1)
2905         {
2906           /* Check the space of the arrays.  */
2907           if (BE (*range_alloc == mbcset->nranges, 0))
2908             {
2909               /* There is not enough space, need realloc.  */
2910               uint32_t *new_array_start;
2911               uint32_t *new_array_end;
2912               int new_nranges;
2913
2914               /* +1 in case of mbcset->nranges is 0.  */
2915               new_nranges = 2 * mbcset->nranges + 1;
2916               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917                                             new_nranges);
2918               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919                                           new_nranges);
2920
2921               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922                 return REG_ESPACE;
2923
2924               mbcset->range_starts = new_array_start;
2925               mbcset->range_ends = new_array_end;
2926               *range_alloc = new_nranges;
2927             }
2928
2929           mbcset->range_starts[mbcset->nranges] = start_collseq;
2930           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2931         }
2932
2933       /* Build the table for single byte characters.  */
2934       for (ch = 0; ch < SBC_MAX; ch++)
2935         {
2936           uint32_t ch_collseq;
2937           /*
2938           if (MB_CUR_MAX == 1)
2939           */
2940           if (nrules == 0)
2941             ch_collseq = collseqmb[ch];
2942           else
2943             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945             bitset_set (sbcset, ch);
2946         }
2947       return REG_NOERROR;
2948     }
2949
2950   /* Local function for parse_bracket_exp used in _LIBC environement.
2951      Build the collating element which is represented by NAME.
2952      The result are written to MBCSET and SBCSET.
2953      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954      pointer argument sinse we may update it.  */
2955
2956   auto inline reg_errcode_t
2957   __attribute ((always_inline))
2958   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2959                           int *coll_sym_alloc, const unsigned char *name)
2960     {
2961       int32_t elem, idx;
2962       size_t name_len = strlen ((const char *) name);
2963       if (nrules != 0)
2964         {
2965           elem = seek_collating_symbol_entry (name, name_len);
2966           if (elem != -1)
2967             {
2968               /* We found the entry.  */
2969               idx = symb_table[2 * elem + 1];
2970               /* Skip the name of collating element name.  */
2971               idx += 1 + extra[idx];
2972             }
2973           else if (name_len == 1)
2974             {
2975               /* No valid character, treat it as a normal
2976                  character.  */
2977               bitset_set (sbcset, name[0]);
2978               return REG_NOERROR;
2979             }
2980           else
2981             return REG_ECOLLATE;
2982
2983           /* Got valid collation sequence, add it as a new entry.  */
2984           /* Check the space of the arrays.  */
2985           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2986             {
2987               /* Not enough, realloc it.  */
2988               /* +1 in case of mbcset->ncoll_syms is 0.  */
2989               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2990               /* Use realloc since mbcset->coll_syms is NULL
2991                  if *alloc == 0.  */
2992               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2993                                                    new_coll_sym_alloc);
2994               if (BE (new_coll_syms == NULL, 0))
2995                 return REG_ESPACE;
2996               mbcset->coll_syms = new_coll_syms;
2997               *coll_sym_alloc = new_coll_sym_alloc;
2998             }
2999           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3000           return REG_NOERROR;
3001         }
3002       else
3003         {
3004           if (BE (name_len != 1, 0))
3005             return REG_ECOLLATE;
3006           else
3007             {
3008               bitset_set (sbcset, name[0]);
3009               return REG_NOERROR;
3010             }
3011         }
3012     }
3013 #endif
3014
3015   re_token_t br_token;
3016   re_bitset_ptr_t sbcset;
3017 #ifdef RE_ENABLE_I18N
3018   re_charset_t *mbcset;
3019   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3020   int equiv_class_alloc = 0, char_class_alloc = 0;
3021 #endif /* not RE_ENABLE_I18N */
3022   int non_match = 0;
3023   bin_tree_t *work_tree;
3024   int token_len;
3025   int first_round = 1;
3026 #ifdef _LIBC
3027   collseqmb = (const unsigned char *)
3028     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3029   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3030   if (nrules)
3031     {
3032       /*
3033       if (MB_CUR_MAX > 1)
3034       */
3035       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3036       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3037       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3038                                                   _NL_COLLATE_SYMB_TABLEMB);
3039       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3040                                                    _NL_COLLATE_SYMB_EXTRAMB);
3041     }
3042 #endif
3043   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3044 #ifdef RE_ENABLE_I18N
3045   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3046 #endif /* RE_ENABLE_I18N */
3047 #ifdef RE_ENABLE_I18N
3048   if (BE (sbcset == NULL || mbcset == NULL, 0))
3049 #else
3050   if (BE (sbcset == NULL, 0))
3051 #endif /* RE_ENABLE_I18N */
3052     {
3053       re_free (sbcset);
3054 #ifdef RE_ENABLE_I18N
3055       re_free (mbcset);
3056 #endif
3057       *err = REG_ESPACE;
3058       return NULL;
3059     }
3060
3061   token_len = peek_token_bracket (token, regexp, syntax);
3062   if (BE (token->type == END_OF_RE, 0))
3063     {
3064       *err = REG_BADPAT;
3065       goto parse_bracket_exp_free_return;
3066     }
3067   if (token->type == OP_NON_MATCH_LIST)
3068     {
3069 #ifdef RE_ENABLE_I18N
3070       mbcset->non_match = 1;
3071 #endif /* not RE_ENABLE_I18N */
3072       non_match = 1;
3073       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074         bitset_set (sbcset, '\n');
3075       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3076       token_len = peek_token_bracket (token, regexp, syntax);
3077       if (BE (token->type == END_OF_RE, 0))
3078         {
3079           *err = REG_BADPAT;
3080           goto parse_bracket_exp_free_return;
3081         }
3082     }
3083
3084   /* We treat the first ']' as a normal character.  */
3085   if (token->type == OP_CLOSE_BRACKET)
3086     token->type = CHARACTER;
3087
3088   while (1)
3089     {
3090       bracket_elem_t start_elem, end_elem;
3091       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093       reg_errcode_t ret;
3094       int token_len2 = 0, is_range_exp = 0;
3095       re_token_t token2;
3096
3097       start_elem.opr.name = start_name_buf;
3098       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3099                                    syntax, first_round);
3100       if (BE (ret != REG_NOERROR, 0))
3101         {
3102           *err = ret;
3103           goto parse_bracket_exp_free_return;
3104         }
3105       first_round = 0;
3106
3107       /* Get information about the next token.  We need it in any case.  */
3108       token_len = peek_token_bracket (token, regexp, syntax);
3109
3110       /* Do not check for ranges if we know they are not allowed.  */
3111       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3112         {
3113           if (BE (token->type == END_OF_RE, 0))
3114             {
3115               *err = REG_EBRACK;
3116               goto parse_bracket_exp_free_return;
3117             }
3118           if (token->type == OP_CHARSET_RANGE)
3119             {
3120               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3121               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3122               if (BE (token2.type == END_OF_RE, 0))
3123                 {
3124                   *err = REG_EBRACK;
3125                   goto parse_bracket_exp_free_return;
3126                 }
3127               if (token2.type == OP_CLOSE_BRACKET)
3128                 {
3129                   /* We treat the last '-' as a normal character.  */
3130                   re_string_skip_bytes (regexp, -token_len);
3131                   token->type = CHARACTER;
3132                 }
3133               else
3134                 is_range_exp = 1;
3135             }
3136         }
3137
3138       if (is_range_exp == 1)
3139         {
3140           end_elem.opr.name = end_name_buf;
3141           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3142                                        dfa, syntax, 1);
3143           if (BE (ret != REG_NOERROR, 0))
3144             {
3145               *err = ret;
3146               goto parse_bracket_exp_free_return;
3147             }
3148
3149           token_len = peek_token_bracket (token, regexp, syntax);
3150
3151 #ifdef _LIBC
3152           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3153                                   &start_elem, &end_elem);
3154 #else
3155 # ifdef RE_ENABLE_I18N
3156           *err = build_range_exp (sbcset,
3157                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3158                                   &range_alloc, &start_elem, &end_elem);
3159 # else
3160           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3161 # endif
3162 #endif /* RE_ENABLE_I18N */
3163           if (BE (*err != REG_NOERROR, 0))
3164             goto parse_bracket_exp_free_return;
3165         }
3166       else
3167         {
3168           switch (start_elem.type)
3169             {
3170             case SB_CHAR:
3171               bitset_set (sbcset, start_elem.opr.ch);
3172               break;
3173 #ifdef RE_ENABLE_I18N
3174             case MB_CHAR:
3175               /* Check whether the array has enough space.  */
3176               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3177                 {
3178                   wchar_t *new_mbchars;
3179                   /* Not enough, realloc it.  */
3180                   /* +1 in case of mbcset->nmbchars is 0.  */
3181                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3182                   /* Use realloc since array is NULL if *alloc == 0.  */
3183                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3184                                             mbchar_alloc);
3185                   if (BE (new_mbchars == NULL, 0))
3186                     goto parse_bracket_exp_espace;
3187                   mbcset->mbchars = new_mbchars;
3188                 }
3189               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3190               break;
3191 #endif /* RE_ENABLE_I18N */
3192             case EQUIV_CLASS:
3193               *err = build_equiv_class (sbcset,
3194 #ifdef RE_ENABLE_I18N
3195                                         mbcset, &equiv_class_alloc,
3196 #endif /* RE_ENABLE_I18N */
3197                                         start_elem.opr.name);
3198               if (BE (*err != REG_NOERROR, 0))
3199                 goto parse_bracket_exp_free_return;
3200               break;
3201             case COLL_SYM:
3202               *err = build_collating_symbol (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204                                              mbcset, &coll_sym_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206                                              start_elem.opr.name);
3207               if (BE (*err != REG_NOERROR, 0))
3208                 goto parse_bracket_exp_free_return;
3209               break;
3210             case CHAR_CLASS:
3211               *err = build_charclass (regexp->trans, sbcset,
3212 #ifdef RE_ENABLE_I18N
3213                                       mbcset, &char_class_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215                                       start_elem.opr.name, syntax);
3216               if (BE (*err != REG_NOERROR, 0))
3217                goto parse_bracket_exp_free_return;
3218               break;
3219             default:
3220               assert (0);
3221               break;
3222             }
3223         }
3224       if (BE (token->type == END_OF_RE, 0))
3225         {
3226           *err = REG_EBRACK;
3227           goto parse_bracket_exp_free_return;
3228         }
3229       if (token->type == OP_CLOSE_BRACKET)
3230         break;
3231     }
3232
3233   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3234
3235   /* If it is non-matching list.  */
3236   if (non_match)
3237     bitset_not (sbcset);
3238
3239 #ifdef RE_ENABLE_I18N
3240   /* Ensure only single byte characters are set.  */
3241   if (dfa->mb_cur_max > 1)
3242     bitset_mask (sbcset, dfa->sb_char);
3243
3244   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3245       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3246                                                      || mbcset->non_match)))
3247     {
3248       bin_tree_t *mbc_tree;
3249       int sbc_idx;
3250       /* Build a tree for complex bracket.  */
3251       dfa->has_mb_node = 1;
3252       br_token.type = COMPLEX_BRACKET;
3253       br_token.opr.mbcset = mbcset;
3254       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255       if (BE (mbc_tree == NULL, 0))
3256         goto parse_bracket_exp_espace;
3257       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3258         if (sbcset[sbc_idx])
3259           break;
3260       /* If there are no bits set in sbcset, there is no point
3261          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3262       if (sbc_idx < BITSET_WORDS)
3263         {
3264           /* Build a tree for simple bracket.  */
3265           br_token.type = SIMPLE_BRACKET;
3266           br_token.opr.sbcset = sbcset;
3267           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268           if (BE (work_tree == NULL, 0))
3269             goto parse_bracket_exp_espace;
3270
3271           /* Then join them by ALT node.  */
3272           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3273           if (BE (work_tree == NULL, 0))
3274             goto parse_bracket_exp_espace;
3275         }
3276       else
3277         {
3278           re_free (sbcset);
3279           work_tree = mbc_tree;
3280         }
3281     }
3282   else
3283 #endif /* not RE_ENABLE_I18N */
3284     {
3285 #ifdef RE_ENABLE_I18N
3286       free_charset (mbcset);
3287 #endif
3288       /* Build a tree for simple bracket.  */
3289       br_token.type = SIMPLE_BRACKET;
3290       br_token.opr.sbcset = sbcset;
3291       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292       if (BE (work_tree == NULL, 0))
3293         goto parse_bracket_exp_espace;
3294     }
3295   return work_tree;
3296
3297  parse_bracket_exp_espace:
3298   *err = REG_ESPACE;
3299  parse_bracket_exp_free_return:
3300   re_free (sbcset);
3301 #ifdef RE_ENABLE_I18N
3302   free_charset (mbcset);
3303 #endif /* RE_ENABLE_I18N */
3304   return NULL;
3305 }
3306
3307 /* Parse an element in the bracket expression.  */
3308
3309 static reg_errcode_t
3310 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3311                        re_token_t *token, int token_len, re_dfa_t *dfa,
3312                        reg_syntax_t syntax, int accept_hyphen)
3313 {
3314 #ifdef RE_ENABLE_I18N
3315   int cur_char_size;
3316   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3317   if (cur_char_size > 1)
3318     {
3319       elem->type = MB_CHAR;
3320       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3321       re_string_skip_bytes (regexp, cur_char_size);
3322       return REG_NOERROR;
3323     }
3324 #endif /* RE_ENABLE_I18N */
3325   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3326   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327       || token->type == OP_OPEN_EQUIV_CLASS)
3328     return parse_bracket_symbol (elem, regexp, token);
3329   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3330     {
3331       /* A '-' must only appear as anything but a range indicator before
3332          the closing bracket.  Everything else is an error.  */
3333       re_token_t token2;
3334       (void) peek_token_bracket (&token2, regexp, syntax);
3335       if (token2.type != OP_CLOSE_BRACKET)
3336         /* The actual error value is not standardized since this whole
3337            case is undefined.  But ERANGE makes good sense.  */
3338         return REG_ERANGE;
3339     }
3340   elem->type = SB_CHAR;
3341   elem->opr.ch = token->opr.c;
3342   return REG_NOERROR;
3343 }
3344
3345 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3346    such as [:<character_class>:], [.<collating_element>.], and
3347    [=<equivalent_class>=].  */
3348
3349 static reg_errcode_t
3350 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3351                       re_token_t *token)
3352 {
3353   unsigned char ch, delim = token->opr.c;
3354   int i = 0;
3355   if (re_string_eoi(regexp))
3356     return REG_EBRACK;
3357   for (;; ++i)
3358     {
3359       if (i >= BRACKET_NAME_BUF_SIZE)
3360         return REG_EBRACK;
3361       if (token->type == OP_OPEN_CHAR_CLASS)
3362         ch = re_string_fetch_byte_case (regexp);
3363       else
3364         ch = re_string_fetch_byte (regexp);
3365       if (re_string_eoi(regexp))
3366         return REG_EBRACK;
3367       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3368         break;
3369       elem->opr.name[i] = ch;
3370     }
3371   re_string_skip_bytes (regexp, 1);
3372   elem->opr.name[i] = '\0';
3373   switch (token->type)
3374     {
3375     case OP_OPEN_COLL_ELEM:
3376       elem->type = COLL_SYM;
3377       break;
3378     case OP_OPEN_EQUIV_CLASS:
3379       elem->type = EQUIV_CLASS;
3380       break;
3381     case OP_OPEN_CHAR_CLASS:
3382       elem->type = CHAR_CLASS;
3383       break;
3384     default:
3385       break;
3386     }
3387   return REG_NOERROR;
3388 }
3389
3390   /* Helper function for parse_bracket_exp.
3391      Build the equivalence class which is represented by NAME.
3392      The result are written to MBCSET and SBCSET.
3393      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394      is a pointer argument sinse we may update it.  */
3395
3396 static reg_errcode_t
3397 #ifdef RE_ENABLE_I18N
3398 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3399                    int *equiv_class_alloc, const unsigned char *name)
3400 #else /* not RE_ENABLE_I18N */
3401 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3402 #endif /* not RE_ENABLE_I18N */
3403 {
3404 #ifdef _LIBC
3405   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3406   if (nrules != 0)
3407     {
3408       const int32_t *table, *indirect;
3409       const unsigned char *weights, *extra, *cp;
3410       unsigned char char_buf[2];
3411       int32_t idx1, idx2;
3412       unsigned int ch;
3413       size_t len;
3414       /* This #include defines a local function!  */
3415 # include <locale/weight.h>
3416       /* Calculate the index for equivalence class.  */
3417       cp = name;
3418       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420                                                _NL_COLLATE_WEIGHTMB);
3421       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422                                                    _NL_COLLATE_EXTRAMB);
3423       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424                                                 _NL_COLLATE_INDIRECTMB);
3425       idx1 = findidx (&cp, -1);
3426       if (BE (idx1 == 0 || *cp != '\0', 0))
3427         /* This isn't a valid character.  */
3428         return REG_ECOLLATE;
3429
3430       /* Build single byte matcing table for this equivalence class.  */
3431       len = weights[idx1 & 0xffffff];
3432       for (ch = 0; ch < SBC_MAX; ++ch)
3433         {
3434           char_buf[0] = ch;
3435           cp = char_buf;
3436           idx2 = findidx (&cp, 1);
3437 /*
3438           idx2 = table[ch];
3439 */
3440           if (idx2 == 0)
3441             /* This isn't a valid character.  */
3442             continue;
3443           /* Compare only if the length matches and the collation rule
3444              index is the same.  */
3445           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3446             {
3447               int cnt = 0;
3448
3449               while (cnt <= len &&
3450                      weights[(idx1 & 0xffffff) + 1 + cnt]
3451                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3452                 ++cnt;
3453
3454               if (cnt > len)
3455                 bitset_set (sbcset, ch);
3456             }
3457         }
3458       /* Check whether the array has enough space.  */
3459       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460         {
3461           /* Not enough, realloc it.  */
3462           /* +1 in case of mbcset->nequiv_classes is 0.  */
3463           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464           /* Use realloc since the array is NULL if *alloc == 0.  */
3465           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466                                                    int32_t,
3467                                                    new_equiv_class_alloc);
3468           if (BE (new_equiv_classes == NULL, 0))
3469             return REG_ESPACE;
3470           mbcset->equiv_classes = new_equiv_classes;
3471           *equiv_class_alloc = new_equiv_class_alloc;
3472         }
3473       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474     }
3475   else
3476 #endif /* _LIBC */
3477     {
3478       if (BE (strlen ((const char *) name) != 1, 0))
3479         return REG_ECOLLATE;
3480       bitset_set (sbcset, *name);
3481     }
3482   return REG_NOERROR;
3483 }
3484
3485   /* Helper function for parse_bracket_exp.
3486      Build the character class which is represented by NAME.
3487      The result are written to MBCSET and SBCSET.
3488      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489      is a pointer argument sinse we may update it.  */
3490
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
3493 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494                  re_charset_t *mbcset, int *char_class_alloc,
3495                  const unsigned char *class_name, reg_syntax_t syntax)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498                  const unsigned char *class_name, reg_syntax_t syntax)
3499 #endif /* not RE_ENABLE_I18N */
3500 {
3501   int i;
3502   const char *name = (const char *) class_name;
3503
3504   /* In case of REG_ICASE "upper" and "lower" match the both of
3505      upper and lower cases.  */
3506   if ((syntax & RE_ICASE)
3507       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508     name = "alpha";
3509
3510 #ifdef RE_ENABLE_I18N
3511   /* Check the space of the arrays.  */
3512   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513     {
3514       /* Not enough, realloc it.  */
3515       /* +1 in case of mbcset->nchar_classes is 0.  */
3516       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517       /* Use realloc since array is NULL if *alloc == 0.  */
3518       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519                                                new_char_class_alloc);
3520       if (BE (new_char_classes == NULL, 0))
3521         return REG_ESPACE;
3522       mbcset->char_classes = new_char_classes;
3523       *char_class_alloc = new_char_class_alloc;
3524     }
3525   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526 #endif /* RE_ENABLE_I18N */
3527
3528 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3529   do {                                          \
3530     if (BE (trans != NULL, 0))                  \
3531       {                                         \
3532         for (i = 0; i < SBC_MAX; ++i)           \
3533           if (ctype_func (i))                   \
3534             bitset_set (sbcset, trans[i]);      \
3535       }                                         \
3536     else                                        \
3537       {                                         \
3538         for (i = 0; i < SBC_MAX; ++i)           \
3539           if (ctype_func (i))                   \
3540             bitset_set (sbcset, i);             \
3541       }                                         \
3542   } while (0)
3543
3544   if (strcmp (name, "alnum") == 0)
3545     BUILD_CHARCLASS_LOOP (isalnum);
3546   else if (strcmp (name, "cntrl") == 0)
3547     BUILD_CHARCLASS_LOOP (iscntrl);
3548   else if (strcmp (name, "lower") == 0)
3549     BUILD_CHARCLASS_LOOP (islower);
3550   else if (strcmp (name, "space") == 0)
3551     BUILD_CHARCLASS_LOOP (isspace);
3552   else if (strcmp (name, "alpha") == 0)
3553     BUILD_CHARCLASS_LOOP (isalpha);
3554   else if (strcmp (name, "digit") == 0)
3555     BUILD_CHARCLASS_LOOP (isdigit);
3556   else if (strcmp (name, "print") == 0)
3557     BUILD_CHARCLASS_LOOP (isprint);
3558   else if (strcmp (name, "upper") == 0)
3559     BUILD_CHARCLASS_LOOP (isupper);
3560   else if (strcmp (name, "blank") == 0)
3561     BUILD_CHARCLASS_LOOP (isblank);
3562   else if (strcmp (name, "graph") == 0)
3563     BUILD_CHARCLASS_LOOP (isgraph);
3564   else if (strcmp (name, "punct") == 0)
3565     BUILD_CHARCLASS_LOOP (ispunct);
3566   else if (strcmp (name, "xdigit") == 0)
3567     BUILD_CHARCLASS_LOOP (isxdigit);
3568   else
3569     return REG_ECTYPE;
3570
3571   return REG_NOERROR;
3572 }
3573
3574 static bin_tree_t *
3575 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576                     const unsigned char *class_name,
3577                     const unsigned char *extra, int non_match,
3578                     reg_errcode_t *err)
3579 {
3580   re_bitset_ptr_t sbcset;
3581 #ifdef RE_ENABLE_I18N
3582   re_charset_t *mbcset;
3583   int alloc = 0;
3584 #endif /* not RE_ENABLE_I18N */
3585   reg_errcode_t ret;
3586   re_token_t br_token;
3587   bin_tree_t *tree;
3588
3589   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590 #ifdef RE_ENABLE_I18N
3591   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592 #endif /* RE_ENABLE_I18N */
3593
3594 #ifdef RE_ENABLE_I18N
3595   if (BE (sbcset == NULL || mbcset == NULL, 0))
3596 #else /* not RE_ENABLE_I18N */
3597   if (BE (sbcset == NULL, 0))
3598 #endif /* not RE_ENABLE_I18N */
3599     {
3600       *err = REG_ESPACE;
3601       return NULL;
3602     }
3603
3604   if (non_match)
3605     {
3606 #ifdef RE_ENABLE_I18N
3607       mbcset->non_match = 1;
3608 #endif /* not RE_ENABLE_I18N */
3609     }
3610
3611   /* We don't care the syntax in this case.  */
3612   ret = build_charclass (trans, sbcset,
3613 #ifdef RE_ENABLE_I18N
3614                          mbcset, &alloc,
3615 #endif /* RE_ENABLE_I18N */
3616                          class_name, 0);
3617
3618   if (BE (ret != REG_NOERROR, 0))
3619     {
3620       re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622       free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624       *err = ret;
3625       return NULL;
3626     }
3627   /* \w match '_' also.  */
3628   for (; *extra; extra++)
3629     bitset_set (sbcset, *extra);
3630
3631   /* If it is non-matching list.  */
3632   if (non_match)
3633     bitset_not (sbcset);
3634
3635 #ifdef RE_ENABLE_I18N
3636   /* Ensure only single byte characters are set.  */
3637   if (dfa->mb_cur_max > 1)
3638     bitset_mask (sbcset, dfa->sb_char);
3639 #endif
3640
3641   /* Build a tree for simple bracket.  */
3642   br_token.type = SIMPLE_BRACKET;
3643   br_token.opr.sbcset = sbcset;
3644   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645   if (BE (tree == NULL, 0))
3646     goto build_word_op_espace;
3647
3648 #ifdef RE_ENABLE_I18N
3649   if (dfa->mb_cur_max > 1)
3650     {
3651       bin_tree_t *mbc_tree;
3652       /* Build a tree for complex bracket.  */
3653       br_token.type = COMPLEX_BRACKET;
3654       br_token.opr.mbcset = mbcset;
3655       dfa->has_mb_node = 1;
3656       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657       if (BE (mbc_tree == NULL, 0))
3658         goto build_word_op_espace;
3659       /* Then join them by ALT node.  */
3660       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661       if (BE (mbc_tree != NULL, 1))
3662         return tree;
3663     }
3664   else
3665     {
3666       free_charset (mbcset);
3667       return tree;
3668     }
3669 #else /* not RE_ENABLE_I18N */
3670   return tree;
3671 #endif /* not RE_ENABLE_I18N */
3672
3673  build_word_op_espace:
3674   re_free (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676   free_charset (mbcset);
3677 #endif /* RE_ENABLE_I18N */
3678   *err = REG_ESPACE;
3679   return NULL;
3680 }
3681
3682 /* This is intended for the expressions like "a{1,3}".
3683    Fetch a number from `input', and return the number.
3684    Return -1, if the number field is empty like "{,1}".
3685    Return -2, If an error is occured.  */
3686
3687 static int
3688 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689 {
3690   int num = -1;
3691   unsigned char c;
3692   while (1)
3693     {
3694       fetch_token (token, input, syntax);
3695       c = token->opr.c;
3696       if (BE (token->type == END_OF_RE, 0))
3697         return -2;
3698       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699         break;
3700       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3701              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3702       num = (num > RE_DUP_MAX) ? -2 : num;
3703     }
3704   return num;
3705 }
3706 \f
3707 #ifdef RE_ENABLE_I18N
3708 static void
3709 free_charset (re_charset_t *cset)
3710 {
3711   re_free (cset->mbchars);
3712 # ifdef _LIBC
3713   re_free (cset->coll_syms);
3714   re_free (cset->equiv_classes);
3715   re_free (cset->range_starts);
3716   re_free (cset->range_ends);
3717 # endif
3718   re_free (cset->char_classes);
3719   re_free (cset);
3720 }
3721 #endif /* RE_ENABLE_I18N */
3722 \f
3723 /* Functions for binary tree operation.  */
3724
3725 /* Create a tree node.  */
3726
3727 static bin_tree_t *
3728 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729              re_token_type_t type)
3730 {
3731   re_token_t t;
3732   t.type = type;
3733   return create_token_tree (dfa, left, right, &t);
3734 }
3735
3736 static bin_tree_t *
3737 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738                    const re_token_t *token)
3739 {
3740   bin_tree_t *tree;
3741   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3742     {
3743       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3744
3745       if (storage == NULL)
3746         return NULL;
3747       storage->next = dfa->str_tree_storage;
3748       dfa->str_tree_storage = storage;
3749       dfa->str_tree_storage_idx = 0;
3750     }
3751   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3752
3753   tree->parent = NULL;
3754   tree->left = left;
3755   tree->right = right;
3756   tree->token = *token;
3757   tree->token.duplicated = 0;
3758   tree->token.opt_subexp = 0;
3759   tree->first = NULL;
3760   tree->next = NULL;
3761   tree->node_idx = -1;
3762
3763   if (left != NULL)
3764     left->parent = tree;
3765   if (right != NULL)
3766     right->parent = tree;
3767   return tree;
3768 }
3769
3770 /* Mark the tree SRC as an optional subexpression.
3771    To be called from preorder or postorder.  */
3772
3773 static reg_errcode_t
3774 mark_opt_subexp (void *extra, bin_tree_t *node)
3775 {
3776   int idx = (int) (long) extra;
3777   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778     node->token.opt_subexp = 1;
3779
3780   return REG_NOERROR;
3781 }
3782
3783 /* Free the allocated memory inside NODE. */
3784
3785 static void
3786 free_token (re_token_t *node)
3787 {
3788 #ifdef RE_ENABLE_I18N
3789   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790     free_charset (node->opr.mbcset);
3791   else
3792 #endif /* RE_ENABLE_I18N */
3793     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794       re_free (node->opr.sbcset);
3795 }
3796
3797 /* Worker function for tree walking.  Free the allocated memory inside NODE
3798    and its children. */
3799
3800 static reg_errcode_t
3801 free_tree (void *extra, bin_tree_t *node)
3802 {
3803   free_token (&node->token);
3804   return REG_NOERROR;
3805 }
3806
3807
3808 /* Duplicate the node SRC, and return new node.  This is a preorder
3809    visit similar to the one implemented by the generic visitor, but
3810    we need more infrastructure to maintain two parallel trees --- so,
3811    it's easier to duplicate.  */
3812
3813 static bin_tree_t *
3814 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3815 {
3816   const bin_tree_t *node;
3817   bin_tree_t *dup_root;
3818   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3819
3820   for (node = root; ; )
3821     {
3822       /* Create a new tree and link it back to the current parent.  */
3823       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824       if (*p_new == NULL)
3825         return NULL;
3826       (*p_new)->parent = dup_node;
3827       (*p_new)->token.duplicated = 1;
3828       dup_node = *p_new;
3829
3830       /* Go to the left node, or up and to the right.  */
3831       if (node->left)
3832         {
3833           node = node->left;
3834           p_new = &dup_node->left;
3835         }
3836       else
3837         {
3838           const bin_tree_t *prev = NULL;
3839           while (node->right == prev || node->right == NULL)
3840             {
3841               prev = node;
3842               node = node->parent;
3843               dup_node = dup_node->parent;
3844               if (!node)
3845                 return dup_root;
3846             }
3847           node = node->right;
3848           p_new = &dup_node->right;
3849         }
3850     }
3851 }