lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23 /* Forward declarations of the file-local (static) helper functions, so they can be called before their definitions.  */
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 static void free_charset (re_charset_t *cset);
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax);
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  89                                         re_charset_t *mbcset,
  90                                         Idx *equiv_class_alloc,
  91                                         const unsigned char *name);
  92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  93                                       bitset_t sbcset,
  94                                       re_charset_t *mbcset,
  95                                       Idx *char_class_alloc,
  96                                       const char *class_name,
  97                                       reg_syntax_t syntax);
  98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
  99                                        RE_TRANSLATE_TYPE trans,
 100                                        const char *class_name,
 101                                        const char *extra,
 102                                        bool non_match, reg_errcode_t *err);
 103 static bin_tree_t *create_tree (re_dfa_t *dfa,
 104                                 bin_tree_t *left, bin_tree_t *right,
 105                                 re_token_type_t type);
 106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 107                                       bin_tree_t *left, bin_tree_t *right,
 108                                       const re_token_t *token);
 109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 110 static void free_token (re_token_t *node);
 111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 113 \f
 114 /* This table gives an error message for each of the error codes listed
 115    in regex.h, so the order here has to be the same as there.  The
 116    messages are packed back to back into one char array.  POSIX doesn't
 117    require that we do anything for REG_NOERROR, but why not be nice?  */
 118 /* Each REG_xxx_IDX macro is the byte offset of its message: the previous offset plus sizeof of the previous literal, whose count of one NUL matches the "\0" separator that follows that literal in the array.  */
 119 static const char __re_error_msgid[] =
 120   {
 121 #define REG_NOERROR_IDX 0
 122     gettext_noop ("Success")    /* REG_NOERROR */
 123     "\0"
 124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 125     gettext_noop ("No match")   /* REG_NOMATCH */
 126     "\0"
 127 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 128     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 129     "\0"
 130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 131     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 132     "\0"
 133 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 134     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 135     "\0"
 136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 137     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 138     "\0"
 139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 140     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 141     "\0"
 142 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 143     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 144     "\0"
 145 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 146     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 147     "\0"
 148 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 149     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 150     "\0"
 151 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 152     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 153     "\0"
 154 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 155     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 156     "\0"
 157 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 158     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 159     "\0"
 160 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 161     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 162     "\0"
 163 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 164     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 165     "\0"
 166 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 167     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 168     "\0"
 169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 170     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN; last entry, terminated by the array's own trailing NUL */
 171   };
 172 /* Byte offset of each message inside __re_error_msgid, indexed by error code: callers compute __re_error_msgid + __re_error_msgid_idx[code].  */
 173 static const size_t __re_error_msgid_idx[] =
 174   {
 175     REG_NOERROR_IDX,
 176     REG_NOMATCH_IDX,
 177     REG_BADPAT_IDX,
 178     REG_ECOLLATE_IDX,
 179     REG_ECTYPE_IDX,
 180     REG_EESCAPE_IDX,
 181     REG_ESUBREG_IDX,
 182     REG_EBRACK_IDX,
 183     REG_EPAREN_IDX,
 184     REG_EBRACE_IDX,
 185     REG_BADBR_IDX,
 186     REG_ERANGE_IDX,
 187     REG_ESPACE_IDX,
 188     REG_BADRPT_IDX,
 189     REG_EEND_IDX,
 190     REG_ESIZE_IDX,
 191     REG_ERPAREN_IDX
 192   };
 193 \f
 194 /* Entry points for GNU code.  */
 195
 196 /* re_compile_pattern is the GNU regular expression compiler: it
 197    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 198    Returns 0 if the pattern was valid, otherwise an error string.
 199
 200    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 201    are set in BUFP on entry.  */
 202
 203 const char *
 204 re_compile_pattern (const char *pattern, size_t length,
 205                     struct re_pattern_buffer *bufp)
 206 {
 207   reg_errcode_t ret;
 208
 209   /* And GNU code determines whether or not to get register information
 210      by passing null for the REGS argument to re_match, etc., not by
 211      setting no_sub, unless RE_NO_SUB is set.  */
 212   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 213
 214   /* Match anchors at newline.  */
 215   bufp->newline_anchor = 1;
 216
 217   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 218
 219   if (!ret)
 220     return NULL;
 221   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 222 }
 223 weak_alias (__re_compile_pattern, re_compile_pattern)
 224
 225 /* Syntax used by re_compile_pattern and re_comp (regcomp derives its own
 226    from CFLAGS).  Set by 're_set_syntax', or assigned to directly: each
 227    pattern buffer stores its own syntax, so it can be changed between regex compilations.  */
 228 /* This has no initializer because initialized variables in Emacs
 229    become read-only after dumping.  As a file-scope object it still starts out as zero.  */
 230 reg_syntax_t re_syntax_options;
 231
 232
 233 /* Specify the precise syntax of regexps for compilation.  This provides
 234    for compatibility for various utilities which historically have
 235    different, incompatible syntaxes.
 236
 237    The argument SYNTAX is a bit mask comprised of the various bits
 238    defined in regex.h.  We return the old syntax.  */
 239
 240 reg_syntax_t
 241 re_set_syntax (reg_syntax_t syntax)
 242 {
 243   reg_syntax_t ret = re_syntax_options;
 244
 245   re_syntax_options = syntax;
 246   return ret;
 247 }
 248 weak_alias (__re_set_syntax, re_set_syntax)
 249
 250 int
 251 re_compile_fastmap (struct re_pattern_buffer *bufp)
 252 {
 253   re_dfa_t *dfa = bufp->buffer;
 254   char *fastmap = bufp->fastmap;
 255
 256   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 257   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 258   if (dfa->init_state != dfa->init_state_word)
 259     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 260   if (dfa->init_state != dfa->init_state_nl)
 261     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 262   if (dfa->init_state != dfa->init_state_begbuf)
 263     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 264   bufp->fastmap_accurate = 1;
 265   return 0;
 266 }
 267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 268
 /* Mark byte CH in FASTMAP.  When ICASE is true, also mark the byte that
    tolower returns for CH.  */
 static __always_inline void
 re_set_fastmap (char *fastmap, bool icase, int ch)
 {
   fastmap[ch] = 1;
   if (!icase)
     return;
   fastmap[tolower (ch)] = 1;
 }
 276
 277 /* Helper function for re_compile_fastmap.
 278    Compile fastmap for the initial_state INIT_STATE.  */
 279
 280 static void
 281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 282                          char *fastmap)
 283 {
 284   re_dfa_t *dfa = bufp->buffer;
 285   Idx node_cnt;
 286   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 287   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 288     {
 289       Idx node = init_state->nodes.elems[node_cnt];
 290       re_token_type_t type = dfa->nodes[node].type;
 291
 292       if (type == CHARACTER)
 293         {
 294           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 295           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 296             {
 297               unsigned char buf[MB_LEN_MAX];
 298               unsigned char *p;
 299               wchar_t wc;
 300               mbstate_t state;
 301
 302               p = buf;
 303               *p++ = dfa->nodes[node].opr.c;
 304               while (++node < dfa->nodes_len
 305                      && dfa->nodes[node].type == CHARACTER
 306                      && dfa->nodes[node].mb_partial)
 307                 *p++ = dfa->nodes[node].opr.c;
 308               memset (&state, '\0', sizeof (state));
 309               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 310                              &state) == p - buf
 311                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 312                       != (size_t) -1))
 313                 re_set_fastmap (fastmap, false, buf[0]);
 314             }
 315         }
 316       else if (type == SIMPLE_BRACKET)
 317         {
 318           int i, ch;
 319           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 320             {
 321               int j;
 322               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 323               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 324                 if (w & ((bitset_word_t) 1 << j))
 325                   re_set_fastmap (fastmap, icase, ch);
 326             }
 327         }
 328       else if (type == COMPLEX_BRACKET)
 329         {
 330           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 331           Idx i;
 332
 333 #ifdef _LIBC
 334           /* See if we have to try all bytes which start multiple collation
 335              elements.
 336              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 337                   collation element, and don't catch 'b' since 'b' is
 338                   the only collation element which starts from 'b' (and
 339                   it is caught by SIMPLE_BRACKET).  */
 340               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 341                   && (cset->ncoll_syms || cset->nranges))
 342                 {
 343                   const int32_t *table = (const int32_t *)
 344                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 345                   for (i = 0; i < SBC_MAX; ++i)
 346                     if (table[i] < 0)
 347                       re_set_fastmap (fastmap, icase, i);
 348                 }
 349 #endif /* _LIBC */
 350
 351           /* See if we have to start the match at all multibyte characters,
 352              i.e. where we would not find an invalid sequence.  This only
 353              applies to multibyte character sets; for single byte character
 354              sets, the SIMPLE_BRACKET again suffices.  */
 355           if (dfa->mb_cur_max > 1
 356               && (cset->nchar_classes || cset->non_match || cset->nranges
 357 #ifdef _LIBC
 358                   || cset->nequiv_classes
 359 #endif /* _LIBC */
 360                  ))
 361             {
 362               unsigned char c = 0;
 363               do
 364                 {
 365                   mbstate_t mbs;
 366                   memset (&mbs, 0, sizeof (mbs));
 367                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 368                     re_set_fastmap (fastmap, false, (int) c);
 369                 }
 370               while (++c != 0);
 371             }
 372
 373           else
 374             {
 375               /* ... Else catch all bytes which can start the mbchars.  */
 376               for (i = 0; i < cset->nmbchars; ++i)
 377                 {
 378                   char buf[256];
 379                   mbstate_t state;
 380                   memset (&state, '\0', sizeof (state));
 381                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 382                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 383                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 384                     {
 385                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 386                           != (size_t) -1)
 387                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 388                     }
 389                 }
 390             }
 391         }
 392       else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
 393         {
 394           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 395           if (type == END_OF_RE)
 396             bufp->can_be_null = 1;
 397           return;
 398         }
 399     }
 400 }
 401 \f
 402 /* Entry point for POSIX code.  */
 403 /* regcomp takes a regular expression as a string and compiles it.
 404
 405    PREG is a regex_t *.  We do not expect any fields to be initialized,
 406    since POSIX says we shouldn't.  Thus, we set
 407
 408      'buffer' to the compiled pattern;
 409      'used' to the length of the compiled pattern;
 410      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 411        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 412        RE_SYNTAX_POSIX_BASIC;
 413      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 414      'fastmap' to an allocated space for the fastmap;
 415      'fastmap_accurate' to zero;
 416      're_nsub' to the number of subexpressions in PATTERN.
 417
 418    PATTERN is the address of the pattern string.
 419
 420    CFLAGS is a series of bits which affect compilation.
 421
 422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 423      use POSIX basic syntax.
 424
 425      If REG_NEWLINE is set, then . and [^...] don't match newline.
 426      Also, regexec will try a match beginning after every newline.
 427
 428      If REG_ICASE is set, then we considers upper- and lowercase
 429      versions of letters to be equivalent when matching.
 430
 431      If REG_NOSUB is set, then when PREG is passed to regexec, that
 432      routine will report only success or failure, and nothing about the
 433      registers.
 434
 435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 436    the return codes and their meanings.)  */
 437
 438 int
 439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 440 {
 441   reg_errcode_t ret;
 442   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 443                          : RE_SYNTAX_POSIX_BASIC);
 444
 445   preg->buffer = NULL;
 446   preg->allocated = 0;
 447   preg->used = 0;
 448
 449   /* Try to allocate space for the fastmap.  */
 450   preg->fastmap = re_malloc (char, SBC_MAX);
 451   if (__glibc_unlikely (preg->fastmap == NULL))
 452     return REG_ESPACE;
 453
 454   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 455
 456   /* If REG_NEWLINE is set, newlines are treated differently.  */
 457   if (cflags & REG_NEWLINE)
 458     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 459       syntax &= ~RE_DOT_NEWLINE;
 460       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 461       /* It also changes the matching behavior.  */
 462       preg->newline_anchor = 1;
 463     }
 464   else
 465     preg->newline_anchor = 0;
 466   preg->no_sub = !!(cflags & REG_NOSUB);
 467   preg->translate = NULL;
 468
 469   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 470
 471   /* POSIX doesn't distinguish between an unmatched open-group and an
 472      unmatched close-group: both are REG_EPAREN.  */
 473   if (ret == REG_ERPAREN)
 474     ret = REG_EPAREN;
 475
 476   /* We have already checked preg->fastmap != NULL.  */
 477   if (__glibc_likely (ret == REG_NOERROR))
 478     /* Compute the fastmap now, since regexec cannot modify the pattern
 479        buffer.  This function never fails in this implementation.  */
 480     (void) re_compile_fastmap (preg);
 481   else
 482     {
 483       /* Some error occurred while compiling the expression.  */
 484       re_free (preg->fastmap);
 485       preg->fastmap = NULL;
 486     }
 487
 488   return (int) ret;
 489 }
 490 libc_hidden_def (__regcomp)
 491 weak_alias (__regcomp, regcomp)
 492
 493 /* Returns a message corresponding to an error code, ERRCODE, returned
 494    from either regcomp or regexec.   We don't use PREG here.  */
 495
 496 size_t
 497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 498           size_t errbuf_size)
 499 {
 500   const char *msg;
 501   size_t msg_size;
 502   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
 503
 504   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
 505     /* Only error codes returned by the rest of the code should be passed
 506        to this routine.  If we are given anything else, or if other regex
 507        code generates an invalid error code, then the program has a bug.
 508        Dump core so we can fix it.  */
 509     abort ();
 510
 511   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 512
 513   msg_size = strlen (msg) + 1; /* Includes the null.  */
 514
 515   if (__glibc_likely (errbuf_size != 0))
 516     {
 517       size_t cpy_size = msg_size;
 518       if (__glibc_unlikely (msg_size > errbuf_size))
 519         {
 520           cpy_size = errbuf_size - 1;
 521           errbuf[cpy_size] = '\0';
 522         }
 523       memcpy (errbuf, msg, cpy_size);
 524     }
 525
 526   return msg_size;
 527 }
 528 weak_alias (__regerror, regerror)
 529
 530
 531 /* This static array is used for the map to single-byte characters when
 532    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 533    it the same all the time.  UTF-8 is the preferred encoding so this is
 534    a worthwhile optimization.  free_dfa_content never frees it.  */
 535 static const bitset_t utf8_sb_map =
 536 {
 537   /* Set the first 128 bits; the remaining words are left zero.  */
 538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
 539   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX /* range designator (GNU extension) */
 540 #else /* Portable spelling: list the all-ones words one by one.  */
 541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 542 #  error "bitset_word_t is narrower than 32 bits"
 543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 544   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 546   BITSET_WORD_MAX, BITSET_WORD_MAX,
 547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 548   BITSET_WORD_MAX,
 549 # endif
 550   (BITSET_WORD_MAX /* final word; shifted by 0 when SBC_MAX is a multiple of BITSET_WORD_BITS */
 551    >> (SBC_MAX % BITSET_WORD_BITS == 0
 552        ? 0
 553        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 554 #endif
 555 };
 556
 557
 558 static void
 559 free_dfa_content (re_dfa_t *dfa)
 560 {
 561   Idx i, j;
 562
 563   if (dfa->nodes)
 564     for (i = 0; i < dfa->nodes_len; ++i)
 565       free_token (dfa->nodes + i);
 566   re_free (dfa->nexts);
 567   for (i = 0; i < dfa->nodes_len; ++i)
 568     {
 569       if (dfa->eclosures != NULL)
 570         re_node_set_free (dfa->eclosures + i);
 571       if (dfa->inveclosures != NULL)
 572         re_node_set_free (dfa->inveclosures + i);
 573       if (dfa->edests != NULL)
 574         re_node_set_free (dfa->edests + i);
 575     }
 576   re_free (dfa->edests);
 577   re_free (dfa->eclosures);
 578   re_free (dfa->inveclosures);
 579   re_free (dfa->nodes);
 580
 581   if (dfa->state_table)
 582     for (i = 0; i <= dfa->state_hash_mask; ++i)
 583       {
 584         struct re_state_table_entry *entry = dfa->state_table + i;
 585         for (j = 0; j < entry->num; ++j)
 586           {
 587             re_dfastate_t *state = entry->array[j];
 588             free_state (state);
 589           }
 590         re_free (entry->array);
 591       }
 592   re_free (dfa->state_table);
 593   if (dfa->sb_char != utf8_sb_map)
 594     re_free (dfa->sb_char);
 595   re_free (dfa->subexp_map);
 596 #ifdef DEBUG
 597   re_free (dfa->re_str);
 598 #endif
 599
 600   re_free (dfa);
 601 }
 602
 603
 604 /* Free dynamically allocated space used by PREG.  */
 605
 606 void
 607 regfree (regex_t *preg)
 608 {
 609   re_dfa_t *dfa = preg->buffer;
 610   if (__glibc_likely (dfa != NULL))
 611     {
 612       lock_fini (dfa->lock);
 613       free_dfa_content (dfa);
 614     }
 615   preg->buffer = NULL;
 616   preg->allocated = 0;
 617
 618   re_free (preg->fastmap);
 619   preg->fastmap = NULL;
 620
 621   re_free (preg->translate);
 622   preg->translate = NULL;
 623 }
 624 libc_hidden_def (__regfree)
 625 weak_alias (__regfree, regfree)
 626 \f
 627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 628    them unless specifically requested.  */
 629
 #if defined _REGEX_RE_COMP || defined _LIBC

 /* BSD has one and only one pattern buffer.  */
 static struct re_pattern_buffer re_comp_buf;

 /* Compile S into the single BSD pattern buffer, using the syntax in
    re_syntax_options.

    A null S asks whether a previously compiled pattern is available.
    Returns NULL on success, otherwise a message describing the error.  */
 char *
 # ifdef _LIBC
 /* Make these definitions weak in libc, so POSIX programs can redefine
    these names if they don't use our functions, and still use
    regcomp/regexec above without link errors.  */
 weak_function
 # endif
 re_comp (const char *s)
 {
   reg_errcode_t ret;

   if (s == NULL)
     {
       /* Nothing to compile: succeed only if a pattern already exists.  */
       if (re_comp_buf.buffer != NULL)
         return 0;
       return gettext ("No previous regular expression");
     }

   if (re_comp_buf.buffer != NULL)
     {
       /* Drop the old pattern but keep its fastmap allocation: detach it
          so that __regfree does not release it, and put it back once the
          buffer has been wiped.  */
       char *saved_fastmap = re_comp_buf.fastmap;

       re_comp_buf.fastmap = NULL;
       __regfree (&re_comp_buf);
       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
       re_comp_buf.fastmap = saved_fastmap;
     }

   if (re_comp_buf.fastmap == NULL)
     {
       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
       if (re_comp_buf.fastmap == NULL)
         return (char *) gettext (__re_error_msgid
                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
     }

   /* Since 're_exec' always passes NULL for the 'regs' argument, we
      don't need to initialize the pattern buffer fields which affect it.  */

   /* Match anchors at newlines.  */
   re_comp_buf.newline_anchor = 1;

   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
   if (ret == REG_NOERROR)
     return NULL;

   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 }

 #ifdef _LIBC
 /* Release the BSD pattern buffer.  */
 libc_freeres_fn (free_mem)
 {
   __regfree (&re_comp_buf);
 }
 #endif

 #endif /* _REGEX_RE_COMP */
 694 \f
 695 /* Internal entry point.
 696    Compile the regular expression PATTERN, whose length is LENGTH.
 697    SYNTAX indicate regular expression's syntax.  */
 698
 699 static reg_errcode_t
 700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 701                      reg_syntax_t syntax)
 702 {
 703   reg_errcode_t err = REG_NOERROR;
 704   re_dfa_t *dfa;
 705   re_string_t regexp;
 706
 707   /* Initialize the pattern buffer.  */
 708   preg->fastmap_accurate = 0;
 709   preg->syntax = syntax;
 710   preg->not_bol = preg->not_eol = 0;
 711   preg->used = 0;
 712   preg->re_nsub = 0;
 713   preg->can_be_null = 0;
 714   preg->regs_allocated = REGS_UNALLOCATED;
 715
 716   /* Initialize the dfa.  */
 717   dfa = preg->buffer;
 718   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
 719     {
 720       /* If zero allocated, but buffer is non-null, try to realloc
 721          enough space.  This loses if buffer's address is bogus, but
 722          that is the user's responsibility.  If ->buffer is NULL this
 723          is a simple allocation.  */
 724       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 725       if (dfa == NULL)
 726         return REG_ESPACE;
 727       preg->allocated = sizeof (re_dfa_t);
 728       preg->buffer = dfa;
 729     }
 730   preg->used = sizeof (re_dfa_t);
 731
 732   err = init_dfa (dfa, length);
 733   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
 734     err = REG_ESPACE;
 735   if (__glibc_unlikely (err != REG_NOERROR))
 736     {
 737       free_dfa_content (dfa);
 738       preg->buffer = NULL;
 739       preg->allocated = 0;
 740       return err;
 741     }
 742 #ifdef DEBUG
 743   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 744   dfa->re_str = re_malloc (char, length + 1);
 745   strncpy (dfa->re_str, pattern, length + 1);
 746 #endif
 747
 748   err = re_string_construct (&regexp, pattern, length, preg->translate,
 749                              (syntax & RE_ICASE) != 0, dfa);
 750   if (__glibc_unlikely (err != REG_NOERROR))
 751     {
 752     re_compile_internal_free_return:
 753       free_workarea_compile (preg);
 754       re_string_destruct (&regexp);
 755       lock_fini (dfa->lock);
 756       free_dfa_content (dfa);
 757       preg->buffer = NULL;
 758       preg->allocated = 0;
 759       return err;
 760     }
 761
 762   /* Parse the regular expression, and build a structure tree.  */
 763   preg->re_nsub = 0;
 764   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 765   if (__glibc_unlikely (dfa->str_tree == NULL))
 766     goto re_compile_internal_free_return;
 767
 768   /* Analyze the tree and create the nfa.  */
 769   err = analyze (preg);
 770   if (__glibc_unlikely (err != REG_NOERROR))
 771     goto re_compile_internal_free_return;
 772
 773   /* If possible, do searching in single byte encoding to speed things up.  */
 774   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 775     optimize_utf8 (dfa);
 776
 777   /* Then create the initial state of the dfa.  */
 778   err = create_initial_state (dfa);
 779
 780   /* Release work areas.  */
 781   free_workarea_compile (preg);
 782   re_string_destruct (&regexp);
 783
 784   if (__glibc_unlikely (err != REG_NOERROR))
 785     {
 786       lock_fini (dfa->lock);
 787       free_dfa_content (dfa);
 788       preg->buffer = NULL;
 789       preg->allocated = 0;
 790     }
 791
 792   return err;
 793 }
 794
 795 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 796    as the initial length of some arrays.  */
 797
 798 static reg_errcode_t
 799 init_dfa (re_dfa_t *dfa, size_t pat_len)
 800 {
 801   __re_size_t table_size;
 802 #ifndef _LIBC
 803   const char *codeset_name;
 804 #endif
 805   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 806   size_t max_object_size =
 807     MAX (sizeof (struct re_state_table_entry),
 808          MAX (sizeof (re_token_t),
 809               MAX (sizeof (re_node_set),
 810                    MAX (sizeof (regmatch_t),
 811                         max_i18n_object_size))));
 812
 813   memset (dfa, '\0', sizeof (re_dfa_t));
 814
 815   /* Force allocation of str_tree_storage the first time.  */
 816   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 817
 818   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 819      calculation below, and for similar doubling calculations
 820      elsewhere.  And it's <= rather than <, because some of the
 821      doubling calculations add 1 afterwards.  */
 822   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
 823                         <= pat_len))
 824     return REG_ESPACE;
 825
 826   dfa->nodes_alloc = pat_len + 1;
 827   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 828
 829   /*  table_size = 2 ^ ceil(log pat_len) */
 830   for (table_size = 1; ; table_size <<= 1)
 831     if (table_size > pat_len)
 832       break;
 833
 834   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 835   dfa->state_hash_mask = table_size - 1;
 836
 837   dfa->mb_cur_max = MB_CUR_MAX;
 838 #ifdef _LIBC
 839   if (dfa->mb_cur_max == 6
 840       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 841     dfa->is_utf8 = 1;
 842   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 843                        != 0);
 844 #else
 845   codeset_name = nl_langinfo (CODESET);
 846   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 847       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 848       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 849       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 850     dfa->is_utf8 = 1;
 851
 852   /* We check exhaustively in the loop below if this charset is a
 853      superset of ASCII.  */
 854   dfa->map_notascii = 0;
 855 #endif
 856
 857   if (dfa->mb_cur_max > 1)
 858     {
 859       if (dfa->is_utf8)
 860         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 861       else
 862         {
 863           int i, j, ch;
 864
 865           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 866           if (__glibc_unlikely (dfa->sb_char == NULL))
 867             return REG_ESPACE;
 868
 869           /* Set the bits corresponding to single byte chars.  */
 870           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 871             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 872               {
 873                 wint_t wch = __btowc (ch);
 874                 if (wch != WEOF)
 875                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 876 #ifndef _LIBC
 877                 if (isascii (ch) && wch != ch)
 878                   dfa->map_notascii = 1;
 879 #endif
 880               }
 881         }
 882     }
 883
 884   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
 885     return REG_ESPACE;
 886   return REG_NOERROR;
 887 }
 888
 889 /* Initialize WORD_CHAR table, which indicate which character is
 890    "word".  In this case "word" means that it is the word construction
 891    character used by some operators like "\<", "\>", etc.  */
 892
 893 static void
 894 init_word_char (re_dfa_t *dfa)
 895 {
 896   int i = 0;
 897   int j;
 898   int ch = 0;
 899   dfa->word_ops_used = 1;
 900   if (__glibc_likely (dfa->map_notascii == 0))
 901     {
 902       bitset_word_t bits0 = 0x00000000;
 903       bitset_word_t bits1 = 0x03ff0000;
 904       bitset_word_t bits2 = 0x87fffffe;
 905       bitset_word_t bits3 = 0x07fffffe;
 906       if (BITSET_WORD_BITS == 64)
 907         {
 908           /* Pacify gcc -Woverflow on 32-bit platforms.  */
 909           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 910           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 911           i = 2;
 912         }
 913       else if (BITSET_WORD_BITS == 32)
 914         {
 915           dfa->word_char[0] = bits0;
 916           dfa->word_char[1] = bits1;
 917           dfa->word_char[2] = bits2;
 918           dfa->word_char[3] = bits3;
 919           i = 4;
 920         }
 921       else
 922         goto general_case;
 923       ch = 128;
 924
 925       if (__glibc_likely (dfa->is_utf8))
 926         {
 927           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 928           return;
 929         }
 930     }
 931
 932  general_case:
 933   for (; i < BITSET_WORDS; ++i)
 934     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 935       if (isalnum (ch) || ch == '_')
 936         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 937 }
 938
 939 /* Free the work area which are only used while compiling.  */
 940
 941 static void
 942 free_workarea_compile (regex_t *preg)
 943 {
 944   re_dfa_t *dfa = preg->buffer;
 945   bin_tree_storage_t *storage, *next;
 946   for (storage = dfa->str_tree_storage; storage; storage = next)
 947     {
 948       next = storage->next;
 949       re_free (storage);
 950     }
 951   dfa->str_tree_storage = NULL;
 952   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 953   dfa->str_tree = NULL;
 954   re_free (dfa->org_indices);
 955   dfa->org_indices = NULL;
 956 }
 957
 958 /* Create initial states for all contexts.  */
 959
 960 static reg_errcode_t
 961 create_initial_state (re_dfa_t *dfa)
 962 {
 963   Idx first, i;
 964   reg_errcode_t err;
 965   re_node_set init_nodes;
 966
 967   /* Initial states have the epsilon closure of the node which is
 968      the first node of the regular expression.  */
 969   first = dfa->str_tree->first->node_idx;
 970   dfa->init_node = first;
 971   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 972   if (__glibc_unlikely (err != REG_NOERROR))
 973     return err;
 974
 975   /* The back-references which are in initial states can epsilon transit,
 976      since in this case all of the subexpressions can be null.
 977      Then we add epsilon closures of the nodes which are the next nodes of
 978      the back-references.  */
 979   if (dfa->nbackref > 0)
 980     for (i = 0; i < init_nodes.nelem; ++i)
 981       {
 982         Idx node_idx = init_nodes.elems[i];
 983         re_token_type_t type = dfa->nodes[node_idx].type;
 984
 985         Idx clexp_idx;
 986         if (type != OP_BACK_REF)
 987           continue;
 988         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 989           {
 990             re_token_t *clexp_node;
 991             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 992             if (clexp_node->type == OP_CLOSE_SUBEXP
 993                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 994               break;
 995           }
 996         if (clexp_idx == init_nodes.nelem)
 997           continue;
 998
 999         if (type == OP_BACK_REF)
1000           {
1001             Idx dest_idx = dfa->edests[node_idx].elems[0];
1002             if (!re_node_set_contains (&init_nodes, dest_idx))
1003               {
1004                 reg_errcode_t merge_err
1005                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006                 if (merge_err != REG_NOERROR)
1007                   return merge_err;
1008                 i = 0;
1009               }
1010           }
1011       }
1012
1013   /* It must be the first time to invoke acquire_state.  */
1014   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015   /* We don't check ERR here, since the initial state must not be NULL.  */
1016   if (__glibc_unlikely (dfa->init_state == NULL))
1017     return err;
1018   if (dfa->init_state->has_constraint)
1019     {
1020       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021                                                        CONTEXT_WORD);
1022       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023                                                      CONTEXT_NEWLINE);
1024       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025                                                          &init_nodes,
1026                                                          CONTEXT_NEWLINE
1027                                                          | CONTEXT_BEGBUF);
1028       if (__glibc_unlikely (dfa->init_state_word == NULL
1029                             || dfa->init_state_nl == NULL
1030                             || dfa->init_state_begbuf == NULL))
1031         return err;
1032     }
1033   else
1034     dfa->init_state_word = dfa->init_state_nl
1035       = dfa->init_state_begbuf = dfa->init_state;
1036
1037   re_node_set_free (&init_nodes);
1038   return REG_NOERROR;
1039 }
1040 \f
1041 /* If it is possible to do searching in single byte encoding instead of UTF-8
1042    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043    DFA nodes where needed.  */
1044
1045 static void
1046 optimize_utf8 (re_dfa_t *dfa)
1047 {
1048   Idx node;
1049   int i;
1050   bool mb_chars = false;
1051   bool has_period = false;
1052
1053   for (node = 0; node < dfa->nodes_len; ++node)
1054     switch (dfa->nodes[node].type)
1055       {
1056       case CHARACTER:
1057         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058           mb_chars = true;
1059         break;
1060       case ANCHOR:
1061         switch (dfa->nodes[node].opr.ctx_type)
1062           {
1063           case LINE_FIRST:
1064           case LINE_LAST:
1065           case BUF_FIRST:
1066           case BUF_LAST:
1067             break;
1068           default:
1069             /* Word anchors etc. cannot be handled.  It's okay to test
1070                opr.ctx_type since constraints (for all DFA nodes) are
1071                created by ORing one or more opr.ctx_type values.  */
1072             return;
1073           }
1074         break;
1075       case OP_PERIOD:
1076         has_period = true;
1077         break;
1078       case OP_BACK_REF:
1079       case OP_ALT:
1080       case END_OF_RE:
1081       case OP_DUP_ASTERISK:
1082       case OP_OPEN_SUBEXP:
1083       case OP_CLOSE_SUBEXP:
1084         break;
1085       case COMPLEX_BRACKET:
1086         return;
1087       case SIMPLE_BRACKET:
1088         /* Just double check.  */
1089         {
1090           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091                         ? 0
1092                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094             {
1095               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096                 return;
1097               rshift = 0;
1098             }
1099         }
1100         break;
1101       default:
1102         abort ();
1103       }
1104
1105   if (mb_chars || has_period)
1106     for (node = 0; node < dfa->nodes_len; ++node)
1107       {
1108         if (dfa->nodes[node].type == CHARACTER
1109             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110           dfa->nodes[node].mb_partial = 0;
1111         else if (dfa->nodes[node].type == OP_PERIOD)
1112           dfa->nodes[node].type = OP_UTF8_PERIOD;
1113       }
1114
1115   /* The search can be in single byte locale.  */
1116   dfa->mb_cur_max = 1;
1117   dfa->is_utf8 = 0;
1118   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 }
1120 \f
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122    "eclosure", and "inveclosure".  */
1123
1124 static reg_errcode_t
1125 analyze (regex_t *preg)
1126 {
1127   re_dfa_t *dfa = preg->buffer;
1128   reg_errcode_t ret;
1129
1130   /* Allocate arrays.  */
1131   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136                         || dfa->edests == NULL || dfa->eclosures == NULL))
1137     return REG_ESPACE;
1138
1139   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140   if (dfa->subexp_map != NULL)
1141     {
1142       Idx i;
1143       for (i = 0; i < preg->re_nsub; i++)
1144         dfa->subexp_map[i] = i;
1145       preorder (dfa->str_tree, optimize_subexps, dfa);
1146       for (i = 0; i < preg->re_nsub; i++)
1147         if (dfa->subexp_map[i] != i)
1148           break;
1149       if (i == preg->re_nsub)
1150         {
1151           re_free (dfa->subexp_map);
1152           dfa->subexp_map = NULL;
1153         }
1154     }
1155
1156   ret = postorder (dfa->str_tree, lower_subexps, preg);
1157   if (__glibc_unlikely (ret != REG_NOERROR))
1158     return ret;
1159   ret = postorder (dfa->str_tree, calc_first, dfa);
1160   if (__glibc_unlikely (ret != REG_NOERROR))
1161     return ret;
1162   preorder (dfa->str_tree, calc_next, dfa);
1163   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164   if (__glibc_unlikely (ret != REG_NOERROR))
1165     return ret;
1166   ret = calc_eclosure (dfa);
1167   if (__glibc_unlikely (ret != REG_NOERROR))
1168     return ret;
1169
1170   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173       || dfa->nbackref)
1174     {
1175       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176       if (__glibc_unlikely (dfa->inveclosures == NULL))
1177         return REG_ESPACE;
1178       ret = calc_inveclosure (dfa);
1179     }
1180
1181   return ret;
1182 }
1183
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185    implement parse tree visits.  Instead, we use parent pointers and
1186    some hairy code in these two functions.  */
1187 static reg_errcode_t
1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189            void *extra)
1190 {
1191   bin_tree_t *node, *prev;
1192
1193   for (node = root; ; )
1194     {
1195       /* Descend down the tree, preferably to the left (or to the right
1196          if that's the only child).  */
1197       while (node->left || node->right)
1198         if (node->left)
1199           node = node->left;
1200         else
1201           node = node->right;
1202
1203       do
1204         {
1205           reg_errcode_t err = fn (extra, node);
1206           if (__glibc_unlikely (err != REG_NOERROR))
1207             return err;
1208           if (node->parent == NULL)
1209             return REG_NOERROR;
1210           prev = node;
1211           node = node->parent;
1212         }
1213       /* Go up while we have a node that is reached from the right.  */
1214       while (node->right == prev || node->right == NULL);
1215       node = node->right;
1216     }
1217 }
1218
1219 static reg_errcode_t
1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221           void *extra)
1222 {
1223   bin_tree_t *node;
1224
1225   for (node = root; ; )
1226     {
1227       reg_errcode_t err = fn (extra, node);
1228       if (__glibc_unlikely (err != REG_NOERROR))
1229         return err;
1230
1231       /* Go to the left node, or up and to the right.  */
1232       if (node->left)
1233         node = node->left;
1234       else
1235         {
1236           bin_tree_t *prev = NULL;
1237           while (node->right == prev || node->right == NULL)
1238             {
1239               prev = node;
1240               node = node->parent;
1241               if (!node)
1242                 return REG_NOERROR;
1243             }
1244           node = node->right;
1245         }
1246     }
1247 }
1248
1249 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1251    backreferences as well.  Requires a preorder visit.  */
1252 static reg_errcode_t
1253 optimize_subexps (void *extra, bin_tree_t *node)
1254 {
1255   re_dfa_t *dfa = (re_dfa_t *) extra;
1256
1257   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258     {
1259       int idx = node->token.opr.idx;
1260       node->token.opr.idx = dfa->subexp_map[idx];
1261       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262     }
1263
1264   else if (node->token.type == SUBEXP
1265            && node->left && node->left->token.type == SUBEXP)
1266     {
1267       Idx other_idx = node->left->token.opr.idx;
1268
1269       node->left = node->left->left;
1270       if (node->left)
1271         node->left->parent = node;
1272
1273       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274       if (other_idx < BITSET_WORD_BITS)
1275         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276     }
1277
1278   return REG_NOERROR;
1279 }
1280
1281 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1283 static reg_errcode_t
1284 lower_subexps (void *extra, bin_tree_t *node)
1285 {
1286   regex_t *preg = (regex_t *) extra;
1287   reg_errcode_t err = REG_NOERROR;
1288
1289   if (node->left && node->left->token.type == SUBEXP)
1290     {
1291       node->left = lower_subexp (&err, preg, node->left);
1292       if (node->left)
1293         node->left->parent = node;
1294     }
1295   if (node->right && node->right->token.type == SUBEXP)
1296     {
1297       node->right = lower_subexp (&err, preg, node->right);
1298       if (node->right)
1299         node->right->parent = node;
1300     }
1301
1302   return err;
1303 }
1304
1305 static bin_tree_t *
1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1307 {
1308   re_dfa_t *dfa = preg->buffer;
1309   bin_tree_t *body = node->left;
1310   bin_tree_t *op, *cls, *tree1, *tree;
1311
1312   if (preg->no_sub
1313       /* We do not optimize empty subexpressions, because otherwise we may
1314          have bad CONCAT nodes with NULL children.  This is obviously not
1315          very common, so we do not lose much.  An example that triggers
1316          this case is the sed "script" /\(\)/x.  */
1317       && node->left != NULL
1318       && (node->token.opr.idx >= BITSET_WORD_BITS
1319           || !(dfa->used_bkref_map
1320                & ((bitset_word_t) 1 << node->token.opr.idx))))
1321     return node->left;
1322
1323   /* Convert the SUBEXP node to the concatenation of an
1324      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1325   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328   tree = create_tree (dfa, op, tree1, CONCAT);
1329   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330                         || op == NULL || cls == NULL))
1331     {
1332       *err = REG_ESPACE;
1333       return NULL;
1334     }
1335
1336   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338   return tree;
1339 }
1340
1341 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342    nodes.  Requires a postorder visit.  */
1343 static reg_errcode_t
1344 calc_first (void *extra, bin_tree_t *node)
1345 {
1346   re_dfa_t *dfa = (re_dfa_t *) extra;
1347   if (node->token.type == CONCAT)
1348     {
1349       node->first = node->left->first;
1350       node->node_idx = node->left->node_idx;
1351     }
1352   else
1353     {
1354       node->first = node;
1355       node->node_idx = re_dfa_add_node (dfa, node->token);
1356       if (__glibc_unlikely (node->node_idx == -1))
1357         return REG_ESPACE;
1358       if (node->token.type == ANCHOR)
1359         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360     }
1361   return REG_NOERROR;
1362 }
1363
1364 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1365 static reg_errcode_t
1366 calc_next (void *extra, bin_tree_t *node)
1367 {
1368   switch (node->token.type)
1369     {
1370     case OP_DUP_ASTERISK:
1371       node->left->next = node;
1372       break;
1373     case CONCAT:
1374       node->left->next = node->right->first;
1375       node->right->next = node->next;
1376       break;
1377     default:
1378       if (node->left)
1379         node->left->next = node->next;
1380       if (node->right)
1381         node->right->next = node->next;
1382       break;
1383     }
1384   return REG_NOERROR;
1385 }
1386
1387 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1388 static reg_errcode_t
1389 link_nfa_nodes (void *extra, bin_tree_t *node)
1390 {
1391   re_dfa_t *dfa = (re_dfa_t *) extra;
1392   Idx idx = node->node_idx;
1393   reg_errcode_t err = REG_NOERROR;
1394
1395   switch (node->token.type)
1396     {
1397     case CONCAT:
1398       break;
1399
1400     case END_OF_RE:
1401       DEBUG_ASSERT (node->next == NULL);
1402       break;
1403
1404     case OP_DUP_ASTERISK:
1405     case OP_ALT:
1406       {
1407         Idx left, right;
1408         dfa->has_plural_match = 1;
1409         if (node->left != NULL)
1410           left = node->left->first->node_idx;
1411         else
1412           left = node->next->node_idx;
1413         if (node->right != NULL)
1414           right = node->right->first->node_idx;
1415         else
1416           right = node->next->node_idx;
1417         DEBUG_ASSERT (left > -1);
1418         DEBUG_ASSERT (right > -1);
1419         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420       }
1421       break;
1422
1423     case ANCHOR:
1424     case OP_OPEN_SUBEXP:
1425     case OP_CLOSE_SUBEXP:
1426       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427       break;
1428
1429     case OP_BACK_REF:
1430       dfa->nexts[idx] = node->next->node_idx;
1431       if (node->token.type == OP_BACK_REF)
1432         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433       break;
1434
1435     default:
1436       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437       dfa->nexts[idx] = node->next->node_idx;
1438       break;
1439     }
1440
1441   return err;
1442 }
1443
/* Duplicate the epsilon closure of the node ROOT_NODE.
   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
   to their own constraint.

   The walk starts at the original node TOP_ORG_NODE and its counterpart
   TOP_CLONE_NODE, and follows the epsilon destinations of the original
   nodes; at each step the edests set of the current clone is emptied
   and refilled with the indices of (mostly duplicated) destinations.
   The first destination of a two-way node is handled by a recursive
   call, the walk itself continues with the second one.

   Return REG_NOERROR on success, REG_ESPACE if a node cannot be
   duplicated or a set insertion fails, or the error of a recursive
   call.  */

static reg_errcode_t
duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
                        Idx root_node, unsigned int init_constraint)
{
  Idx org_node, clone_node;
  bool ok;
  /* Constraint accumulated along the path walked so far.  */
  unsigned int constraint = init_constraint;
  for (org_node = top_org_node, clone_node = top_clone_node;;)
    {
      Idx org_dest, clone_dest;
      if (dfa->nodes[org_node].type == OP_BACK_REF)
        {
          /* If the back reference epsilon-transit, its destination must
             also have the constraint.  Then duplicate the epsilon closure
             of the destination of the back reference, and store it in
             edests of the back reference.  */
          org_dest = dfa->nexts[org_node];
          re_node_set_empty (dfa->edests + clone_node);
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (__glibc_unlikely (clone_dest == -1))
            return REG_ESPACE;
          dfa->nexts[clone_node] = dfa->nexts[org_node];
          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
          if (__glibc_unlikely (! ok))
            return REG_ESPACE;
        }
      else if (dfa->edests[org_node].nelem == 0)
        {
          /* In case of the node can't epsilon-transit, don't duplicate the
             destination and store the original destination as the
             destination of the node.  This ends the walk.  */
          dfa->nexts[clone_node] = dfa->nexts[org_node];
          break;
        }
      else if (dfa->edests[org_node].nelem == 1)
        {
          /* In case of the node can epsilon-transit, and it has only one
             destination.  */
          org_dest = dfa->edests[org_node].elems[0];
          re_node_set_empty (dfa->edests + clone_node);
          /* If the node is root_node itself, it means the epsilon closure
             has a loop.  Then tie it to the destination of the root_node.  */
          if (org_node == root_node && clone_node != org_node)
            {
              ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
              if (__glibc_unlikely (! ok))
                return REG_ESPACE;
              break;
            }
          /* In case the node has another constraint, append it.  */
          constraint |= dfa->nodes[org_node].constraint;
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (__glibc_unlikely (clone_dest == -1))
            return REG_ESPACE;
          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
          if (__glibc_unlikely (! ok))
            return REG_ESPACE;
        }
      else /* dfa->edests[org_node].nelem == 2 */
        {
          /* In case of the node can epsilon-transit, and it has two
             destinations.  In the bin_tree_t and DFA, that's '|' and '*'.  */
          org_dest = dfa->edests[org_node].elems[0];
          re_node_set_empty (dfa->edests + clone_node);
          /* Search for a duplicated node which satisfies the constraint.  */
          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
          if (clone_dest == -1)
            {
              /* There is no such duplicated node, create a new one.  */
              reg_errcode_t err;
              clone_dest = duplicate_node (dfa, org_dest, constraint);
              if (__glibc_unlikely (clone_dest == -1))
                return REG_ESPACE;
              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
              if (__glibc_unlikely (! ok))
                return REG_ESPACE;
              /* Duplicate the closure behind the first destination
                 before going on with the second one.  */
              err = duplicate_node_closure (dfa, org_dest, clone_dest,
                                            root_node, constraint);
              if (__glibc_unlikely (err != REG_NOERROR))
                return err;
            }
          else
            {
              /* There is a duplicated node which satisfies the constraint,
                 use it to avoid infinite loop.  */
              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
              if (__glibc_unlikely (! ok))
                return REG_ESPACE;
            }

          /* The second destination is always duplicated afresh; the
             walk continues from it.  */
          org_dest = dfa->edests[org_node].elems[1];
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (__glibc_unlikely (clone_dest == -1))
            return REG_ESPACE;
          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
          if (__glibc_unlikely (! ok))
            return REG_ESPACE;
        }
      /* Step to the destination just handled and its clone.  */
      org_node = org_dest;
      clone_node = clone_dest;
    }
  return REG_NOERROR;
}
1551
1552 /* Search for a node which is duplicated from the node ORG_NODE, and
1553    satisfies the constraint CONSTRAINT.  */
1554
1555 static Idx
1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557                         unsigned int constraint)
1558 {
1559   Idx idx;
1560   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561     {
1562       if (org_node == dfa->org_indices[idx]
1563           && constraint == dfa->nodes[idx].constraint)
1564         return idx; /* Found.  */
1565     }
1566   return -1; /* Not found.  */
1567 }
1568
1569 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570    Return the index of the new node, or -1 if insufficient storage is
1571    available.  */
1572
1573 static Idx
1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1575 {
1576   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577   if (__glibc_likely (dup_idx != -1))
1578     {
1579       dfa->nodes[dup_idx].constraint = constraint;
1580       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581       dfa->nodes[dup_idx].duplicated = 1;
1582
1583       /* Store the index of the original node.  */
1584       dfa->org_indices[dup_idx] = org_idx;
1585     }
1586   return dup_idx;
1587 }
1588
1589 static reg_errcode_t
1590 calc_inveclosure (re_dfa_t *dfa)
1591 {
1592   Idx src, idx;
1593   bool ok;
1594   for (idx = 0; idx < dfa->nodes_len; ++idx)
1595     re_node_set_init_empty (dfa->inveclosures + idx);
1596
1597   for (src = 0; src < dfa->nodes_len; ++src)
1598     {
1599       Idx *elems = dfa->eclosures[src].elems;
1600       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601         {
1602           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603           if (__glibc_unlikely (! ok))
1604             return REG_ESPACE;
1605         }
1606     }
1607
1608   return REG_NOERROR;
1609 }
1610
1611 /* Calculate "eclosure" for all the node in DFA.  */
1612
1613 static reg_errcode_t
1614 calc_eclosure (re_dfa_t *dfa)
1615 {
1616   Idx node_idx;
1617   bool incomplete;
1618   DEBUG_ASSERT (dfa->nodes_len > 0);
1619   incomplete = false;
1620   /* For each nodes, calculate epsilon closure.  */
1621   for (node_idx = 0; ; ++node_idx)
1622     {
1623       reg_errcode_t err;
1624       re_node_set eclosure_elem;
1625       if (node_idx == dfa->nodes_len)
1626         {
1627           if (!incomplete)
1628             break;
1629           incomplete = false;
1630           node_idx = 0;
1631         }
1632
1633       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634
1635       /* If we have already calculated, skip it.  */
1636       if (dfa->eclosures[node_idx].nelem != 0)
1637         continue;
1638       /* Calculate epsilon closure of 'node_idx'.  */
1639       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640       if (__glibc_unlikely (err != REG_NOERROR))
1641         return err;
1642
1643       if (dfa->eclosures[node_idx].nelem == 0)
1644         {
1645           incomplete = true;
1646           re_node_set_free (&eclosure_elem);
1647         }
1648     }
1649   return REG_NOERROR;
1650 }
1651
1652 /* Calculate epsilon closure of NODE.  */
1653
1654 static reg_errcode_t
1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656 {
1657   reg_errcode_t err;
1658   Idx i;
1659   re_node_set eclosure;
1660   bool incomplete = false;
1661   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662   if (__glibc_unlikely (err != REG_NOERROR))
1663     return err;
1664
1665   /* An epsilon closure includes itself.  */
1666   eclosure.elems[eclosure.nelem++] = node;
1667
1668   /* This indicates that we are calculating this node now.
1669      We reference this value to avoid infinite loop.  */
1670   dfa->eclosures[node].nelem = -1;
1671
1672   /* If the current node has constraints, duplicate all nodes
1673      since they must inherit the constraints.  */
1674   if (dfa->nodes[node].constraint
1675       && dfa->edests[node].nelem
1676       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677     {
1678       err = duplicate_node_closure (dfa, node, node, node,
1679                                     dfa->nodes[node].constraint);
1680       if (__glibc_unlikely (err != REG_NOERROR))
1681         return err;
1682     }
1683
1684   /* Expand each epsilon destination nodes.  */
1685   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686     for (i = 0; i < dfa->edests[node].nelem; ++i)
1687       {
1688         re_node_set eclosure_elem;
1689         Idx edest = dfa->edests[node].elems[i];
1690         /* If calculating the epsilon closure of 'edest' is in progress,
1691            return intermediate result.  */
1692         if (dfa->eclosures[edest].nelem == -1)
1693           {
1694             incomplete = true;
1695             continue;
1696           }
1697         /* If we haven't calculated the epsilon closure of 'edest' yet,
1698            calculate now. Otherwise use calculated epsilon closure.  */
1699         if (dfa->eclosures[edest].nelem == 0)
1700           {
1701             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702             if (__glibc_unlikely (err != REG_NOERROR))
1703               return err;
1704           }
1705         else
1706           eclosure_elem = dfa->eclosures[edest];
1707         /* Merge the epsilon closure of 'edest'.  */
1708         err = re_node_set_merge (&eclosure, &eclosure_elem);
1709         if (__glibc_unlikely (err != REG_NOERROR))
1710           return err;
1711         /* If the epsilon closure of 'edest' is incomplete,
1712            the epsilon closure of this node is also incomplete.  */
1713         if (dfa->eclosures[edest].nelem == 0)
1714           {
1715             incomplete = true;
1716             re_node_set_free (&eclosure_elem);
1717           }
1718       }
1719
1720   if (incomplete && !root)
1721     dfa->eclosures[node].nelem = 0;
1722   else
1723     dfa->eclosures[node] = eclosure;
1724   *new_set = eclosure;
1725   return REG_NOERROR;
1726 }
1727 \f
1728 /* Functions for token which are used in the parser.  */
1729
1730 /* Fetch a token from INPUT.
1731    We must not use this function inside bracket expressions.  */
1732
1733 static void
1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735 {
1736   re_string_skip_bytes (input, peek_token (result, input, syntax));
1737 }
1738
1739 /* Peek a token from INPUT, and return the length of the token.
1740    We must not use this function inside bracket expressions.  */
1741
1742 static int
1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1744 {
1745   unsigned char c;
1746
1747   if (re_string_eoi (input))
1748     {
1749       token->type = END_OF_RE;
1750       return 0;
1751     }
1752
1753   c = re_string_peek_byte (input, 0);
1754   token->opr.c = c;
1755
1756   token->word_char = 0;
1757   token->mb_partial = 0;
1758   if (input->mb_cur_max > 1
1759       && !re_string_first_byte (input, re_string_cur_idx (input)))
1760     {
1761       token->type = CHARACTER;
1762       token->mb_partial = 1;
1763       return 1;
1764     }
1765   if (c == '\\')
1766     {
1767       unsigned char c2;
1768       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769         {
1770           token->type = BACK_SLASH;
1771           return 1;
1772         }
1773
1774       c2 = re_string_peek_byte_case (input, 1);
1775       token->opr.c = c2;
1776       token->type = CHARACTER;
1777       if (input->mb_cur_max > 1)
1778         {
1779           wint_t wc = re_string_wchar_at (input,
1780                                           re_string_cur_idx (input) + 1);
1781           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782         }
1783       else
1784         token->word_char = IS_WORD_CHAR (c2) != 0;
1785
1786       switch (c2)
1787         {
1788         case '|':
1789           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790             token->type = OP_ALT;
1791           break;
1792         case '1': case '2': case '3': case '4': case '5':
1793         case '6': case '7': case '8': case '9':
1794           if (!(syntax & RE_NO_BK_REFS))
1795             {
1796               token->type = OP_BACK_REF;
1797               token->opr.idx = c2 - '1';
1798             }
1799           break;
1800         case '<':
1801           if (!(syntax & RE_NO_GNU_OPS))
1802             {
1803               token->type = ANCHOR;
1804               token->opr.ctx_type = WORD_FIRST;
1805             }
1806           break;
1807         case '>':
1808           if (!(syntax & RE_NO_GNU_OPS))
1809             {
1810               token->type = ANCHOR;
1811               token->opr.ctx_type = WORD_LAST;
1812             }
1813           break;
1814         case 'b':
1815           if (!(syntax & RE_NO_GNU_OPS))
1816             {
1817               token->type = ANCHOR;
1818               token->opr.ctx_type = WORD_DELIM;
1819             }
1820           break;
1821         case 'B':
1822           if (!(syntax & RE_NO_GNU_OPS))
1823             {
1824               token->type = ANCHOR;
1825               token->opr.ctx_type = NOT_WORD_DELIM;
1826             }
1827           break;
1828         case 'w':
1829           if (!(syntax & RE_NO_GNU_OPS))
1830             token->type = OP_WORD;
1831           break;
1832         case 'W':
1833           if (!(syntax & RE_NO_GNU_OPS))
1834             token->type = OP_NOTWORD;
1835           break;
1836         case 's':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             token->type = OP_SPACE;
1839           break;
1840         case 'S':
1841           if (!(syntax & RE_NO_GNU_OPS))
1842             token->type = OP_NOTSPACE;
1843           break;
1844         case '`':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = BUF_FIRST;
1849             }
1850           break;
1851         case '\'':
1852           if (!(syntax & RE_NO_GNU_OPS))
1853             {
1854               token->type = ANCHOR;
1855               token->opr.ctx_type = BUF_LAST;
1856             }
1857           break;
1858         case '(':
1859           if (!(syntax & RE_NO_BK_PARENS))
1860             token->type = OP_OPEN_SUBEXP;
1861           break;
1862         case ')':
1863           if (!(syntax & RE_NO_BK_PARENS))
1864             token->type = OP_CLOSE_SUBEXP;
1865           break;
1866         case '+':
1867           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868             token->type = OP_DUP_PLUS;
1869           break;
1870         case '?':
1871           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872             token->type = OP_DUP_QUESTION;
1873           break;
1874         case '{':
1875           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876             token->type = OP_OPEN_DUP_NUM;
1877           break;
1878         case '}':
1879           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880             token->type = OP_CLOSE_DUP_NUM;
1881           break;
1882         default:
1883           break;
1884         }
1885       return 2;
1886     }
1887
1888   token->type = CHARACTER;
1889   if (input->mb_cur_max > 1)
1890     {
1891       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893     }
1894   else
1895     token->word_char = IS_WORD_CHAR (token->opr.c);
1896
1897   switch (c)
1898     {
1899     case '\n':
1900       if (syntax & RE_NEWLINE_ALT)
1901         token->type = OP_ALT;
1902       break;
1903     case '|':
1904       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905         token->type = OP_ALT;
1906       break;
1907     case '*':
1908       token->type = OP_DUP_ASTERISK;
1909       break;
1910     case '+':
1911       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912         token->type = OP_DUP_PLUS;
1913       break;
1914     case '?':
1915       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916         token->type = OP_DUP_QUESTION;
1917       break;
1918     case '{':
1919       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920         token->type = OP_OPEN_DUP_NUM;
1921       break;
1922     case '}':
1923       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924         token->type = OP_CLOSE_DUP_NUM;
1925       break;
1926     case '(':
1927       if (syntax & RE_NO_BK_PARENS)
1928         token->type = OP_OPEN_SUBEXP;
1929       break;
1930     case ')':
1931       if (syntax & RE_NO_BK_PARENS)
1932         token->type = OP_CLOSE_SUBEXP;
1933       break;
1934     case '[':
1935       token->type = OP_OPEN_BRACKET;
1936       break;
1937     case '.':
1938       token->type = OP_PERIOD;
1939       break;
1940     case '^':
1941       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942           && re_string_cur_idx (input) != 0)
1943         {
1944           char prev = re_string_peek_byte (input, -1);
1945           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946             break;
1947         }
1948       token->type = ANCHOR;
1949       token->opr.ctx_type = LINE_FIRST;
1950       break;
1951     case '$':
1952       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953           && re_string_cur_idx (input) + 1 != re_string_length (input))
1954         {
1955           re_token_t next;
1956           re_string_skip_bytes (input, 1);
1957           peek_token (&next, input, syntax);
1958           re_string_skip_bytes (input, -1);
1959           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960             break;
1961         }
1962       token->type = ANCHOR;
1963       token->opr.ctx_type = LINE_LAST;
1964       break;
1965     default:
1966       break;
1967     }
1968   return 1;
1969 }
1970
1971 /* Peek a token from INPUT, and return the length of the token.
1972    We must not use this function out of bracket expressions.  */
1973
1974 static int
1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1976 {
1977   unsigned char c;
1978   if (re_string_eoi (input))
1979     {
1980       token->type = END_OF_RE;
1981       return 0;
1982     }
1983   c = re_string_peek_byte (input, 0);
1984   token->opr.c = c;
1985
1986   if (input->mb_cur_max > 1
1987       && !re_string_first_byte (input, re_string_cur_idx (input)))
1988     {
1989       token->type = CHARACTER;
1990       return 1;
1991     }
1992
1993   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994       && re_string_cur_idx (input) + 1 < re_string_length (input))
1995     {
1996       /* In this case, '\' escape a character.  */
1997       unsigned char c2;
1998       re_string_skip_bytes (input, 1);
1999       c2 = re_string_peek_byte (input, 0);
2000       token->opr.c = c2;
2001       token->type = CHARACTER;
2002       return 1;
2003     }
2004   if (c == '[') /* '[' is a special char in a bracket exps.  */
2005     {
2006       unsigned char c2;
2007       int token_len;
2008       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009         c2 = re_string_peek_byte (input, 1);
2010       else
2011         c2 = 0;
2012       token->opr.c = c2;
2013       token_len = 2;
2014       switch (c2)
2015         {
2016         case '.':
2017           token->type = OP_OPEN_COLL_ELEM;
2018           break;
2019
2020         case '=':
2021           token->type = OP_OPEN_EQUIV_CLASS;
2022           break;
2023
2024         case ':':
2025           if (syntax & RE_CHAR_CLASSES)
2026             {
2027               token->type = OP_OPEN_CHAR_CLASS;
2028               break;
2029             }
2030           FALLTHROUGH;
2031         default:
2032           token->type = CHARACTER;
2033           token->opr.c = c;
2034           token_len = 1;
2035           break;
2036         }
2037       return token_len;
2038     }
2039   switch (c)
2040     {
2041     case ']':
2042       token->type = OP_CLOSE_BRACKET;
2043       break;
2044     case '^':
2045       token->type = OP_NON_MATCH_LIST;
2046       break;
2047     case '-':
2048       /* In V7 Unix grep and Unix awk and mawk, [...---...]
2049          (3 adjacent minus signs) stands for a single minus sign.
2050          Support that without breaking anything else.  */
2051       if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
2052              && re_string_peek_byte (input, 1) == '-'
2053              && re_string_peek_byte (input, 2) == '-'))
2054         {
2055           token->type = OP_CHARSET_RANGE;
2056           break;
2057         }
2058       re_string_skip_bytes (input, 2);
2059       FALLTHROUGH;
2060     default:
2061       token->type = CHARACTER;
2062     }
2063   return 1;
2064 }
2065 \f
2066 /* Functions for parser.  */
2067
2068 /* Entry point of the parser.
2069    Parse the regular expression REGEXP and return the structure tree.
2070    If an error occurs, ERR is set by error code, and return NULL.
2071    This function build the following tree, from regular expression <reg_exp>:
2072            CAT
2073            / \
2074           /   \
2075    <reg_exp>  EOR
2076
2077    CAT means concatenation.
2078    EOR means end of regular expression.  */
2079
2080 static bin_tree_t *
2081 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2082        reg_errcode_t *err)
2083 {
2084   re_dfa_t *dfa = preg->buffer;
2085   bin_tree_t *tree, *eor, *root;
2086   re_token_t current_token;
2087   dfa->syntax = syntax;
2088   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2089   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2090   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2091     return NULL;
2092   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2093   if (tree != NULL)
2094     root = create_tree (dfa, tree, eor, CONCAT);
2095   else
2096     root = eor;
2097   if (__glibc_unlikely (eor == NULL || root == NULL))
2098     {
2099       *err = REG_ESPACE;
2100       return NULL;
2101     }
2102   return root;
2103 }
2104
2105 /* This function build the following tree, from regular expression
2106    <branch1>|<branch2>:
2107            ALT
2108            / \
2109           /   \
2110    <branch1> <branch2>
2111
2112    ALT means alternative, which represents the operator '|'.  */
2113
2114 static bin_tree_t *
2115 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2116                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2117 {
2118   re_dfa_t *dfa = preg->buffer;
2119   bin_tree_t *tree, *branch = NULL;
2120   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2121   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2122   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2123     return NULL;
2124
2125   while (token->type == OP_ALT)
2126     {
2127       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2128       if (token->type != OP_ALT && token->type != END_OF_RE
2129           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130         {
2131           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2132           dfa->completed_bkref_map = initial_bkref_map;
2133           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2134           if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2135             {
2136               if (tree != NULL)
2137                 postorder (tree, free_tree, NULL);
2138               return NULL;
2139             }
2140           dfa->completed_bkref_map |= accumulated_bkref_map;
2141         }
2142       else
2143         branch = NULL;
2144       tree = create_tree (dfa, tree, branch, OP_ALT);
2145       if (__glibc_unlikely (tree == NULL))
2146         {
2147           *err = REG_ESPACE;
2148           return NULL;
2149         }
2150     }
2151   return tree;
2152 }
2153
2154 /* This function build the following tree, from regular expression
2155    <exp1><exp2>:
2156         CAT
2157         / \
2158        /   \
2159    <exp1> <exp2>
2160
2161    CAT means concatenation.  */
2162
2163 static bin_tree_t *
2164 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2165               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2166 {
2167   bin_tree_t *tree, *expr;
2168   re_dfa_t *dfa = preg->buffer;
2169   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2170   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2171     return NULL;
2172
2173   while (token->type != OP_ALT && token->type != END_OF_RE
2174          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2175     {
2176       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2177       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2178         {
2179           if (tree != NULL)
2180             postorder (tree, free_tree, NULL);
2181           return NULL;
2182         }
2183       if (tree != NULL && expr != NULL)
2184         {
2185           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2186           if (newtree == NULL)
2187             {
2188               postorder (expr, free_tree, NULL);
2189               postorder (tree, free_tree, NULL);
2190               *err = REG_ESPACE;
2191               return NULL;
2192             }
2193           tree = newtree;
2194         }
2195       else if (tree == NULL)
2196         tree = expr;
2197       /* Otherwise expr == NULL, we don't need to create new tree.  */
2198     }
2199   return tree;
2200 }
2201
2202 /* This function build the following tree, from regular expression a*:
2203          *
2204          |
2205          a
2206 */
2207
2208 static bin_tree_t *
2209 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2210                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2211 {
2212   re_dfa_t *dfa = preg->buffer;
2213   bin_tree_t *tree;
2214   switch (token->type)
2215     {
2216     case CHARACTER:
2217       tree = create_token_tree (dfa, NULL, NULL, token);
2218       if (__glibc_unlikely (tree == NULL))
2219         {
2220           *err = REG_ESPACE;
2221           return NULL;
2222         }
2223       if (dfa->mb_cur_max > 1)
2224         {
2225           while (!re_string_eoi (regexp)
2226                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2227             {
2228               bin_tree_t *mbc_remain;
2229               fetch_token (token, regexp, syntax);
2230               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2231               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2232               if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2233                 {
2234                   *err = REG_ESPACE;
2235                   return NULL;
2236                 }
2237             }
2238         }
2239       break;
2240
2241     case OP_OPEN_SUBEXP:
2242       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2243       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2244         return NULL;
2245       break;
2246
2247     case OP_OPEN_BRACKET:
2248       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2249       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2250         return NULL;
2251       break;
2252
2253     case OP_BACK_REF:
2254       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2255         {
2256           *err = REG_ESUBREG;
2257           return NULL;
2258         }
2259       dfa->used_bkref_map |= 1 << token->opr.idx;
2260       tree = create_token_tree (dfa, NULL, NULL, token);
2261       if (__glibc_unlikely (tree == NULL))
2262         {
2263           *err = REG_ESPACE;
2264           return NULL;
2265         }
2266       ++dfa->nbackref;
2267       dfa->has_mb_node = 1;
2268       break;
2269
2270     case OP_OPEN_DUP_NUM:
2271       if (syntax & RE_CONTEXT_INVALID_DUP)
2272         {
2273           *err = REG_BADRPT;
2274           return NULL;
2275         }
2276       FALLTHROUGH;
2277     case OP_DUP_ASTERISK:
2278     case OP_DUP_PLUS:
2279     case OP_DUP_QUESTION:
2280       if (syntax & RE_CONTEXT_INVALID_OPS)
2281         {
2282           *err = REG_BADRPT;
2283           return NULL;
2284         }
2285       else if (syntax & RE_CONTEXT_INDEP_OPS)
2286         {
2287           fetch_token (token, regexp, syntax);
2288           return parse_expression (regexp, preg, token, syntax, nest, err);
2289         }
2290       FALLTHROUGH;
2291     case OP_CLOSE_SUBEXP:
2292       if ((token->type == OP_CLOSE_SUBEXP)
2293           && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2294         {
2295           *err = REG_ERPAREN;
2296           return NULL;
2297         }
2298       FALLTHROUGH;
2299     case OP_CLOSE_DUP_NUM:
2300       /* We treat it as a normal character.  */
2301
2302       /* Then we can these characters as normal characters.  */
2303       token->type = CHARACTER;
2304       /* mb_partial and word_char bits should be initialized already
2305          by peek_token.  */
2306       tree = create_token_tree (dfa, NULL, NULL, token);
2307       if (__glibc_unlikely (tree == NULL))
2308         {
2309           *err = REG_ESPACE;
2310           return NULL;
2311         }
2312       break;
2313
2314     case ANCHOR:
2315       if ((token->opr.ctx_type
2316            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2317           && dfa->word_ops_used == 0)
2318         init_word_char (dfa);
2319       if (token->opr.ctx_type == WORD_DELIM
2320           || token->opr.ctx_type == NOT_WORD_DELIM)
2321         {
2322           bin_tree_t *tree_first, *tree_last;
2323           if (token->opr.ctx_type == WORD_DELIM)
2324             {
2325               token->opr.ctx_type = WORD_FIRST;
2326               tree_first = create_token_tree (dfa, NULL, NULL, token);
2327               token->opr.ctx_type = WORD_LAST;
2328             }
2329           else
2330             {
2331               token->opr.ctx_type = INSIDE_WORD;
2332               tree_first = create_token_tree (dfa, NULL, NULL, token);
2333               token->opr.ctx_type = INSIDE_NOTWORD;
2334             }
2335           tree_last = create_token_tree (dfa, NULL, NULL, token);
2336           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2337           if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2338                                 || tree == NULL))
2339             {
2340               *err = REG_ESPACE;
2341               return NULL;
2342             }
2343         }
2344       else
2345         {
2346           tree = create_token_tree (dfa, NULL, NULL, token);
2347           if (__glibc_unlikely (tree == NULL))
2348             {
2349               *err = REG_ESPACE;
2350               return NULL;
2351             }
2352         }
2353       /* We must return here, since ANCHORs can't be followed
2354          by repetition operators.
2355          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2357       fetch_token (token, regexp, syntax);
2358       return tree;
2359
2360     case OP_PERIOD:
2361       tree = create_token_tree (dfa, NULL, NULL, token);
2362       if (__glibc_unlikely (tree == NULL))
2363         {
2364           *err = REG_ESPACE;
2365           return NULL;
2366         }
2367       if (dfa->mb_cur_max > 1)
2368         dfa->has_mb_node = 1;
2369       break;
2370
2371     case OP_WORD:
2372     case OP_NOTWORD:
2373       tree = build_charclass_op (dfa, regexp->trans,
2374                                  "alnum",
2375                                  "_",
2376                                  token->type == OP_NOTWORD, err);
2377       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378         return NULL;
2379       break;
2380
2381     case OP_SPACE:
2382     case OP_NOTSPACE:
2383       tree = build_charclass_op (dfa, regexp->trans,
2384                                  "space",
2385                                  "",
2386                                  token->type == OP_NOTSPACE, err);
2387       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2388         return NULL;
2389       break;
2390
2391     case OP_ALT:
2392     case END_OF_RE:
2393       return NULL;
2394
2395     case BACK_SLASH:
2396       *err = REG_EESCAPE;
2397       return NULL;
2398
2399     default:
2400       /* Must not happen?  */
2401       DEBUG_ASSERT (false);
2402       return NULL;
2403     }
2404   fetch_token (token, regexp, syntax);
2405
2406   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408     {
2409       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2410                                            syntax, err);
2411       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2412         {
2413           if (tree != NULL)
2414             postorder (tree, free_tree, NULL);
2415           return NULL;
2416         }
2417       tree = dup_tree;
2418       /* In BRE consecutive duplications are not allowed.  */
2419       if ((syntax & RE_CONTEXT_INVALID_DUP)
2420           && (token->type == OP_DUP_ASTERISK
2421               || token->type == OP_OPEN_DUP_NUM))
2422         {
2423           if (tree != NULL)
2424             postorder (tree, free_tree, NULL);
2425           *err = REG_BADRPT;
2426           return NULL;
2427         }
2428     }
2429
2430   return tree;
2431 }
2432
2433 /* This function build the following tree, from regular expression
2434    (<reg_exp>):
2435          SUBEXP
2436             |
2437         <reg_exp>
2438 */
2439
2440 static bin_tree_t *
2441 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2442                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2443 {
2444   re_dfa_t *dfa = preg->buffer;
2445   bin_tree_t *tree;
2446   size_t cur_nsub;
2447   cur_nsub = preg->re_nsub++;
2448
2449   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450
2451   /* The subexpression may be a null string.  */
2452   if (token->type == OP_CLOSE_SUBEXP)
2453     tree = NULL;
2454   else
2455     {
2456       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2457       if (__glibc_unlikely (*err == REG_NOERROR
2458                             && token->type != OP_CLOSE_SUBEXP))
2459         {
2460           if (tree != NULL)
2461             postorder (tree, free_tree, NULL);
2462           *err = REG_EPAREN;
2463         }
2464       if (__glibc_unlikely (*err != REG_NOERROR))
2465         return NULL;
2466     }
2467
2468   if (cur_nsub <= '9' - '1')
2469     dfa->completed_bkref_map |= 1 << cur_nsub;
2470
2471   tree = create_tree (dfa, tree, NULL, SUBEXP);
2472   if (__glibc_unlikely (tree == NULL))
2473     {
2474       *err = REG_ESPACE;
2475       return NULL;
2476     }
2477   tree->token.opr.idx = cur_nsub;
2478   return tree;
2479 }
2480
2481 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2482
2483 static bin_tree_t *
2484 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2486 {
2487   bin_tree_t *tree = NULL, *old_tree = NULL;
2488   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2489   re_token_t start_token = *token;
2490
2491   if (token->type == OP_OPEN_DUP_NUM)
2492     {
2493       end = 0;
2494       start = fetch_number (regexp, token, syntax);
2495       if (start == -1)
2496         {
2497           if (token->type == CHARACTER && token->opr.c == ',')
2498             start = 0; /* We treat "{,m}" as "{0,m}".  */
2499           else
2500             {
2501               *err = REG_BADBR; /* <re>{} is invalid.  */
2502               return NULL;
2503             }
2504         }
2505       if (__glibc_likely (start != -2))
2506         {
2507           /* We treat "{n}" as "{n,n}".  */
2508           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509                  : ((token->type == CHARACTER && token->opr.c == ',')
2510                     ? fetch_number (regexp, token, syntax) : -2));
2511         }
2512       if (__glibc_unlikely (start == -2 || end == -2))
2513         {
2514           /* Invalid sequence.  */
2515           if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2516             {
2517               if (token->type == END_OF_RE)
2518                 *err = REG_EBRACE;
2519               else
2520                 *err = REG_BADBR;
2521
2522               return NULL;
2523             }
2524
2525           /* If the syntax bit is set, rollback.  */
2526           re_string_set_index (regexp, start_idx);
2527           *token = start_token;
2528           token->type = CHARACTER;
2529           /* mb_partial and word_char bits should be already initialized by
2530              peek_token.  */
2531           return elem;
2532         }
2533
2534       if (__glibc_unlikely ((end != -1 && start > end)
2535                             || token->type != OP_CLOSE_DUP_NUM))
2536         {
2537           /* First number greater than second.  */
2538           *err = REG_BADBR;
2539           return NULL;
2540         }
2541
2542       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2543         {
2544           *err = REG_ESIZE;
2545           return NULL;
2546         }
2547     }
2548   else
2549     {
2550       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552     }
2553
2554   fetch_token (token, regexp, syntax);
2555
2556   if (__glibc_unlikely (elem == NULL))
2557     return NULL;
2558   if (__glibc_unlikely (start == 0 && end == 0))
2559     {
2560       postorder (elem, free_tree, NULL);
2561       return NULL;
2562     }
2563
2564   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2565   if (__glibc_unlikely (start > 0))
2566     {
2567       tree = elem;
2568       for (i = 2; i <= start; ++i)
2569         {
2570           elem = duplicate_tree (elem, dfa);
2571           tree = create_tree (dfa, tree, elem, CONCAT);
2572           if (__glibc_unlikely (elem == NULL || tree == NULL))
2573             goto parse_dup_op_espace;
2574         }
2575
2576       if (start == end)
2577         return tree;
2578
2579       /* Duplicate ELEM before it is marked optional.  */
2580       elem = duplicate_tree (elem, dfa);
2581       if (__glibc_unlikely (elem == NULL))
2582         goto parse_dup_op_espace;
2583       old_tree = tree;
2584     }
2585   else
2586     old_tree = NULL;
2587
2588   if (elem->token.type == SUBEXP)
2589     {
2590       uintptr_t subidx = elem->token.opr.idx;
2591       postorder (elem, mark_opt_subexp, (void *) subidx);
2592     }
2593
2594   tree = create_tree (dfa, elem, NULL,
2595                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2596   if (__glibc_unlikely (tree == NULL))
2597     goto parse_dup_op_espace;
2598
2599   /* This loop is actually executed only when end != -1,
2600      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2601      already created the start+1-th copy.  */
2602   if (TYPE_SIGNED (Idx) || end != -1)
2603     for (i = start + 2; i <= end; ++i)
2604       {
2605         elem = duplicate_tree (elem, dfa);
2606         tree = create_tree (dfa, tree, elem, CONCAT);
2607         if (__glibc_unlikely (elem == NULL || tree == NULL))
2608           goto parse_dup_op_espace;
2609
2610         tree = create_tree (dfa, tree, NULL, OP_ALT);
2611         if (__glibc_unlikely (tree == NULL))
2612           goto parse_dup_op_espace;
2613       }
2614
2615   if (old_tree)
2616     tree = create_tree (dfa, old_tree, tree, CONCAT);
2617
2618   return tree;
2619
2620  parse_dup_op_espace:
2621   *err = REG_ESPACE;
2622   return NULL;
2623 }
2624
/* Size of the buffer that holds the name of a collating symbol,
   equivalence class, or character class, including the terminating
   NUL.  The value is a guess that is believed to be large enough.  */
2627 #define BRACKET_NAME_BUF_SIZE 32
2628
2629 #ifndef _LIBC
2630
2631 /* Convert the byte B to the corresponding wide character.  In a
2632    unibyte locale, treat B as itself.  In a multibyte locale, return
2633    WEOF if B is an encoding error.  */
2634 static wint_t
2635 parse_byte (unsigned char b, re_dfa_t const *dfa)
2636 {
2637   return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2638 }
2639
2640 /* Local function for parse_bracket_exp used in _LIBC environment.
2641    Build the range expression which starts from START_ELEM, and ends
2642    at END_ELEM.  The result are written to MBCSET and SBCSET.
2643    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2644    mbcset->range_ends, is a pointer argument since we may
2645    update it.  */
2646
2647 static reg_errcode_t
2648 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2649                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2650                  re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2651                  const unsigned char *collseqmb, const char *collseqwc,
2652                  int_fast32_t table_size, const void *symb_table,
2653                  const unsigned char *extra)
2654 {
2655   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2656   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2657                         || start_elem->type == CHAR_CLASS
2658                         || end_elem->type == EQUIV_CLASS
2659                         || end_elem->type == CHAR_CLASS))
2660     return REG_ERANGE;
2661
2662   /* We can handle no multi character collating elements without libc
2663      support.  */
2664   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2665                          && strlen ((char *) start_elem->opr.name) > 1)
2666                         || (end_elem->type == COLL_SYM
2667                             && strlen ((char *) end_elem->opr.name) > 1)))
2668     return REG_ECOLLATE;
2669
2670   unsigned int
2671     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2672                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673                    : 0)),
2674     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2675               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676                  : 0));
2677   wint_t
2678     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679                 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2680     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681               ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2682
2683   if (start_wc == WEOF || end_wc == WEOF)
2684     return REG_ECOLLATE;
2685   else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2686                              && start_wc > end_wc))
2687     return REG_ERANGE;
2688
2689   /* Got valid collation sequence values, add them as a new entry.
2690      However, for !_LIBC we have no collation elements: if the
2691      character set is single byte, the single byte character set
2692      that we build below suffices.  parse_bracket_exp passes
2693      no MBCSET if dfa->mb_cur_max == 1.  */
2694   if (dfa->mb_cur_max > 1)
2695     {
2696       /* Check the space of the arrays.  */
2697       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2698         {
2699           /* There is not enough space, need realloc.  */
2700           wchar_t *new_array_start, *new_array_end;
2701           Idx new_nranges;
2702
2703           /* +1 in case of mbcset->nranges is 0.  */
2704           new_nranges = 2 * mbcset->nranges + 1;
2705           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706              are NULL if *range_alloc == 0.  */
2707           new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708                                         new_nranges);
2709           new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710                                       new_nranges);
2711
2712           if (__glibc_unlikely (new_array_start == NULL
2713                                 || new_array_end == NULL))
2714             {
2715               re_free (new_array_start);
2716               re_free (new_array_end);
2717               return REG_ESPACE;
2718             }
2719
2720           mbcset->range_starts = new_array_start;
2721           mbcset->range_ends = new_array_end;
2722           *range_alloc = new_nranges;
2723         }
2724
2725       mbcset->range_starts[mbcset->nranges] = start_wc;
2726       mbcset->range_ends[mbcset->nranges++] = end_wc;
2727     }
2728
2729   /* Build the table for single byte characters.  */
2730   for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2731     {
2732       if (start_wc <= wc && wc <= end_wc)
2733         bitset_set (sbcset, wc);
2734     }
2735
2736   return REG_NOERROR;
2737 }
2738 #endif /* not _LIBC */
2739
2740 #ifndef _LIBC
2741 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2742    Build the collating element which is represented by NAME.
2743    The result are written to MBCSET and SBCSET.
2744    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745    pointer argument since we may update it.  */
2746
2747 static reg_errcode_t
2748 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2749                         Idx *coll_sym_alloc, const unsigned char *name,
2750                         uint_fast32_t nrules, int_fast32_t table_size,
2751                         const void *symb_table, const unsigned char *extra)
2752 {
2753   size_t name_len = strlen ((const char *) name);
2754   if (__glibc_unlikely (name_len != 1))
2755     return REG_ECOLLATE;
2756   else
2757     {
2758       bitset_set (sbcset, name[0]);
2759       return REG_NOERROR;
2760     }
2761 }
2762 #endif /* not _LIBC */
2763
2764 #ifdef _LIBC
/* Local function for parse_bracket_exp used in _LIBC environment.
   Scan the TABLE_SIZE slots of SYMB_TABLE for the collating symbol
   whose stored byte string equals the NAME_LEN bytes at NAME.  Each
   slot is a pair of int32_t values: a zero first value marks an unused
   slot, and the second value is an offset into EXTRA.
   Return the slot number of the match, or -1 if there is none.  */

static __always_inline int32_t
seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
                             const int32_t *symb_table,
                             int_fast32_t table_size,
                             const unsigned char *extra)
{
  for (int_fast32_t slot = 0; slot < table_size; ++slot)
    {
      const int32_t *entry = &symb_table[2 * slot];

      /* Unused slot.  */
      if (entry[0] == 0)
        continue;

      /* The record in EXTRA starts with a length-prefixed element
         name; step over it to reach the length-prefixed string that
         is compared against NAME.  */
      int32_t pos = entry[1];
      pos += 1 + extra[pos];

      const unsigned char *candidate = &extra[pos];
      if (name_len != candidate[0])
        continue;
      if (memcmp (name, candidate + 1, name_len) == 0)
        return slot;
    }

  return -1;
}
2793
2794 /* Local function for parse_bracket_exp used in _LIBC environment.
2795    Look up the collation sequence value of BR_ELEM.
2796    Return the value if succeeded, UINT_MAX otherwise.  */
2797
2798 static __always_inline unsigned int
2799 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2800                                  const unsigned char *collseqmb,
2801                                  const char *collseqwc,
2802                                  int_fast32_t table_size,
2803                                  const int32_t *symb_table,
2804                                  const unsigned char *extra)
2805 {
2806   if (br_elem->type == SB_CHAR)
2807     {
2808       /* if (MB_CUR_MAX == 1) */
2809       if (nrules == 0)
2810         return collseqmb[br_elem->opr.ch];
2811       else
2812         {
2813           wint_t wc = __btowc (br_elem->opr.ch);
2814           return __collseq_table_lookup (collseqwc, wc);
2815         }
2816     }
2817   else if (br_elem->type == MB_CHAR)
2818     {
2819       if (nrules != 0)
2820         return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2821     }
2822   else if (br_elem->type == COLL_SYM)
2823     {
2824       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2825       if (nrules != 0)
2826         {
2827           int32_t elem, idx;
2828           elem = seek_collating_symbol_entry (br_elem->opr.name,
2829                                               sym_name_len,
2830                                               symb_table, table_size,
2831                                               extra);
2832           if (elem != -1)
2833             {
2834               /* We found the entry.  */
2835               idx = symb_table[2 * elem + 1];
2836               /* Skip the name of collating element name.  */
2837               idx += 1 + extra[idx];
2838               /* Skip the byte sequence of the collating element.  */
2839               idx += 1 + extra[idx];
2840               /* Adjust for the alignment.  */
2841               idx = (idx + 3) & ~3;
2842               /* Skip the multibyte collation sequence value.  */
2843               idx += sizeof (unsigned int);
2844               /* Skip the wide char sequence of the collating element.  */
2845               idx += sizeof (unsigned int) *
2846                 (1 + *(unsigned int *) (extra + idx));
2847               /* Return the collation sequence value.  */
2848               return *(unsigned int *) (extra + idx);
2849             }
2850           else if (sym_name_len == 1)
2851             {
2852               /* No valid character.  Match it as a single byte
2853                  character.  */
2854               return collseqmb[br_elem->opr.name[0]];
2855             }
2856         }
2857       else if (sym_name_len == 1)
2858         return collseqmb[br_elem->opr.name[0]];
2859     }
2860   return UINT_MAX;
2861 }
2862
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864    Build the range expression which starts from START_ELEM, and ends
2865    at END_ELEM.  The result are written to MBCSET and SBCSET.
2866    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2867    mbcset->range_ends, is a pointer argument since we may
2868    update it.  */
2869
2870 static __always_inline reg_errcode_t
2871 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2872                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2873                  re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2874                  const unsigned char *collseqmb, const char *collseqwc,
2875                  int_fast32_t table_size, const int32_t *symb_table,
2876                  const unsigned char *extra)
2877 {
2878   unsigned int ch;
2879   uint32_t start_collseq;
2880   uint32_t end_collseq;
2881
2882   /* Equivalence Classes and Character Classes can't be a range
2883      start/end.  */
2884   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2885                         || start_elem->type == CHAR_CLASS
2886                         || end_elem->type == EQUIV_CLASS
2887                         || end_elem->type == CHAR_CLASS))
2888     return REG_ERANGE;
2889
2890   /* FIXME: Implement rational ranges here, too.  */
2891   start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2892                                                    table_size, symb_table, extra);
2893   end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2894                                                  table_size, symb_table, extra);
2895   /* Check start/end collation sequence values.  */
2896   if (__glibc_unlikely (start_collseq == UINT_MAX
2897                         || end_collseq == UINT_MAX))
2898     return REG_ECOLLATE;
2899   if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2900                         && start_collseq > end_collseq))
2901     return REG_ERANGE;
2902
2903   /* Got valid collation sequence values, add them as a new entry.
2904      However, if we have no collation elements, and the character set
2905      is single byte, the single byte character set that we
2906      build below suffices. */
2907   if (nrules > 0 || dfa->mb_cur_max > 1)
2908     {
2909       /* Check the space of the arrays.  */
2910       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2911         {
2912           /* There is not enough space, need realloc.  */
2913           uint32_t *new_array_start;
2914           uint32_t *new_array_end;
2915           int new_nranges;
2916
2917           /* +1 in case of mbcset->nranges is 0.  */
2918           new_nranges = 2 * mbcset->nranges + 1;
2919           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920                                         new_nranges);
2921           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922                                       new_nranges);
2923
2924           if (__glibc_unlikely (new_array_start == NULL
2925                                 || new_array_end == NULL))
2926             return REG_ESPACE;
2927
2928           mbcset->range_starts = new_array_start;
2929           mbcset->range_ends = new_array_end;
2930           *range_alloc = new_nranges;
2931         }
2932
2933       mbcset->range_starts[mbcset->nranges] = start_collseq;
2934       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935     }
2936
2937   /* Build the table for single byte characters.  */
2938   for (ch = 0; ch < SBC_MAX; ch++)
2939     {
2940       uint32_t ch_collseq;
2941       /* if (MB_CUR_MAX == 1) */
2942       if (nrules == 0)
2943         ch_collseq = collseqmb[ch];
2944       else
2945         ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947         bitset_set (sbcset, ch);
2948     }
2949   return REG_NOERROR;
2950 }
2951
2952 /* Local function for parse_bracket_exp used in _LIBC environment.
2953    Build the collating element which is represented by NAME.
2954    The result are written to MBCSET and SBCSET.
2955    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956    pointer argument since we may update it.  */
2957
2958 static __always_inline reg_errcode_t
2959 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2960                         Idx *coll_sym_alloc, const unsigned char *name,
2961                         uint_fast32_t nrules, int_fast32_t table_size,
2962                         const int32_t *symb_table, const unsigned char *extra)
2963 {
2964   int32_t elem, idx;
2965   size_t name_len = strlen ((const char *) name);
2966   if (nrules != 0)
2967     {
2968       elem = seek_collating_symbol_entry (name, name_len, symb_table,
2969                                           table_size, extra);
2970       if (elem != -1)
2971         {
2972           /* We found the entry.  */
2973           idx = symb_table[2 * elem + 1];
2974           /* Skip the name of collating element name.  */
2975           idx += 1 + extra[idx];
2976         }
2977       else if (name_len == 1)
2978         {
2979           /* No valid character, treat it as a normal
2980              character.  */
2981           bitset_set (sbcset, name[0]);
2982           return REG_NOERROR;
2983         }
2984       else
2985         return REG_ECOLLATE;
2986
2987       /* Got valid collation sequence, add it as a new entry.  */
2988       /* Check the space of the arrays.  */
2989       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2990         {
2991           /* Not enough, realloc it.  */
2992           /* +1 in case of mbcset->ncoll_syms is 0.  */
2993           int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994           /* Use realloc since mbcset->coll_syms is NULL
2995              if *alloc == 0.  */
2996           int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997                                                new_coll_sym_alloc);
2998           if (__glibc_unlikely (new_coll_syms == NULL))
2999             return REG_ESPACE;
3000           mbcset->coll_syms = new_coll_syms;
3001           *coll_sym_alloc = new_coll_sym_alloc;
3002         }
3003       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004       return REG_NOERROR;
3005     }
3006   else
3007     {
3008       if (__glibc_unlikely (name_len != 1))
3009         return REG_ECOLLATE;
3010       else
3011         {
3012           bitset_set (sbcset, name[0]);
3013           return REG_NOERROR;
3014         }
3015     }
3016 }
3017 #endif /* _LIBC */
3018
3019 /* This function parse bracket expression like "[abc]", "[a-c]",
3020    "[[.a-a.]]" etc.  */
3021
3022 static bin_tree_t *
3023 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3024                    reg_syntax_t syntax, reg_errcode_t *err)
3025 {
3026   const unsigned char *collseqmb = NULL;
3027   const char *collseqwc = NULL;
3028   uint_fast32_t nrules = 0;
3029   int_fast32_t table_size = 0;
3030   const void *symb_table = NULL;
3031   const unsigned char *extra = NULL;
3032
3033   re_token_t br_token;
3034   re_bitset_ptr_t sbcset;
3035   re_charset_t *mbcset;
3036   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3037   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3038   bool non_match = false;
3039   bin_tree_t *work_tree;
3040   int token_len;
3041   bool first_round = true;
3042 #ifdef _LIBC
3043   collseqmb = (const unsigned char *)
3044     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046   if (nrules)
3047     {
3048       /*
3049       if (MB_CUR_MAX > 1)
3050       */
3051       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053       symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3054       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055                                                    _NL_COLLATE_SYMB_EXTRAMB);
3056     }
3057 #endif
3058   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3061     {
3062       re_free (sbcset);
3063       re_free (mbcset);
3064       *err = REG_ESPACE;
3065       return NULL;
3066     }
3067
3068   token_len = peek_token_bracket (token, regexp, syntax);
3069   if (__glibc_unlikely (token->type == END_OF_RE))
3070     {
3071       *err = REG_BADPAT;
3072       goto parse_bracket_exp_free_return;
3073     }
3074   if (token->type == OP_NON_MATCH_LIST)
3075     {
3076       mbcset->non_match = 1;
3077       non_match = true;
3078       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079         bitset_set (sbcset, '\n');
3080       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3081       token_len = peek_token_bracket (token, regexp, syntax);
3082       if (__glibc_unlikely (token->type == END_OF_RE))
3083         {
3084           *err = REG_BADPAT;
3085           goto parse_bracket_exp_free_return;
3086         }
3087     }
3088
3089   /* We treat the first ']' as a normal character.  */
3090   if (token->type == OP_CLOSE_BRACKET)
3091     token->type = CHARACTER;
3092
3093   while (1)
3094     {
3095       bracket_elem_t start_elem, end_elem;
3096       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3098       reg_errcode_t ret;
3099       int token_len2 = 0;
3100       bool is_range_exp = false;
3101       re_token_t token2;
3102
3103       start_elem.opr.name = start_name_buf;
3104       start_elem.type = COLL_SYM;
3105       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106                                    syntax, first_round);
3107       if (__glibc_unlikely (ret != REG_NOERROR))
3108         {
3109           *err = ret;
3110           goto parse_bracket_exp_free_return;
3111         }
3112       first_round = false;
3113
3114       /* Get information about the next token.  We need it in any case.  */
3115       token_len = peek_token_bracket (token, regexp, syntax);
3116
3117       /* Do not check for ranges if we know they are not allowed.  */
3118       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3119         {
3120           if (__glibc_unlikely (token->type == END_OF_RE))
3121             {
3122               *err = REG_EBRACK;
3123               goto parse_bracket_exp_free_return;
3124             }
3125           if (token->type == OP_CHARSET_RANGE)
3126             {
3127               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3128               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129               if (__glibc_unlikely (token2.type == END_OF_RE))
3130                 {
3131                   *err = REG_EBRACK;
3132                   goto parse_bracket_exp_free_return;
3133                 }
3134               if (token2.type == OP_CLOSE_BRACKET)
3135                 {
3136                   /* We treat the last '-' as a normal character.  */
3137                   re_string_skip_bytes (regexp, -token_len);
3138                   token->type = CHARACTER;
3139                 }
3140               else
3141                 is_range_exp = true;
3142             }
3143         }
3144
3145       if (is_range_exp == true)
3146         {
3147           end_elem.opr.name = end_name_buf;
3148           end_elem.type = COLL_SYM;
3149           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150                                        dfa, syntax, true);
3151           if (__glibc_unlikely (ret != REG_NOERROR))
3152             {
3153               *err = ret;
3154               goto parse_bracket_exp_free_return;
3155             }
3156
3157           token_len = peek_token_bracket (token, regexp, syntax);
3158
3159           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3160                                   &start_elem, &end_elem,
3161                                   dfa, syntax, nrules, collseqmb, collseqwc,
3162                                   table_size, symb_table, extra);
3163           if (__glibc_unlikely (*err != REG_NOERROR))
3164             goto parse_bracket_exp_free_return;
3165         }
3166       else
3167         {
3168           switch (start_elem.type)
3169             {
3170             case SB_CHAR:
3171               bitset_set (sbcset, start_elem.opr.ch);
3172               break;
3173             case MB_CHAR:
3174               /* Check whether the array has enough space.  */
3175               if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3176                 {
3177                   wchar_t *new_mbchars;
3178                   /* Not enough, realloc it.  */
3179                   /* +1 in case of mbcset->nmbchars is 0.  */
3180                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3181                   /* Use realloc since array is NULL if *alloc == 0.  */
3182                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3183                                             mbchar_alloc);
3184                   if (__glibc_unlikely (new_mbchars == NULL))
3185                     goto parse_bracket_exp_espace;
3186                   mbcset->mbchars = new_mbchars;
3187                 }
3188               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3189               break;
3190             case EQUIV_CLASS:
3191               *err = build_equiv_class (sbcset,
3192                                         mbcset, &equiv_class_alloc,
3193                                         start_elem.opr.name);
3194               if (__glibc_unlikely (*err != REG_NOERROR))
3195                 goto parse_bracket_exp_free_return;
3196               break;
3197             case COLL_SYM:
3198               *err = build_collating_symbol (sbcset,
3199                                              mbcset, &coll_sym_alloc,
3200                                              start_elem.opr.name,
3201                                              nrules, table_size, symb_table, extra);
3202               if (__glibc_unlikely (*err != REG_NOERROR))
3203                 goto parse_bracket_exp_free_return;
3204               break;
3205             case CHAR_CLASS:
3206               *err = build_charclass (regexp->trans, sbcset,
3207                                       mbcset, &char_class_alloc,
3208                                       (const char *) start_elem.opr.name,
3209                                       syntax);
3210               if (__glibc_unlikely (*err != REG_NOERROR))
3211                goto parse_bracket_exp_free_return;
3212               break;
3213             default:
3214               DEBUG_ASSERT (false);
3215               break;
3216             }
3217         }
3218       if (__glibc_unlikely (token->type == END_OF_RE))
3219         {
3220           *err = REG_EBRACK;
3221           goto parse_bracket_exp_free_return;
3222         }
3223       if (token->type == OP_CLOSE_BRACKET)
3224         break;
3225     }
3226
3227   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3228
3229   /* If it is non-matching list.  */
3230   if (non_match)
3231     bitset_not (sbcset);
3232
3233   /* Ensure only single byte characters are set.  */
3234   if (dfa->mb_cur_max > 1)
3235     bitset_mask (sbcset, dfa->sb_char);
3236
3237   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3238       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3239                                                      || mbcset->non_match)))
3240     {
3241       bin_tree_t *mbc_tree;
3242       int sbc_idx;
3243       /* Build a tree for complex bracket.  */
3244       dfa->has_mb_node = 1;
3245       br_token.type = COMPLEX_BRACKET;
3246       br_token.opr.mbcset = mbcset;
3247       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3248       if (__glibc_unlikely (mbc_tree == NULL))
3249         goto parse_bracket_exp_espace;
3250       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3251         if (sbcset[sbc_idx])
3252           break;
3253       /* If there are no bits set in sbcset, there is no point
3254          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3255       if (sbc_idx < BITSET_WORDS)
3256         {
3257           /* Build a tree for simple bracket.  */
3258           br_token.type = SIMPLE_BRACKET;
3259           br_token.opr.sbcset = sbcset;
3260           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3261           if (__glibc_unlikely (work_tree == NULL))
3262             goto parse_bracket_exp_espace;
3263
3264           /* Then join them by ALT node.  */
3265           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3266           if (__glibc_unlikely (work_tree == NULL))
3267             goto parse_bracket_exp_espace;
3268         }
3269       else
3270         {
3271           re_free (sbcset);
3272           work_tree = mbc_tree;
3273         }
3274     }
3275   else
3276     {
3277       free_charset (mbcset);
3278       /* Build a tree for simple bracket.  */
3279       br_token.type = SIMPLE_BRACKET;
3280       br_token.opr.sbcset = sbcset;
3281       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282       if (__glibc_unlikely (work_tree == NULL))
3283         goto parse_bracket_exp_espace;
3284     }
3285   return work_tree;
3286
3287  parse_bracket_exp_espace:
3288   *err = REG_ESPACE;
3289  parse_bracket_exp_free_return:
3290   re_free (sbcset);
3291   free_charset (mbcset);
3292   return NULL;
3293 }
3294
3295 /* Parse an element in the bracket expression.  */
3296
3297 static reg_errcode_t
3298 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3299                        re_token_t *token, int token_len, re_dfa_t *dfa,
3300                        reg_syntax_t syntax, bool accept_hyphen)
3301 {
3302   int cur_char_size;
3303   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3304   if (cur_char_size > 1)
3305     {
3306       elem->type = MB_CHAR;
3307       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3308       re_string_skip_bytes (regexp, cur_char_size);
3309       return REG_NOERROR;
3310     }
3311   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3312   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3313       || token->type == OP_OPEN_EQUIV_CLASS)
3314     return parse_bracket_symbol (elem, regexp, token);
3315   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3316     {
3317       /* A '-' must only appear as anything but a range indicator before
3318          the closing bracket.  Everything else is an error.  */
3319       re_token_t token2;
3320       (void) peek_token_bracket (&token2, regexp, syntax);
3321       if (token2.type != OP_CLOSE_BRACKET)
3322         /* The actual error value is not standardized since this whole
3323            case is undefined.  But ERANGE makes good sense.  */
3324         return REG_ERANGE;
3325     }
3326   elem->type = SB_CHAR;
3327   elem->opr.ch = token->opr.c;
3328   return REG_NOERROR;
3329 }
3330
3331 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3332    such as [:<character_class>:], [.<collating_element>.], and
3333    [=<equivalent_class>=].  */
3334
3335 static reg_errcode_t
3336 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3337                       re_token_t *token)
3338 {
3339   unsigned char ch, delim = token->opr.c;
3340   int i = 0;
3341   if (re_string_eoi(regexp))
3342     return REG_EBRACK;
3343   for (;; ++i)
3344     {
3345       if (i >= BRACKET_NAME_BUF_SIZE)
3346         return REG_EBRACK;
3347       if (token->type == OP_OPEN_CHAR_CLASS)
3348         ch = re_string_fetch_byte_case (regexp);
3349       else
3350         ch = re_string_fetch_byte (regexp);
3351       if (re_string_eoi(regexp))
3352         return REG_EBRACK;
3353       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3354         break;
3355       elem->opr.name[i] = ch;
3356     }
3357   re_string_skip_bytes (regexp, 1);
3358   elem->opr.name[i] = '\0';
3359   switch (token->type)
3360     {
3361     case OP_OPEN_COLL_ELEM:
3362       elem->type = COLL_SYM;
3363       break;
3364     case OP_OPEN_EQUIV_CLASS:
3365       elem->type = EQUIV_CLASS;
3366       break;
3367     case OP_OPEN_CHAR_CLASS:
3368       elem->type = CHAR_CLASS;
3369       break;
3370     default:
3371       break;
3372     }
3373   return REG_NOERROR;
3374 }
3375
3376   /* Helper function for parse_bracket_exp.
3377      Build the equivalence class which is represented by NAME.
3378      The result are written to MBCSET and SBCSET.
3379      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3380      is a pointer argument since we may update it.  */
3381
3382 static reg_errcode_t
3383 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384                    Idx *equiv_class_alloc, const unsigned char *name)
3385 {
3386 #ifdef _LIBC
3387   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388   if (nrules != 0)
3389     {
3390       const int32_t *table, *indirect;
3391       const unsigned char *weights, *extra, *cp;
3392       unsigned char char_buf[2];
3393       int32_t idx1, idx2;
3394       unsigned int ch;
3395       size_t len;
3396       /* Calculate the index for equivalence class.  */
3397       cp = name;
3398       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3399       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3400                                                _NL_COLLATE_WEIGHTMB);
3401       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402                                                    _NL_COLLATE_EXTRAMB);
3403       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3404                                                 _NL_COLLATE_INDIRECTMB);
3405       idx1 = findidx (table, indirect, extra, &cp, -1);
3406       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3407         /* This isn't a valid character.  */
3408         return REG_ECOLLATE;
3409
3410       /* Build single byte matching table for this equivalence class.  */
3411       len = weights[idx1 & 0xffffff];
3412       for (ch = 0; ch < SBC_MAX; ++ch)
3413         {
3414           char_buf[0] = ch;
3415           cp = char_buf;
3416           idx2 = findidx (table, indirect, extra, &cp, 1);
3417 /*
3418           idx2 = table[ch];
3419 */
3420           if (idx2 == 0)
3421             /* This isn't a valid character.  */
3422             continue;
3423           /* Compare only if the length matches and the collation rule
3424              index is the same.  */
3425           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3426               && memcmp (weights + (idx1 & 0xffffff) + 1,
3427                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3428             bitset_set (sbcset, ch);
3429         }
3430       /* Check whether the array has enough space.  */
3431       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3432         {
3433           /* Not enough, realloc it.  */
3434           /* +1 in case of mbcset->nequiv_classes is 0.  */
3435           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3436           /* Use realloc since the array is NULL if *alloc == 0.  */
3437           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3438                                                    int32_t,
3439                                                    new_equiv_class_alloc);
3440           if (__glibc_unlikely (new_equiv_classes == NULL))
3441             return REG_ESPACE;
3442           mbcset->equiv_classes = new_equiv_classes;
3443           *equiv_class_alloc = new_equiv_class_alloc;
3444         }
3445       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3446     }
3447   else
3448 #endif /* _LIBC */
3449     {
3450       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3451         return REG_ECOLLATE;
3452       bitset_set (sbcset, *name);
3453     }
3454   return REG_NOERROR;
3455 }
3456
3457   /* Helper function for parse_bracket_exp.
3458      Build the character class which is represented by NAME.
3459      The result are written to MBCSET and SBCSET.
3460      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3461      is a pointer argument since we may update it.  */
3462
3463 static reg_errcode_t
3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465                  re_charset_t *mbcset, Idx *char_class_alloc,
3466                  const char *class_name, reg_syntax_t syntax)
3467 {
3468   int i;
3469   const char *name = class_name;
3470
3471   /* In case of REG_ICASE "upper" and "lower" match the both of
3472      upper and lower cases.  */
3473   if ((syntax & RE_ICASE)
3474       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475     name = "alpha";
3476
3477   /* Check the space of the arrays.  */
3478   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3479     {
3480       /* Not enough, realloc it.  */
3481       /* +1 in case of mbcset->nchar_classes is 0.  */
3482       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3483       /* Use realloc since array is NULL if *alloc == 0.  */
3484       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3485                                                new_char_class_alloc);
3486       if (__glibc_unlikely (new_char_classes == NULL))
3487         return REG_ESPACE;
3488       mbcset->char_classes = new_char_classes;
3489       *char_class_alloc = new_char_class_alloc;
3490     }
3491   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3492
3493 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3494   do {                                          \
3495     if (__glibc_unlikely (trans != NULL))                       \
3496       {                                         \
3497         for (i = 0; i < SBC_MAX; ++i)           \
3498           if (ctype_func (i))                   \
3499             bitset_set (sbcset, trans[i]);      \
3500       }                                         \
3501     else                                        \
3502       {                                         \
3503         for (i = 0; i < SBC_MAX; ++i)           \
3504           if (ctype_func (i))                   \
3505             bitset_set (sbcset, i);             \
3506       }                                         \
3507   } while (0)
3508
3509   if (strcmp (name, "alnum") == 0)
3510     BUILD_CHARCLASS_LOOP (isalnum);
3511   else if (strcmp (name, "cntrl") == 0)
3512     BUILD_CHARCLASS_LOOP (iscntrl);
3513   else if (strcmp (name, "lower") == 0)
3514     BUILD_CHARCLASS_LOOP (islower);
3515   else if (strcmp (name, "space") == 0)
3516     BUILD_CHARCLASS_LOOP (isspace);
3517   else if (strcmp (name, "alpha") == 0)
3518     BUILD_CHARCLASS_LOOP (isalpha);
3519   else if (strcmp (name, "digit") == 0)
3520     BUILD_CHARCLASS_LOOP (isdigit);
3521   else if (strcmp (name, "print") == 0)
3522     BUILD_CHARCLASS_LOOP (isprint);
3523   else if (strcmp (name, "upper") == 0)
3524     BUILD_CHARCLASS_LOOP (isupper);
3525   else if (strcmp (name, "blank") == 0)
3526     BUILD_CHARCLASS_LOOP (isblank);
3527   else if (strcmp (name, "graph") == 0)
3528     BUILD_CHARCLASS_LOOP (isgraph);
3529   else if (strcmp (name, "punct") == 0)
3530     BUILD_CHARCLASS_LOOP (ispunct);
3531   else if (strcmp (name, "xdigit") == 0)
3532     BUILD_CHARCLASS_LOOP (isxdigit);
3533   else
3534     return REG_ECTYPE;
3535
3536   return REG_NOERROR;
3537 }
3538
3539 static bin_tree_t *
3540 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3541                     const char *class_name,
3542                     const char *extra, bool non_match,
3543                     reg_errcode_t *err)
3544 {
3545   re_bitset_ptr_t sbcset;
3546   re_charset_t *mbcset;
3547   Idx alloc = 0;
3548   reg_errcode_t ret;
3549   bin_tree_t *tree;
3550
3551   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3552   if (__glibc_unlikely (sbcset == NULL))
3553     {
3554       *err = REG_ESPACE;
3555       return NULL;
3556     }
3557   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3558   if (__glibc_unlikely (mbcset == NULL))
3559     {
3560       re_free (sbcset);
3561       *err = REG_ESPACE;
3562       return NULL;
3563     }
3564   mbcset->non_match = non_match;
3565
3566   /* We don't care the syntax in this case.  */
3567   ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3568
3569   if (__glibc_unlikely (ret != REG_NOERROR))
3570     {
3571       re_free (sbcset);
3572       free_charset (mbcset);
3573       *err = ret;
3574       return NULL;
3575     }
3576   /* \w match '_' also.  */
3577   for (; *extra; extra++)
3578     bitset_set (sbcset, *extra);
3579
3580   /* If it is non-matching list.  */
3581   if (non_match)
3582     bitset_not (sbcset);
3583
3584   /* Ensure only single byte characters are set.  */
3585   if (dfa->mb_cur_max > 1)
3586     bitset_mask (sbcset, dfa->sb_char);
3587
3588   /* Build a tree for simple bracket.  */
3589   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3590   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591   if (__glibc_unlikely (tree == NULL))
3592     goto build_word_op_espace;
3593
3594   if (dfa->mb_cur_max > 1)
3595     {
3596       bin_tree_t *mbc_tree;
3597       /* Build a tree for complex bracket.  */
3598       br_token.type = COMPLEX_BRACKET;
3599       br_token.opr.mbcset = mbcset;
3600       dfa->has_mb_node = 1;
3601       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602       if (__glibc_unlikely (mbc_tree == NULL))
3603         goto build_word_op_espace;
3604       /* Then join them by ALT node.  */
3605       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606       if (__glibc_likely (mbc_tree != NULL))
3607         return tree;
3608     }
3609   else
3610     {
3611       free_charset (mbcset);
3612       return tree;
3613     }
3614
3615  build_word_op_espace:
3616   re_free (sbcset);
3617   free_charset (mbcset);
3618   *err = REG_ESPACE;
3619   return NULL;
3620 }
3621
3622 /* This is intended for the expressions like "a{1,3}".
3623    Fetch a number from 'input', and return the number.
3624    Return -1 if the number field is empty like "{,1}".
3625    Return RE_DUP_MAX + 1 if the number field is too large.
3626    Return -2 if an error occurred.  */
3627
3628 static Idx
3629 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3630 {
3631   Idx num = -1;
3632   unsigned char c;
3633   while (1)
3634     {
3635       fetch_token (token, input, syntax);
3636       c = token->opr.c;
3637       if (__glibc_unlikely (token->type == END_OF_RE))
3638         return -2;
3639       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3640         break;
3641       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3642              ? -2
3643              : num == -1
3644              ? c - '0'
3645              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3646     }
3647   return num;
3648 }
3649 \f
3650 static void
3651 free_charset (re_charset_t *cset)
3652 {
3653   re_free (cset->mbchars);
3654 #ifdef _LIBC
3655   re_free (cset->coll_syms);
3656   re_free (cset->equiv_classes);
3657 #endif
3658   re_free (cset->range_starts);
3659   re_free (cset->range_ends);
3660   re_free (cset->char_classes);
3661   re_free (cset);
3662 }
3663 \f
3664 /* Functions for binary tree operation.  */
3665
3666 /* Create a tree node.  */
3667
3668 static bin_tree_t *
3669 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3670              re_token_type_t type)
3671 {
3672   re_token_t t = { .type = type };
3673   return create_token_tree (dfa, left, right, &t);
3674 }
3675
3676 static bin_tree_t *
3677 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3678                    const re_token_t *token)
3679 {
3680   bin_tree_t *tree;
3681   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3682     {
3683       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3684
3685       if (storage == NULL)
3686         return NULL;
3687       storage->next = dfa->str_tree_storage;
3688       dfa->str_tree_storage = storage;
3689       dfa->str_tree_storage_idx = 0;
3690     }
3691   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3692
3693   tree->parent = NULL;
3694   tree->left = left;
3695   tree->right = right;
3696   tree->token = *token;
3697   tree->token.duplicated = 0;
3698   tree->token.opt_subexp = 0;
3699   tree->first = NULL;
3700   tree->next = NULL;
3701   tree->node_idx = -1;
3702
3703   if (left != NULL)
3704     left->parent = tree;
3705   if (right != NULL)
3706     right->parent = tree;
3707   return tree;
3708 }
3709
3710 /* Mark the tree SRC as an optional subexpression.
3711    To be called from preorder or postorder.  */
3712
3713 static reg_errcode_t
3714 mark_opt_subexp (void *extra, bin_tree_t *node)
3715 {
3716   Idx idx = (uintptr_t) extra;
3717   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3718     node->token.opt_subexp = 1;
3719
3720   return REG_NOERROR;
3721 }
3722
3723 /* Free the allocated memory inside NODE. */
3724
3725 static void
3726 free_token (re_token_t *node)
3727 {
3728   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3729     free_charset (node->opr.mbcset);
3730   else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3731     re_free (node->opr.sbcset);
3732 }
3733
3734 /* Worker function for tree walking.  Free the allocated memory inside NODE
3735    and its children. */
3736
3737 static reg_errcode_t
3738 free_tree (void *extra, bin_tree_t *node)
3739 {
3740   free_token (&node->token);
3741   return REG_NOERROR;
3742 }
3743
3744
3745 /* Duplicate the node SRC, and return new node.  This is a preorder
3746    visit similar to the one implemented by the generic visitor, but
3747    we need more infrastructure to maintain two parallel trees --- so,
3748    it's easier to duplicate.  */
3749
3750 static bin_tree_t *
3751 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3752 {
3753   const bin_tree_t *node;
3754   bin_tree_t *dup_root;
3755   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3756
3757   for (node = root; ; )
3758     {
3759       /* Create a new tree and link it back to the current parent.  */
3760       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3761       if (*p_new == NULL)
3762         return NULL;
3763       (*p_new)->parent = dup_node;
3764       (*p_new)->token.duplicated = 1;
3765       dup_node = *p_new;
3766
3767       /* Go to the left node, or up and to the right.  */
3768       if (node->left)
3769         {
3770           node = node->left;
3771           p_new = &dup_node->left;
3772         }
3773       else
3774         {
3775           const bin_tree_t *prev = NULL;
3776           while (node->right == prev || node->right == NULL)
3777             {
3778               prev = node;
3779               node = node->parent;
3780               dup_node = dup_node->parent;
3781               if (!node)
3782                 return dup_root;
3783             }
3784           node = node->right;
3785           p_new = &dup_node->right;
3786         }
3787     }
3788 }