src/grep/lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax);
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
 132 static const char __re_error_msgid[] =
 133   {
 134 #define REG_NOERROR_IDX 0
 135     gettext_noop ("Success")    /* REG_NOERROR */
 136     "\0"
 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 138     gettext_noop ("No match")   /* REG_NOMATCH */
 139     "\0"
 140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 142     "\0"
 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 145     "\0"
 146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 148     "\0"
 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 151     "\0"
 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 154     "\0"
 155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 157     "\0"
 158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 160     "\0"
 161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 163     "\0"
 164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 166     "\0"
 167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 169     "\0"
 170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 172     "\0"
 173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 175     "\0"
 176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 178     "\0"
 179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 181     "\0"
 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 184   };
 185
 186 static const size_t __re_error_msgid_idx[] =
 187   {
 188     REG_NOERROR_IDX,
 189     REG_NOMATCH_IDX,
 190     REG_BADPAT_IDX,
 191     REG_ECOLLATE_IDX,
 192     REG_ECTYPE_IDX,
 193     REG_EESCAPE_IDX,
 194     REG_ESUBREG_IDX,
 195     REG_EBRACK_IDX,
 196     REG_EPAREN_IDX,
 197     REG_EBRACE_IDX,
 198     REG_BADBR_IDX,
 199     REG_ERANGE_IDX,
 200     REG_ESPACE_IDX,
 201     REG_BADRPT_IDX,
 202     REG_EEND_IDX,
 203     REG_ESIZE_IDX,
 204     REG_ERPAREN_IDX
 205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237
 238 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 239    also be assigned to arbitrarily: each pattern buffer stores its own
 240    syntax, so it can be changed between regex compilations.  */
 241 /* This has no initializer because initialized variables in Emacs
 242    become read-only after dumping.  */
 243 reg_syntax_t re_syntax_options;
 244
 245
 246 /* Specify the precise syntax of regexps for compilation.  This provides
 247    for compatibility for various utilities which historically have
 248    different, incompatible syntaxes.
 249
 250    The argument SYNTAX is a bit mask comprised of the various bits
 251    defined in regex.h.  We return the old syntax.  */
 252
 253 reg_syntax_t
 254 re_set_syntax (reg_syntax_t syntax)
 255 {
 256   reg_syntax_t ret = re_syntax_options;
 257
 258   re_syntax_options = syntax;
 259   return ret;
 260 }
 261 weak_alias (__re_set_syntax, re_set_syntax)
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = bufp->buffer;
 267   char *fastmap = bufp->fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->fastmap_accurate = 1;
 278   return 0;
 279 }
 280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 281
 282 static inline void
 283 __attribute__ ((always_inline))
 284 re_set_fastmap (char *fastmap, bool icase, int ch)
 285 {
 286   fastmap[ch] = 1;
 287   if (icase)
 288     fastmap[tolower (ch)] = 1;
 289 }
 290
 291 /* Helper function for re_compile_fastmap.
 292    Compile fastmap for the initial_state INIT_STATE.  */
 293
 294 static void
 295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 296                          char *fastmap)
 297 {
 298   re_dfa_t *dfa = bufp->buffer;
 299   Idx node_cnt;
 300   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 301   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 302     {
 303       Idx node = init_state->nodes.elems[node_cnt];
 304       re_token_type_t type = dfa->nodes[node].type;
 305
 306       if (type == CHARACTER)
 307         {
 308           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 309 #ifdef RE_ENABLE_I18N
 310           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 311             {
 312               unsigned char buf[MB_LEN_MAX];
 313               unsigned char *p;
 314               wchar_t wc;
 315               mbstate_t state;
 316
 317               p = buf;
 318               *p++ = dfa->nodes[node].opr.c;
 319               while (++node < dfa->nodes_len
 320                      && dfa->nodes[node].type == CHARACTER
 321                      && dfa->nodes[node].mb_partial)
 322                 *p++ = dfa->nodes[node].opr.c;
 323               memset (&state, '\0', sizeof (state));
 324               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 325                              &state) == p - buf
 326                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 327                       != (size_t) -1))
 328                 re_set_fastmap (fastmap, false, buf[0]);
 329             }
 330 #endif
 331         }
 332       else if (type == SIMPLE_BRACKET)
 333         {
 334           int i, ch;
 335           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 336             {
 337               int j;
 338               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 339               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 340                 if (w & ((bitset_word_t) 1 << j))
 341                   re_set_fastmap (fastmap, icase, ch);
 342             }
 343         }
 344 #ifdef RE_ENABLE_I18N
 345       else if (type == COMPLEX_BRACKET)
 346         {
 347           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 348           Idx i;
 349
 350 # ifdef _LIBC
 351           /* See if we have to try all bytes which start multiple collation
 352              elements.
 353              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 354                   collation element, and don't catch 'b' since 'b' is
 355                   the only collation element which starts from 'b' (and
 356                   it is caught by SIMPLE_BRACKET).  */
 357               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 358                   && (cset->ncoll_syms || cset->nranges))
 359                 {
 360                   const int32_t *table = (const int32_t *)
 361                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 362                   for (i = 0; i < SBC_MAX; ++i)
 363                     if (table[i] < 0)
 364                       re_set_fastmap (fastmap, icase, i);
 365                 }
 366 # endif /* _LIBC */
 367
 368           /* See if we have to start the match at all multibyte characters,
 369              i.e. where we would not find an invalid sequence.  This only
 370              applies to multibyte character sets; for single byte character
 371              sets, the SIMPLE_BRACKET again suffices.  */
 372           if (dfa->mb_cur_max > 1
 373               && (cset->nchar_classes || cset->non_match || cset->nranges
 374 # ifdef _LIBC
 375                   || cset->nequiv_classes
 376 # endif /* _LIBC */
 377                  ))
 378             {
 379               unsigned char c = 0;
 380               do
 381                 {
 382                   mbstate_t mbs;
 383                   memset (&mbs, 0, sizeof (mbs));
 384                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 385                     re_set_fastmap (fastmap, false, (int) c);
 386                 }
 387               while (++c != 0);
 388             }
 389
 390           else
 391             {
 392               /* ... Else catch all bytes which can start the mbchars.  */
 393               for (i = 0; i < cset->nmbchars; ++i)
 394                 {
 395                   char buf[256];
 396                   mbstate_t state;
 397                   memset (&state, '\0', sizeof (state));
 398                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 399                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 400                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 401                     {
 402                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 403                           != (size_t) -1)
 404                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 405                     }
 406                 }
 407             }
 408         }
 409 #endif /* RE_ENABLE_I18N */
 410       else if (type == OP_PERIOD
 411 #ifdef RE_ENABLE_I18N
 412                || type == OP_UTF8_PERIOD
 413 #endif /* RE_ENABLE_I18N */
 414                || type == END_OF_RE)
 415         {
 416           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 417           if (type == END_OF_RE)
 418             bufp->can_be_null = 1;
 419           return;
 420         }
 421     }
 422 }
 423 \f
 424 /* Entry point for POSIX code.  */
 425 /* regcomp takes a regular expression as a string and compiles it.
 426
 427    PREG is a regex_t *.  We do not expect any fields to be initialized,
 428    since POSIX says we shouldn't.  Thus, we set
 429
 430      'buffer' to the compiled pattern;
 431      'used' to the length of the compiled pattern;
 432      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 434        RE_SYNTAX_POSIX_BASIC;
 435      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 436      'fastmap' to an allocated space for the fastmap;
 437      'fastmap_accurate' to zero;
 438      're_nsub' to the number of subexpressions in PATTERN.
 439
 440    PATTERN is the address of the pattern string.
 441
 442    CFLAGS is a series of bits which affect compilation.
 443
 444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 445      use POSIX basic syntax.
 446
 447      If REG_NEWLINE is set, then . and [^...] don't match newline.
 448      Also, regexec will try a match beginning after every newline.
 449
 450      If REG_ICASE is set, then we considers upper- and lowercase
 451      versions of letters to be equivalent when matching.
 452
 453      If REG_NOSUB is set, then when PREG is passed to regexec, that
 454      routine will report only success or failure, and nothing about the
 455      registers.
 456
 457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 458    the return codes and their meanings.)  */
 459
 460 int
 461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 462 {
 463   reg_errcode_t ret;
 464   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 465                          : RE_SYNTAX_POSIX_BASIC);
 466
 467   preg->buffer = NULL;
 468   preg->allocated = 0;
 469   preg->used = 0;
 470
 471   /* Try to allocate space for the fastmap.  */
 472   preg->fastmap = re_malloc (char, SBC_MAX);
 473   if (__glibc_unlikely (preg->fastmap == NULL))
 474     return REG_ESPACE;
 475
 476   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 477
 478   /* If REG_NEWLINE is set, newlines are treated differently.  */
 479   if (cflags & REG_NEWLINE)
 480     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 481       syntax &= ~RE_DOT_NEWLINE;
 482       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 483       /* It also changes the matching behavior.  */
 484       preg->newline_anchor = 1;
 485     }
 486   else
 487     preg->newline_anchor = 0;
 488   preg->no_sub = !!(cflags & REG_NOSUB);
 489   preg->translate = NULL;
 490
 491   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 492
 493   /* POSIX doesn't distinguish between an unmatched open-group and an
 494      unmatched close-group: both are REG_EPAREN.  */
 495   if (ret == REG_ERPAREN)
 496     ret = REG_EPAREN;
 497
 498   /* We have already checked preg->fastmap != NULL.  */
 499   if (__glibc_likely (ret == REG_NOERROR))
 500     /* Compute the fastmap now, since regexec cannot modify the pattern
 501        buffer.  This function never fails in this implementation.  */
 502     (void) re_compile_fastmap (preg);
 503   else
 504     {
 505       /* Some error occurred while compiling the expression.  */
 506       re_free (preg->fastmap);
 507       preg->fastmap = NULL;
 508     }
 509
 510   return (int) ret;
 511 }
 512 libc_hidden_def (__regcomp)
 513 weak_alias (__regcomp, regcomp)
 514
 515 /* Returns a message corresponding to an error code, ERRCODE, returned
 516    from either regcomp or regexec.   We don't use PREG here.  */
 517
 518 size_t
 519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 520           size_t errbuf_size)
 521 {
 522   const char *msg;
 523   size_t msg_size;
 524   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
 525
 526   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
 527     /* Only error codes returned by the rest of the code should be passed
 528        to this routine.  If we are given anything else, or if other regex
 529        code generates an invalid error code, then the program has a bug.
 530        Dump core so we can fix it.  */
 531     abort ();
 532
 533   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 534
 535   msg_size = strlen (msg) + 1; /* Includes the null.  */
 536
 537   if (__glibc_likely (errbuf_size != 0))
 538     {
 539       size_t cpy_size = msg_size;
 540       if (__glibc_unlikely (msg_size > errbuf_size))
 541         {
 542           cpy_size = errbuf_size - 1;
 543           errbuf[cpy_size] = '\0';
 544         }
 545       memcpy (errbuf, msg, cpy_size);
 546     }
 547
 548   return msg_size;
 549 }
 550 weak_alias (__regerror, regerror)
 551
 552
 553 #ifdef RE_ENABLE_I18N
 554 /* This static array is used for the map to single-byte characters when
 555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 556    it the same all the time.  UTF-8 is the preferred encoding so this is
 557    a worthwhile optimization.  */
 558 static const bitset_t utf8_sb_map =
 559 {
 560   /* Set the first 128 bits.  */
 561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
 562   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 563 # else
 564 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 565 #   error "bitset_word_t is narrower than 32 bits"
 566 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 567   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 568 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 569   BITSET_WORD_MAX, BITSET_WORD_MAX,
 570 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 571   BITSET_WORD_MAX,
 572 #  endif
 573   (BITSET_WORD_MAX
 574    >> (SBC_MAX % BITSET_WORD_BITS == 0
 575        ? 0
 576        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 577 # endif
 578 };
 579 #endif
 580
 581
 582 static void
 583 free_dfa_content (re_dfa_t *dfa)
 584 {
 585   Idx i, j;
 586
 587   if (dfa->nodes)
 588     for (i = 0; i < dfa->nodes_len; ++i)
 589       free_token (dfa->nodes + i);
 590   re_free (dfa->nexts);
 591   for (i = 0; i < dfa->nodes_len; ++i)
 592     {
 593       if (dfa->eclosures != NULL)
 594         re_node_set_free (dfa->eclosures + i);
 595       if (dfa->inveclosures != NULL)
 596         re_node_set_free (dfa->inveclosures + i);
 597       if (dfa->edests != NULL)
 598         re_node_set_free (dfa->edests + i);
 599     }
 600   re_free (dfa->edests);
 601   re_free (dfa->eclosures);
 602   re_free (dfa->inveclosures);
 603   re_free (dfa->nodes);
 604
 605   if (dfa->state_table)
 606     for (i = 0; i <= dfa->state_hash_mask; ++i)
 607       {
 608         struct re_state_table_entry *entry = dfa->state_table + i;
 609         for (j = 0; j < entry->num; ++j)
 610           {
 611             re_dfastate_t *state = entry->array[j];
 612             free_state (state);
 613           }
 614         re_free (entry->array);
 615       }
 616   re_free (dfa->state_table);
 617 #ifdef RE_ENABLE_I18N
 618   if (dfa->sb_char != utf8_sb_map)
 619     re_free (dfa->sb_char);
 620 #endif
 621   re_free (dfa->subexp_map);
 622 #ifdef DEBUG
 623   re_free (dfa->re_str);
 624 #endif
 625
 626   re_free (dfa);
 627 }
 628
 629
 630 /* Free dynamically allocated space used by PREG.  */
 631
 632 void
 633 regfree (regex_t *preg)
 634 {
 635   re_dfa_t *dfa = preg->buffer;
 636   if (__glibc_likely (dfa != NULL))
 637     {
 638       lock_fini (dfa->lock);
 639       free_dfa_content (dfa);
 640     }
 641   preg->buffer = NULL;
 642   preg->allocated = 0;
 643
 644   re_free (preg->fastmap);
 645   preg->fastmap = NULL;
 646
 647   re_free (preg->translate);
 648   preg->translate = NULL;
 649 }
 650 libc_hidden_def (__regfree)
 651 weak_alias (__regfree, regfree)
 652 \f
 653 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 654    them unless specifically requested.  */
 655
 656 #if defined _REGEX_RE_COMP || defined _LIBC
 657
 658 /* BSD has one and only one pattern buffer.  */
 659 static struct re_pattern_buffer re_comp_buf;
 660
 661 char *
 662 # ifdef _LIBC
 663 /* Make these definitions weak in libc, so POSIX programs can redefine
 664    these names if they don't use our functions, and still use
 665    regcomp/regexec above without link errors.  */
 666 weak_function
 667 # endif
 668 re_comp (const char *s)
 669 {
 670   reg_errcode_t ret;
 671   char *fastmap;
 672
 673   if (!s)
 674     {
 675       if (!re_comp_buf.buffer)
 676         return gettext ("No previous regular expression");
 677       return 0;
 678     }
 679
 680   if (re_comp_buf.buffer)
 681     {
 682       fastmap = re_comp_buf.fastmap;
 683       re_comp_buf.fastmap = NULL;
 684       __regfree (&re_comp_buf);
 685       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 686       re_comp_buf.fastmap = fastmap;
 687     }
 688
 689   if (re_comp_buf.fastmap == NULL)
 690     {
 691       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
 692       if (re_comp_buf.fastmap == NULL)
 693         return (char *) gettext (__re_error_msgid
 694                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 695     }
 696
 697   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 698      don't need to initialize the pattern buffer fields which affect it.  */
 699
 700   /* Match anchors at newlines.  */
 701   re_comp_buf.newline_anchor = 1;
 702
 703   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 704
 705   if (!ret)
 706     return NULL;
 707
 708   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 709   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 710 }
 711
 712 #ifdef _LIBC
 713 libc_freeres_fn (free_mem)
 714 {
 715   __regfree (&re_comp_buf);
 716 }
 717 #endif
 718
 719 #endif /* _REGEX_RE_COMP */
 720 \f
 721 /* Internal entry point.
 722    Compile the regular expression PATTERN, whose length is LENGTH.
 723    SYNTAX indicate regular expression's syntax.  */
 724
 725 static reg_errcode_t
 726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 727                      reg_syntax_t syntax)
 728 {
 729   reg_errcode_t err = REG_NOERROR;
 730   re_dfa_t *dfa;
 731   re_string_t regexp;
 732
 733   /* Initialize the pattern buffer.  */
 734   preg->fastmap_accurate = 0;
 735   preg->syntax = syntax;
 736   preg->not_bol = preg->not_eol = 0;
 737   preg->used = 0;
 738   preg->re_nsub = 0;
 739   preg->can_be_null = 0;
 740   preg->regs_allocated = REGS_UNALLOCATED;
 741
 742   /* Initialize the dfa.  */
 743   dfa = preg->buffer;
 744   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
 745     {
 746       /* If zero allocated, but buffer is non-null, try to realloc
 747          enough space.  This loses if buffer's address is bogus, but
 748          that is the user's responsibility.  If ->buffer is NULL this
 749          is a simple allocation.  */
 750       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 751       if (dfa == NULL)
 752         return REG_ESPACE;
 753       preg->allocated = sizeof (re_dfa_t);
 754       preg->buffer = dfa;
 755     }
 756   preg->used = sizeof (re_dfa_t);
 757
 758   err = init_dfa (dfa, length);
 759   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
 760     err = REG_ESPACE;
 761   if (__glibc_unlikely (err != REG_NOERROR))
 762     {
 763       free_dfa_content (dfa);
 764       preg->buffer = NULL;
 765       preg->allocated = 0;
 766       return err;
 767     }
 768 #ifdef DEBUG
 769   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 770   dfa->re_str = re_malloc (char, length + 1);
 771   strncpy (dfa->re_str, pattern, length + 1);
 772 #endif
 773
 774   err = re_string_construct (&regexp, pattern, length, preg->translate,
 775                              (syntax & RE_ICASE) != 0, dfa);
 776   if (__glibc_unlikely (err != REG_NOERROR))
 777     {
 778     re_compile_internal_free_return:
 779       free_workarea_compile (preg);
 780       re_string_destruct (&regexp);
 781       lock_fini (dfa->lock);
 782       free_dfa_content (dfa);
 783       preg->buffer = NULL;
 784       preg->allocated = 0;
 785       return err;
 786     }
 787
 788   /* Parse the regular expression, and build a structure tree.  */
 789   preg->re_nsub = 0;
 790   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 791   if (__glibc_unlikely (dfa->str_tree == NULL))
 792     goto re_compile_internal_free_return;
 793
 794   /* Analyze the tree and create the nfa.  */
 795   err = analyze (preg);
 796   if (__glibc_unlikely (err != REG_NOERROR))
 797     goto re_compile_internal_free_return;
 798
 799 #ifdef RE_ENABLE_I18N
 800   /* If possible, do searching in single byte encoding to speed things up.  */
 801   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 802     optimize_utf8 (dfa);
 803 #endif
 804
 805   /* Then create the initial state of the dfa.  */
 806   err = create_initial_state (dfa);
 807
 808   /* Release work areas.  */
 809   free_workarea_compile (preg);
 810   re_string_destruct (&regexp);
 811
 812   if (__glibc_unlikely (err != REG_NOERROR))
 813     {
 814       lock_fini (dfa->lock);
 815       free_dfa_content (dfa);
 816       preg->buffer = NULL;
 817       preg->allocated = 0;
 818     }
 819
 820   return err;
 821 }
 822
 823 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 824    as the initial length of some arrays.  */
 825
 826 static reg_errcode_t
 827 init_dfa (re_dfa_t *dfa, size_t pat_len)
 828 {
 829   __re_size_t table_size;
 830 #ifndef _LIBC
 831   const char *codeset_name;
 832 #endif
 833 #ifdef RE_ENABLE_I18N
 834   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 835 #else
 836   size_t max_i18n_object_size = 0;
 837 #endif
 838   size_t max_object_size =
 839     MAX (sizeof (struct re_state_table_entry),
 840          MAX (sizeof (re_token_t),
 841               MAX (sizeof (re_node_set),
 842                    MAX (sizeof (regmatch_t),
 843                         max_i18n_object_size))));
 844
 845   memset (dfa, '\0', sizeof (re_dfa_t));
 846
 847   /* Force allocation of str_tree_storage the first time.  */
 848   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 849
 850   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 851      calculation below, and for similar doubling calculations
 852      elsewhere.  And it's <= rather than <, because some of the
 853      doubling calculations add 1 afterwards.  */
 854   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
 855                         <= pat_len))
 856     return REG_ESPACE;
 857
 858   dfa->nodes_alloc = pat_len + 1;
 859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; ; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877 # ifdef _MSC_VER
 878   (void)codeset_name;
 879   if (get_crt_codepage() == CP_UTF8)
 880 # else
 881   codeset_name = nl_langinfo (CODESET);
 882   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 883       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 884       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 885       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 886 # endif
 887     dfa->is_utf8 = 1;
 888
 889   /* We check exhaustively in the loop below if this charset is a
 890      superset of ASCII.  */
 891   dfa->map_notascii = 0;
 892 #endif
 893
 894 #ifdef RE_ENABLE_I18N
 895   if (dfa->mb_cur_max > 1)
 896     {
 897       if (dfa->is_utf8)
 898         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 899       else
 900         {
 901           int i, j, ch;
 902
 903           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 904           if (__glibc_unlikely (dfa->sb_char == NULL))
 905             return REG_ESPACE;
 906
 907           /* Set the bits corresponding to single byte chars.  */
 908           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 909             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 910               {
 911                 wint_t wch = __btowc (ch);
 912                 if (wch != WEOF)
 913                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 914 # ifndef _LIBC
 915                 if (isascii (ch) && wch != ch)
 916                   dfa->map_notascii = 1;
 917 # endif
 918               }
 919         }
 920     }
 921 #endif
 922
 923   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
 924     return REG_ESPACE;
 925   return REG_NOERROR;
 926 }
 927
 928 /* Initialize WORD_CHAR table, which indicate which character is
 929    "word".  In this case "word" means that it is the word construction
 930    character used by some operators like "\<", "\>", etc.  */
 931
 932 static void
 933 init_word_char (re_dfa_t *dfa)
 934 {
 935   int i = 0;
 936   int j;
 937   int ch = 0;
 938   dfa->word_ops_used = 1;
 939   if (__glibc_likely (dfa->map_notascii == 0))
 940     {
 941       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 942          them, an issue when this code is used in Gnulib.  */
 943       bitset_word_t bits0 = 0x00000000;
 944       bitset_word_t bits1 = 0x03ff0000;
 945       bitset_word_t bits2 = 0x87fffffe;
 946       bitset_word_t bits3 = 0x07fffffe;
 947       if (BITSET_WORD_BITS == 64)
 948         {
 949           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 950           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 951           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 952           i = 2;
 953         }
 954       else if (BITSET_WORD_BITS == 32)
 955         {
 956           dfa->word_char[0] = bits0;
 957           dfa->word_char[1] = bits1;
 958           dfa->word_char[2] = bits2;
 959           dfa->word_char[3] = bits3;
 960           i = 4;
 961         }
 962       else
 963         goto general_case;
 964       ch = 128;
 965
 966       if (__glibc_likely (dfa->is_utf8))
 967         {
 968           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 969           return;
 970         }
 971     }
 972
 973  general_case:
 974   for (; i < BITSET_WORDS; ++i)
 975     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 976       if (isalnum (ch) || ch == '_')
 977         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 978 }
 979
 980 /* Free the work area which are only used while compiling.  */
 981
 982 static void
 983 free_workarea_compile (regex_t *preg)
 984 {
 985   re_dfa_t *dfa = preg->buffer;
 986   bin_tree_storage_t *storage, *next;
 987   for (storage = dfa->str_tree_storage; storage; storage = next)
 988     {
 989       next = storage->next;
 990       re_free (storage);
 991     }
 992   dfa->str_tree_storage = NULL;
 993   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 994   dfa->str_tree = NULL;
 995   re_free (dfa->org_indices);
 996   dfa->org_indices = NULL;
 997 }
 998
 999 /* Create initial states for all contexts.  */
1000
1001 static reg_errcode_t
1002 create_initial_state (re_dfa_t *dfa)
1003 {
1004   Idx first, i;
1005   reg_errcode_t err;
1006   re_node_set init_nodes;
1007
1008   /* Initial states have the epsilon closure of the node which is
1009      the first node of the regular expression.  */
1010   first = dfa->str_tree->first->node_idx;
1011   dfa->init_node = first;
1012   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013   if (__glibc_unlikely (err != REG_NOERROR))
1014     return err;
1015
1016   /* The back-references which are in initial states can epsilon transit,
1017      since in this case all of the subexpressions can be null.
1018      Then we add epsilon closures of the nodes which are the next nodes of
1019      the back-references.  */
1020   if (dfa->nbackref > 0)
1021     for (i = 0; i < init_nodes.nelem; ++i)
1022       {
1023         Idx node_idx = init_nodes.elems[i];
1024         re_token_type_t type = dfa->nodes[node_idx].type;
1025
1026         Idx clexp_idx;
1027         if (type != OP_BACK_REF)
1028           continue;
1029         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1030           {
1031             re_token_t *clexp_node;
1032             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033             if (clexp_node->type == OP_CLOSE_SUBEXP
1034                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035               break;
1036           }
1037         if (clexp_idx == init_nodes.nelem)
1038           continue;
1039
1040         if (type == OP_BACK_REF)
1041           {
1042             Idx dest_idx = dfa->edests[node_idx].elems[0];
1043             if (!re_node_set_contains (&init_nodes, dest_idx))
1044               {
1045                 reg_errcode_t merge_err
1046                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1047                 if (merge_err != REG_NOERROR)
1048                   return merge_err;
1049                 i = 0;
1050               }
1051           }
1052       }
1053
1054   /* It must be the first time to invoke acquire_state.  */
1055   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1056   /* We don't check ERR here, since the initial state must not be NULL.  */
1057   if (__glibc_unlikely (dfa->init_state == NULL))
1058     return err;
1059   if (dfa->init_state->has_constraint)
1060     {
1061       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1062                                                        CONTEXT_WORD);
1063       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1064                                                      CONTEXT_NEWLINE);
1065       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1066                                                          &init_nodes,
1067                                                          CONTEXT_NEWLINE
1068                                                          | CONTEXT_BEGBUF);
1069       if (__glibc_unlikely (dfa->init_state_word == NULL
1070                             || dfa->init_state_nl == NULL
1071                             || dfa->init_state_begbuf == NULL))
1072         return err;
1073     }
1074   else
1075     dfa->init_state_word = dfa->init_state_nl
1076       = dfa->init_state_begbuf = dfa->init_state;
1077
1078   re_node_set_free (&init_nodes);
1079   return REG_NOERROR;
1080 }
1081 \f
1082 #ifdef RE_ENABLE_I18N
1083 /* If it is possible to do searching in single byte encoding instead of UTF-8
1084    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085    DFA nodes where needed.  */
1086
1087 static void
1088 optimize_utf8 (re_dfa_t *dfa)
1089 {
1090   Idx node;
1091   int i;
1092   bool mb_chars = false;
1093   bool has_period = false;
1094
1095   for (node = 0; node < dfa->nodes_len; ++node)
1096     switch (dfa->nodes[node].type)
1097       {
1098       case CHARACTER:
1099         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1100           mb_chars = true;
1101         break;
1102       case ANCHOR:
1103         switch (dfa->nodes[node].opr.ctx_type)
1104           {
1105           case LINE_FIRST:
1106           case LINE_LAST:
1107           case BUF_FIRST:
1108           case BUF_LAST:
1109             break;
1110           default:
1111             /* Word anchors etc. cannot be handled.  It's okay to test
1112                opr.ctx_type since constraints (for all DFA nodes) are
1113                created by ORing one or more opr.ctx_type values.  */
1114             return;
1115           }
1116         break;
1117       case OP_PERIOD:
1118         has_period = true;
1119         break;
1120       case OP_BACK_REF:
1121       case OP_ALT:
1122       case END_OF_RE:
1123       case OP_DUP_ASTERISK:
1124       case OP_OPEN_SUBEXP:
1125       case OP_CLOSE_SUBEXP:
1126         break;
1127       case COMPLEX_BRACKET:
1128         return;
1129       case SIMPLE_BRACKET:
1130         /* Just double check.  */
1131         {
1132           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1133                         ? 0
1134                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1135           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1136             {
1137               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1138                 return;
1139               rshift = 0;
1140             }
1141         }
1142         break;
1143       default:
1144         abort ();
1145       }
1146
1147   if (mb_chars || has_period)
1148     for (node = 0; node < dfa->nodes_len; ++node)
1149       {
1150         if (dfa->nodes[node].type == CHARACTER
1151             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1152           dfa->nodes[node].mb_partial = 0;
1153         else if (dfa->nodes[node].type == OP_PERIOD)
1154           dfa->nodes[node].type = OP_UTF8_PERIOD;
1155       }
1156
1157   /* The search can be in single byte locale.  */
1158   dfa->mb_cur_max = 1;
1159   dfa->is_utf8 = 0;
1160   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1161 }
1162 #endif
1163 \f
1164 /* Analyze the structure tree, and calculate "first", "next", "edest",
1165    "eclosure", and "inveclosure".  */
1166
1167 static reg_errcode_t
1168 analyze (regex_t *preg)
1169 {
1170   re_dfa_t *dfa = preg->buffer;
1171   reg_errcode_t ret;
1172
1173   /* Allocate arrays.  */
1174   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1175   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1176   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1177   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1178   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1179                         || dfa->edests == NULL || dfa->eclosures == NULL))
1180     return REG_ESPACE;
1181
1182   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1183   if (dfa->subexp_map != NULL)
1184     {
1185       Idx i;
1186       for (i = 0; i < preg->re_nsub; i++)
1187         dfa->subexp_map[i] = i;
1188       preorder (dfa->str_tree, optimize_subexps, dfa);
1189       for (i = 0; i < preg->re_nsub; i++)
1190         if (dfa->subexp_map[i] != i)
1191           break;
1192       if (i == preg->re_nsub)
1193         {
1194           re_free (dfa->subexp_map);
1195           dfa->subexp_map = NULL;
1196         }
1197     }
1198
1199   ret = postorder (dfa->str_tree, lower_subexps, preg);
1200   if (__glibc_unlikely (ret != REG_NOERROR))
1201     return ret;
1202   ret = postorder (dfa->str_tree, calc_first, dfa);
1203   if (__glibc_unlikely (ret != REG_NOERROR))
1204     return ret;
1205   preorder (dfa->str_tree, calc_next, dfa);
1206   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1207   if (__glibc_unlikely (ret != REG_NOERROR))
1208     return ret;
1209   ret = calc_eclosure (dfa);
1210   if (__glibc_unlikely (ret != REG_NOERROR))
1211     return ret;
1212
1213   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1214      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1215   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1216       || dfa->nbackref)
1217     {
1218       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1219       if (__glibc_unlikely (dfa->inveclosures == NULL))
1220         return REG_ESPACE;
1221       ret = calc_inveclosure (dfa);
1222     }
1223
1224   return ret;
1225 }
1226
1227 /* Our parse trees are very unbalanced, so we cannot use a stack to
1228    implement parse tree visits.  Instead, we use parent pointers and
1229    some hairy code in these two functions.  */
1230 static reg_errcode_t
1231 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1232            void *extra)
1233 {
1234   bin_tree_t *node, *prev;
1235
1236   for (node = root; ; )
1237     {
1238       /* Descend down the tree, preferably to the left (or to the right
1239          if that's the only child).  */
1240       while (node->left || node->right)
1241         if (node->left)
1242           node = node->left;
1243         else
1244           node = node->right;
1245
1246       do
1247         {
1248           reg_errcode_t err = fn (extra, node);
1249           if (__glibc_unlikely (err != REG_NOERROR))
1250             return err;
1251           if (node->parent == NULL)
1252             return REG_NOERROR;
1253           prev = node;
1254           node = node->parent;
1255         }
1256       /* Go up while we have a node that is reached from the right.  */
1257       while (node->right == prev || node->right == NULL);
1258       node = node->right;
1259     }
1260 }
1261
1262 static reg_errcode_t
1263 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1264           void *extra)
1265 {
1266   bin_tree_t *node;
1267
1268   for (node = root; ; )
1269     {
1270       reg_errcode_t err = fn (extra, node);
1271       if (__glibc_unlikely (err != REG_NOERROR))
1272         return err;
1273
1274       /* Go to the left node, or up and to the right.  */
1275       if (node->left)
1276         node = node->left;
1277       else
1278         {
1279           bin_tree_t *prev = NULL;
1280           while (node->right == prev || node->right == NULL)
1281             {
1282               prev = node;
1283               node = node->parent;
1284               if (!node)
1285                 return REG_NOERROR;
1286             }
1287           node = node->right;
1288         }
1289     }
1290 }
1291
1292 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1293    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1294    backreferences as well.  Requires a preorder visit.  */
1295 static reg_errcode_t
1296 optimize_subexps (void *extra, bin_tree_t *node)
1297 {
1298   re_dfa_t *dfa = (re_dfa_t *) extra;
1299
1300   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1301     {
1302       int idx = node->token.opr.idx;
1303       node->token.opr.idx = dfa->subexp_map[idx];
1304       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1305     }
1306
1307   else if (node->token.type == SUBEXP
1308            && node->left && node->left->token.type == SUBEXP)
1309     {
1310       Idx other_idx = node->left->token.opr.idx;
1311
1312       node->left = node->left->left;
1313       if (node->left)
1314         node->left->parent = node;
1315
1316       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1317       if (other_idx < BITSET_WORD_BITS)
1318         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1319     }
1320
1321   return REG_NOERROR;
1322 }
1323
1324 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1325    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1326 static reg_errcode_t
1327 lower_subexps (void *extra, bin_tree_t *node)
1328 {
1329   regex_t *preg = (regex_t *) extra;
1330   reg_errcode_t err = REG_NOERROR;
1331
1332   if (node->left && node->left->token.type == SUBEXP)
1333     {
1334       node->left = lower_subexp (&err, preg, node->left);
1335       if (node->left)
1336         node->left->parent = node;
1337     }
1338   if (node->right && node->right->token.type == SUBEXP)
1339     {
1340       node->right = lower_subexp (&err, preg, node->right);
1341       if (node->right)
1342         node->right->parent = node;
1343     }
1344
1345   return err;
1346 }
1347
1348 static bin_tree_t *
1349 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1350 {
1351   re_dfa_t *dfa = preg->buffer;
1352   bin_tree_t *body = node->left;
1353   bin_tree_t *op, *cls, *tree1, *tree;
1354
1355   if (preg->no_sub
1356       /* We do not optimize empty subexpressions, because otherwise we may
1357          have bad CONCAT nodes with NULL children.  This is obviously not
1358          very common, so we do not lose much.  An example that triggers
1359          this case is the sed "script" /\(\)/x.  */
1360       && node->left != NULL
1361       && (node->token.opr.idx >= BITSET_WORD_BITS
1362           || !(dfa->used_bkref_map
1363                & ((bitset_word_t) 1 << node->token.opr.idx))))
1364     return node->left;
1365
1366   /* Convert the SUBEXP node to the concatenation of an
1367      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1368   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1369   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1370   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1371   tree = create_tree (dfa, op, tree1, CONCAT);
1372   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1373                         || op == NULL || cls == NULL))
1374     {
1375       *err = REG_ESPACE;
1376       return NULL;
1377     }
1378
1379   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1380   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1381   return tree;
1382 }
1383
1384 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1385    nodes.  Requires a postorder visit.  */
1386 static reg_errcode_t
1387 calc_first (void *extra, bin_tree_t *node)
1388 {
1389   re_dfa_t *dfa = (re_dfa_t *) extra;
1390   if (node->token.type == CONCAT)
1391     {
1392       node->first = node->left->first;
1393       node->node_idx = node->left->node_idx;
1394     }
1395   else
1396     {
1397       node->first = node;
1398       node->node_idx = re_dfa_add_node (dfa, node->token);
1399       if (__glibc_unlikely (node->node_idx == -1))
1400         return REG_ESPACE;
1401       if (node->token.type == ANCHOR)
1402         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1403     }
1404   return REG_NOERROR;
1405 }
1406
1407 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1408 static reg_errcode_t
1409 calc_next (void *extra, bin_tree_t *node)
1410 {
1411   switch (node->token.type)
1412     {
1413     case OP_DUP_ASTERISK:
1414       node->left->next = node;
1415       break;
1416     case CONCAT:
1417       node->left->next = node->right->first;
1418       node->right->next = node->next;
1419       break;
1420     default:
1421       if (node->left)
1422         node->left->next = node->next;
1423       if (node->right)
1424         node->right->next = node->next;
1425       break;
1426     }
1427   return REG_NOERROR;
1428 }
1429
1430 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1431 static reg_errcode_t
1432 link_nfa_nodes (void *extra, bin_tree_t *node)
1433 {
1434   re_dfa_t *dfa = (re_dfa_t *) extra;
1435   Idx idx = node->node_idx;
1436   reg_errcode_t err = REG_NOERROR;
1437
1438   switch (node->token.type)
1439     {
1440     case CONCAT:
1441       break;
1442
1443     case END_OF_RE:
1444       DEBUG_ASSERT (node->next == NULL);
1445       break;
1446
1447     case OP_DUP_ASTERISK:
1448     case OP_ALT:
1449       {
1450         Idx left, right;
1451         dfa->has_plural_match = 1;
1452         if (node->left != NULL)
1453           left = node->left->first->node_idx;
1454         else
1455           left = node->next->node_idx;
1456         if (node->right != NULL)
1457           right = node->right->first->node_idx;
1458         else
1459           right = node->next->node_idx;
1460         DEBUG_ASSERT (left > -1);
1461         DEBUG_ASSERT (right > -1);
1462         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1463       }
1464       break;
1465
1466     case ANCHOR:
1467     case OP_OPEN_SUBEXP:
1468     case OP_CLOSE_SUBEXP:
1469       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1470       break;
1471
1472     case OP_BACK_REF:
1473       dfa->nexts[idx] = node->next->node_idx;
1474       if (node->token.type == OP_BACK_REF)
1475         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1476       break;
1477
1478     default:
1479       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1480       dfa->nexts[idx] = node->next->node_idx;
1481       break;
1482     }
1483
1484   return err;
1485 }
1486
1487 /* Duplicate the epsilon closure of the node ROOT_NODE.
1488    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1489    to their own constraint.  */
1490
1491 static reg_errcode_t
1492 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1493                         Idx root_node, unsigned int init_constraint)
1494 {
1495   Idx org_node, clone_node;
1496   bool ok;
1497   unsigned int constraint = init_constraint;
1498   for (org_node = top_org_node, clone_node = top_clone_node;;)
1499     {
1500       Idx org_dest, clone_dest;
1501       if (dfa->nodes[org_node].type == OP_BACK_REF)
1502         {
1503           /* If the back reference epsilon-transit, its destination must
1504              also have the constraint.  Then duplicate the epsilon closure
1505              of the destination of the back reference, and store it in
1506              edests of the back reference.  */
1507           org_dest = dfa->nexts[org_node];
1508           re_node_set_empty (dfa->edests + clone_node);
1509           clone_dest = duplicate_node (dfa, org_dest, constraint);
1510           if (__glibc_unlikely (clone_dest == -1))
1511             return REG_ESPACE;
1512           dfa->nexts[clone_node] = dfa->nexts[org_node];
1513           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1514           if (__glibc_unlikely (! ok))
1515             return REG_ESPACE;
1516         }
1517       else if (dfa->edests[org_node].nelem == 0)
1518         {
1519           /* In case of the node can't epsilon-transit, don't duplicate the
1520              destination and store the original destination as the
1521              destination of the node.  */
1522           dfa->nexts[clone_node] = dfa->nexts[org_node];
1523           break;
1524         }
1525       else if (dfa->edests[org_node].nelem == 1)
1526         {
1527           /* In case of the node can epsilon-transit, and it has only one
1528              destination.  */
1529           org_dest = dfa->edests[org_node].elems[0];
1530           re_node_set_empty (dfa->edests + clone_node);
1531           /* If the node is root_node itself, it means the epsilon closure
1532              has a loop.  Then tie it to the destination of the root_node.  */
1533           if (org_node == root_node && clone_node != org_node)
1534             {
1535               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1536               if (__glibc_unlikely (! ok))
1537                 return REG_ESPACE;
1538               break;
1539             }
1540           /* In case the node has another constraint, append it.  */
1541           constraint |= dfa->nodes[org_node].constraint;
1542           clone_dest = duplicate_node (dfa, org_dest, constraint);
1543           if (__glibc_unlikely (clone_dest == -1))
1544             return REG_ESPACE;
1545           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546           if (__glibc_unlikely (! ok))
1547             return REG_ESPACE;
1548         }
1549       else /* dfa->edests[org_node].nelem == 2 */
1550         {
1551           /* In case of the node can epsilon-transit, and it has two
1552              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1553           org_dest = dfa->edests[org_node].elems[0];
1554           re_node_set_empty (dfa->edests + clone_node);
1555           /* Search for a duplicated node which satisfies the constraint.  */
1556           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1557           if (clone_dest == -1)
1558             {
1559               /* There is no such duplicated node, create a new one.  */
1560               reg_errcode_t err;
1561               clone_dest = duplicate_node (dfa, org_dest, constraint);
1562               if (__glibc_unlikely (clone_dest == -1))
1563                 return REG_ESPACE;
1564               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565               if (__glibc_unlikely (! ok))
1566                 return REG_ESPACE;
1567               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1568                                             root_node, constraint);
1569               if (__glibc_unlikely (err != REG_NOERROR))
1570                 return err;
1571             }
1572           else
1573             {
1574               /* There is a duplicated node which satisfies the constraint,
1575                  use it to avoid infinite loop.  */
1576               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1577               if (__glibc_unlikely (! ok))
1578                 return REG_ESPACE;
1579             }
1580
1581           org_dest = dfa->edests[org_node].elems[1];
1582           clone_dest = duplicate_node (dfa, org_dest, constraint);
1583           if (__glibc_unlikely (clone_dest == -1))
1584             return REG_ESPACE;
1585           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1586           if (__glibc_unlikely (! ok))
1587             return REG_ESPACE;
1588         }
1589       org_node = org_dest;
1590       clone_node = clone_dest;
1591     }
1592   return REG_NOERROR;
1593 }
1594
1595 /* Search for a node which is duplicated from the node ORG_NODE, and
1596    satisfies the constraint CONSTRAINT.  */
1597
1598 static Idx
1599 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1600                         unsigned int constraint)
1601 {
1602   Idx idx;
1603   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1604     {
1605       if (org_node == dfa->org_indices[idx]
1606           && constraint == dfa->nodes[idx].constraint)
1607         return idx; /* Found.  */
1608     }
1609   return -1; /* Not found.  */
1610 }
1611
1612 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1613    Return the index of the new node, or -1 if insufficient storage is
1614    available.  */
1615
1616 static Idx
1617 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1618 {
1619   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1620   if (__glibc_likely (dup_idx != -1))
1621     {
1622       dfa->nodes[dup_idx].constraint = constraint;
1623       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1624       dfa->nodes[dup_idx].duplicated = 1;
1625
1626       /* Store the index of the original node.  */
1627       dfa->org_indices[dup_idx] = org_idx;
1628     }
1629   return dup_idx;
1630 }
1631
1632 static reg_errcode_t
1633 calc_inveclosure (re_dfa_t *dfa)
1634 {
1635   Idx src, idx;
1636   bool ok;
1637   for (idx = 0; idx < dfa->nodes_len; ++idx)
1638     re_node_set_init_empty (dfa->inveclosures + idx);
1639
1640   for (src = 0; src < dfa->nodes_len; ++src)
1641     {
1642       Idx *elems = dfa->eclosures[src].elems;
1643       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1644         {
1645           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1646           if (__glibc_unlikely (! ok))
1647             return REG_ESPACE;
1648         }
1649     }
1650
1651   return REG_NOERROR;
1652 }
1653
1654 /* Calculate "eclosure" for all the node in DFA.  */
1655
1656 static reg_errcode_t
1657 calc_eclosure (re_dfa_t *dfa)
1658 {
1659   Idx node_idx;
1660   bool incomplete;
1661   DEBUG_ASSERT (dfa->nodes_len > 0);
1662   incomplete = false;
1663   /* For each nodes, calculate epsilon closure.  */
1664   for (node_idx = 0; ; ++node_idx)
1665     {
1666       reg_errcode_t err;
1667       re_node_set eclosure_elem;
1668       if (node_idx == dfa->nodes_len)
1669         {
1670           if (!incomplete)
1671             break;
1672           incomplete = false;
1673           node_idx = 0;
1674         }
1675
1676       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1677
1678       /* If we have already calculated, skip it.  */
1679       if (dfa->eclosures[node_idx].nelem != 0)
1680         continue;
1681       /* Calculate epsilon closure of 'node_idx'.  */
1682       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1683       if (__glibc_unlikely (err != REG_NOERROR))
1684         return err;
1685
1686       if (dfa->eclosures[node_idx].nelem == 0)
1687         {
1688           incomplete = true;
1689           re_node_set_free (&eclosure_elem);
1690         }
1691     }
1692   return REG_NOERROR;
1693 }
1694
1695 /* Calculate epsilon closure of NODE.  */
1696
1697 static reg_errcode_t
1698 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1699 {
1700   reg_errcode_t err;
1701   Idx i;
1702   re_node_set eclosure;
1703   bool incomplete = false;
1704   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1705   if (__glibc_unlikely (err != REG_NOERROR))
1706     return err;
1707
1708   /* An epsilon closure includes itself.  */
1709   eclosure.elems[eclosure.nelem++] = node;
1710
1711   /* This indicates that we are calculating this node now.
1712      We reference this value to avoid infinite loop.  */
1713   dfa->eclosures[node].nelem = -1;
1714
1715   /* If the current node has constraints, duplicate all nodes
1716      since they must inherit the constraints.  */
1717   if (dfa->nodes[node].constraint
1718       && dfa->edests[node].nelem
1719       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1720     {
1721       err = duplicate_node_closure (dfa, node, node, node,
1722                                     dfa->nodes[node].constraint);
1723       if (__glibc_unlikely (err != REG_NOERROR))
1724         return err;
1725     }
1726
1727   /* Expand each epsilon destination nodes.  */
1728   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1729     for (i = 0; i < dfa->edests[node].nelem; ++i)
1730       {
1731         re_node_set eclosure_elem;
1732         Idx edest = dfa->edests[node].elems[i];
1733         /* If calculating the epsilon closure of 'edest' is in progress,
1734            return intermediate result.  */
1735         if (dfa->eclosures[edest].nelem == -1)
1736           {
1737             incomplete = true;
1738             continue;
1739           }
1740         /* If we haven't calculated the epsilon closure of 'edest' yet,
1741            calculate now. Otherwise use calculated epsilon closure.  */
1742         if (dfa->eclosures[edest].nelem == 0)
1743           {
1744             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1745             if (__glibc_unlikely (err != REG_NOERROR))
1746               return err;
1747           }
1748         else
1749           eclosure_elem = dfa->eclosures[edest];
1750         /* Merge the epsilon closure of 'edest'.  */
1751         err = re_node_set_merge (&eclosure, &eclosure_elem);
1752         if (__glibc_unlikely (err != REG_NOERROR))
1753           return err;
1754         /* If the epsilon closure of 'edest' is incomplete,
1755            the epsilon closure of this node is also incomplete.  */
1756         if (dfa->eclosures[edest].nelem == 0)
1757           {
1758             incomplete = true;
1759             re_node_set_free (&eclosure_elem);
1760           }
1761       }
1762
1763   if (incomplete && !root)
1764     dfa->eclosures[node].nelem = 0;
1765   else
1766     dfa->eclosures[node] = eclosure;
1767   *new_set = eclosure;
1768   return REG_NOERROR;
1769 }
1770 \f
1771 /* Functions for token which are used in the parser.  */
1772
1773 /* Fetch a token from INPUT.
1774    We must not use this function inside bracket expressions.  */
1775
1776 static void
1777 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1778 {
1779   re_string_skip_bytes (input, peek_token (result, input, syntax));
1780 }
1781
1782 /* Peek a token from INPUT, and return the length of the token.
1783    We must not use this function inside bracket expressions.  */
1784
1785 static int
1786 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1787 {
1788   unsigned char c;
1789
1790   if (re_string_eoi (input))
1791     {
1792       token->type = END_OF_RE;
1793       return 0;
1794     }
1795
1796   c = re_string_peek_byte (input, 0);
1797   token->opr.c = c;
1798
1799   token->word_char = 0;
1800 #ifdef RE_ENABLE_I18N
1801   token->mb_partial = 0;
1802   if (input->mb_cur_max > 1
1803       && !re_string_first_byte (input, re_string_cur_idx (input)))
1804     {
1805       token->type = CHARACTER;
1806       token->mb_partial = 1;
1807       return 1;
1808     }
1809 #endif
1810   if (c == '\\')
1811     {
1812       unsigned char c2;
1813       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1814         {
1815           token->type = BACK_SLASH;
1816           return 1;
1817         }
1818
1819       c2 = re_string_peek_byte_case (input, 1);
1820       token->opr.c = c2;
1821       token->type = CHARACTER;
1822 #ifdef RE_ENABLE_I18N
1823       if (input->mb_cur_max > 1)
1824         {
1825           wint_t wc = re_string_wchar_at (input,
1826                                           re_string_cur_idx (input) + 1);
1827           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1828         }
1829       else
1830 #endif
1831         token->word_char = IS_WORD_CHAR (c2) != 0;
1832
1833       switch (c2)
1834         {
1835         case '|':
1836           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1837             token->type = OP_ALT;
1838           break;
1839         case '1': case '2': case '3': case '4': case '5':
1840         case '6': case '7': case '8': case '9':
1841           if (!(syntax & RE_NO_BK_REFS))
1842             {
1843               token->type = OP_BACK_REF;
1844               token->opr.idx = c2 - '1';
1845             }
1846           break;
1847         case '<':
1848           if (!(syntax & RE_NO_GNU_OPS))
1849             {
1850               token->type = ANCHOR;
1851               token->opr.ctx_type = WORD_FIRST;
1852             }
1853           break;
1854         case '>':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             {
1857               token->type = ANCHOR;
1858               token->opr.ctx_type = WORD_LAST;
1859             }
1860           break;
1861         case 'b':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             {
1864               token->type = ANCHOR;
1865               token->opr.ctx_type = WORD_DELIM;
1866             }
1867           break;
1868         case 'B':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             {
1871               token->type = ANCHOR;
1872               token->opr.ctx_type = NOT_WORD_DELIM;
1873             }
1874           break;
1875         case 'w':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_WORD;
1878           break;
1879         case 'W':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_NOTWORD;
1882           break;
1883         case 's':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             token->type = OP_SPACE;
1886           break;
1887         case 'S':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             token->type = OP_NOTSPACE;
1890           break;
1891         case '`':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = BUF_FIRST;
1896             }
1897           break;
1898         case '\'':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             {
1901               token->type = ANCHOR;
1902               token->opr.ctx_type = BUF_LAST;
1903             }
1904           break;
1905         case '(':
1906           if (!(syntax & RE_NO_BK_PARENS))
1907             token->type = OP_OPEN_SUBEXP;
1908           break;
1909         case ')':
1910           if (!(syntax & RE_NO_BK_PARENS))
1911             token->type = OP_CLOSE_SUBEXP;
1912           break;
1913         case '+':
1914           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915             token->type = OP_DUP_PLUS;
1916           break;
1917         case '?':
1918           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1919             token->type = OP_DUP_QUESTION;
1920           break;
1921         case '{':
1922           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923             token->type = OP_OPEN_DUP_NUM;
1924           break;
1925         case '}':
1926           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1927             token->type = OP_CLOSE_DUP_NUM;
1928           break;
1929         default:
1930           break;
1931         }
1932       return 2;
1933     }
1934
1935   token->type = CHARACTER;
1936 #ifdef RE_ENABLE_I18N
1937   if (input->mb_cur_max > 1)
1938     {
1939       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1940       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1941     }
1942   else
1943 #endif
1944     token->word_char = IS_WORD_CHAR (token->opr.c);
1945
1946   switch (c)
1947     {
1948     case '\n':
1949       if (syntax & RE_NEWLINE_ALT)
1950         token->type = OP_ALT;
1951       break;
1952     case '|':
1953       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1954         token->type = OP_ALT;
1955       break;
1956     case '*':
1957       token->type = OP_DUP_ASTERISK;
1958       break;
1959     case '+':
1960       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961         token->type = OP_DUP_PLUS;
1962       break;
1963     case '?':
1964       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1965         token->type = OP_DUP_QUESTION;
1966       break;
1967     case '{':
1968       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969         token->type = OP_OPEN_DUP_NUM;
1970       break;
1971     case '}':
1972       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1973         token->type = OP_CLOSE_DUP_NUM;
1974       break;
1975     case '(':
1976       if (syntax & RE_NO_BK_PARENS)
1977         token->type = OP_OPEN_SUBEXP;
1978       break;
1979     case ')':
1980       if (syntax & RE_NO_BK_PARENS)
1981         token->type = OP_CLOSE_SUBEXP;
1982       break;
1983     case '[':
1984       token->type = OP_OPEN_BRACKET;
1985       break;
1986     case '.':
1987       token->type = OP_PERIOD;
1988       break;
1989     case '^':
1990       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1991           && re_string_cur_idx (input) != 0)
1992         {
1993           char prev = re_string_peek_byte (input, -1);
1994           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1995             break;
1996         }
1997       token->type = ANCHOR;
1998       token->opr.ctx_type = LINE_FIRST;
1999       break;
2000     case '$':
2001       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2002           && re_string_cur_idx (input) + 1 != re_string_length (input))
2003         {
2004           re_token_t next;
2005           re_string_skip_bytes (input, 1);
2006           peek_token (&next, input, syntax);
2007           re_string_skip_bytes (input, -1);
2008           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2009             break;
2010         }
2011       token->type = ANCHOR;
2012       token->opr.ctx_type = LINE_LAST;
2013       break;
2014     default:
2015       break;
2016     }
2017   return 1;
2018 }
2019
2020 /* Peek a token from INPUT, and return the length of the token.
2021    We must not use this function out of bracket expressions.  */
2022
2023 static int
2024 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2025 {
2026   unsigned char c;
2027   if (re_string_eoi (input))
2028     {
2029       token->type = END_OF_RE;
2030       return 0;
2031     }
2032   c = re_string_peek_byte (input, 0);
2033   token->opr.c = c;
2034
2035 #ifdef RE_ENABLE_I18N
2036   if (input->mb_cur_max > 1
2037       && !re_string_first_byte (input, re_string_cur_idx (input)))
2038     {
2039       token->type = CHARACTER;
2040       return 1;
2041     }
2042 #endif /* RE_ENABLE_I18N */
2043
2044   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2045       && re_string_cur_idx (input) + 1 < re_string_length (input))
2046     {
2047       /* In this case, '\' escape a character.  */
2048       unsigned char c2;
2049       re_string_skip_bytes (input, 1);
2050       c2 = re_string_peek_byte (input, 0);
2051       token->opr.c = c2;
2052       token->type = CHARACTER;
2053       return 1;
2054     }
2055   if (c == '[') /* '[' is a special char in a bracket exps.  */
2056     {
2057       unsigned char c2;
2058       int token_len;
2059       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2060         c2 = re_string_peek_byte (input, 1);
2061       else
2062         c2 = 0;
2063       token->opr.c = c2;
2064       token_len = 2;
2065       switch (c2)
2066         {
2067         case '.':
2068           token->type = OP_OPEN_COLL_ELEM;
2069           break;
2070
2071         case '=':
2072           token->type = OP_OPEN_EQUIV_CLASS;
2073           break;
2074
2075         case ':':
2076           if (syntax & RE_CHAR_CLASSES)
2077             {
2078               token->type = OP_OPEN_CHAR_CLASS;
2079               break;
2080             }
2081           FALLTHROUGH;
2082         default:
2083           token->type = CHARACTER;
2084           token->opr.c = c;
2085           token_len = 1;
2086           break;
2087         }
2088       return token_len;
2089     }
2090   switch (c)
2091     {
2092     case '-':
2093       token->type = OP_CHARSET_RANGE;
2094       break;
2095     case ']':
2096       token->type = OP_CLOSE_BRACKET;
2097       break;
2098     case '^':
2099       token->type = OP_NON_MATCH_LIST;
2100       break;
2101     default:
2102       token->type = CHARACTER;
2103     }
2104   return 1;
2105 }
2106 \f
2107 /* Functions for parser.  */
2108
2109 /* Entry point of the parser.
2110    Parse the regular expression REGEXP and return the structure tree.
2111    If an error occurs, ERR is set by error code, and return NULL.
2112    This function build the following tree, from regular expression <reg_exp>:
2113            CAT
2114            / \
2115           /   \
2116    <reg_exp>  EOR
2117
2118    CAT means concatenation.
2119    EOR means end of regular expression.  */
2120
2121 static bin_tree_t *
2122 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2123        reg_errcode_t *err)
2124 {
2125   re_dfa_t *dfa = preg->buffer;
2126   bin_tree_t *tree, *eor, *root;
2127   re_token_t current_token;
2128   dfa->syntax = syntax;
2129   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2130   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2131   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2132     return NULL;
2133   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2134   if (tree != NULL)
2135     root = create_tree (dfa, tree, eor, CONCAT);
2136   else
2137     root = eor;
2138   if (__glibc_unlikely (eor == NULL || root == NULL))
2139     {
2140       *err = REG_ESPACE;
2141       return NULL;
2142     }
2143   return root;
2144 }
2145
2146 /* This function build the following tree, from regular expression
2147    <branch1>|<branch2>:
2148            ALT
2149            / \
2150           /   \
2151    <branch1> <branch2>
2152
2153    ALT means alternative, which represents the operator '|'.  */
2154
2155 static bin_tree_t *
2156 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2157                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2158 {
2159   re_dfa_t *dfa = preg->buffer;
2160   bin_tree_t *tree, *branch = NULL;
2161   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2162   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2163   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2164     return NULL;
2165
2166   while (token->type == OP_ALT)
2167     {
2168       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2169       if (token->type != OP_ALT && token->type != END_OF_RE
2170           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2171         {
2172           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2173           dfa->completed_bkref_map = initial_bkref_map;
2174           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2175           if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2176             {
2177               if (tree != NULL)
2178                 postorder (tree, free_tree, NULL);
2179               return NULL;
2180             }
2181           dfa->completed_bkref_map |= accumulated_bkref_map;
2182         }
2183       else
2184         branch = NULL;
2185       tree = create_tree (dfa, tree, branch, OP_ALT);
2186       if (__glibc_unlikely (tree == NULL))
2187         {
2188           *err = REG_ESPACE;
2189           return NULL;
2190         }
2191     }
2192   return tree;
2193 }
2194
2195 /* This function build the following tree, from regular expression
2196    <exp1><exp2>:
2197         CAT
2198         / \
2199        /   \
2200    <exp1> <exp2>
2201
2202    CAT means concatenation.  */
2203
2204 static bin_tree_t *
2205 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2206               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2207 {
2208   bin_tree_t *tree, *expr;
2209   re_dfa_t *dfa = preg->buffer;
2210   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2211   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2212     return NULL;
2213
2214   while (token->type != OP_ALT && token->type != END_OF_RE
2215          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2216     {
2217       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2218       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2219         {
2220           if (tree != NULL)
2221             postorder (tree, free_tree, NULL);
2222           return NULL;
2223         }
2224       if (tree != NULL && expr != NULL)
2225         {
2226           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2227           if (newtree == NULL)
2228             {
2229               postorder (expr, free_tree, NULL);
2230               postorder (tree, free_tree, NULL);
2231               *err = REG_ESPACE;
2232               return NULL;
2233             }
2234           tree = newtree;
2235         }
2236       else if (tree == NULL)
2237         tree = expr;
2238       /* Otherwise expr == NULL, we don't need to create new tree.  */
2239     }
2240   return tree;
2241 }
2242
2243 /* This function build the following tree, from regular expression a*:
2244          *
2245          |
2246          a
2247 */
2248
2249 static bin_tree_t *
2250 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2251                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2252 {
2253   re_dfa_t *dfa = preg->buffer;
2254   bin_tree_t *tree;
2255   switch (token->type)
2256     {
2257     case CHARACTER:
2258       tree = create_token_tree (dfa, NULL, NULL, token);
2259       if (__glibc_unlikely (tree == NULL))
2260         {
2261           *err = REG_ESPACE;
2262           return NULL;
2263         }
2264 #ifdef RE_ENABLE_I18N
2265       if (dfa->mb_cur_max > 1)
2266         {
2267           while (!re_string_eoi (regexp)
2268                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2269             {
2270               bin_tree_t *mbc_remain;
2271               fetch_token (token, regexp, syntax);
2272               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2273               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2274               if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2275                 {
2276                   *err = REG_ESPACE;
2277                   return NULL;
2278                 }
2279             }
2280         }
2281 #endif
2282       break;
2283
2284     case OP_OPEN_SUBEXP:
2285       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2286       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2287         return NULL;
2288       break;
2289
2290     case OP_OPEN_BRACKET:
2291       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2292       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2293         return NULL;
2294       break;
2295
2296     case OP_BACK_REF:
2297       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2298         {
2299           *err = REG_ESUBREG;
2300           return NULL;
2301         }
2302       dfa->used_bkref_map |= 1 << token->opr.idx;
2303       tree = create_token_tree (dfa, NULL, NULL, token);
2304       if (__glibc_unlikely (tree == NULL))
2305         {
2306           *err = REG_ESPACE;
2307           return NULL;
2308         }
2309       ++dfa->nbackref;
2310       dfa->has_mb_node = 1;
2311       break;
2312
2313     case OP_OPEN_DUP_NUM:
2314       if (syntax & RE_CONTEXT_INVALID_DUP)
2315         {
2316           *err = REG_BADRPT;
2317           return NULL;
2318         }
2319       FALLTHROUGH;
2320     case OP_DUP_ASTERISK:
2321     case OP_DUP_PLUS:
2322     case OP_DUP_QUESTION:
2323       if (syntax & RE_CONTEXT_INVALID_OPS)
2324         {
2325           *err = REG_BADRPT;
2326           return NULL;
2327         }
2328       else if (syntax & RE_CONTEXT_INDEP_OPS)
2329         {
2330           fetch_token (token, regexp, syntax);
2331           return parse_expression (regexp, preg, token, syntax, nest, err);
2332         }
2333       FALLTHROUGH;
2334     case OP_CLOSE_SUBEXP:
2335       if ((token->type == OP_CLOSE_SUBEXP)
2336           && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2337         {
2338           *err = REG_ERPAREN;
2339           return NULL;
2340         }
2341       FALLTHROUGH;
2342     case OP_CLOSE_DUP_NUM:
2343       /* We treat it as a normal character.  */
2344
2345       /* Then we can these characters as normal characters.  */
2346       token->type = CHARACTER;
2347       /* mb_partial and word_char bits should be initialized already
2348          by peek_token.  */
2349       tree = create_token_tree (dfa, NULL, NULL, token);
2350       if (__glibc_unlikely (tree == NULL))
2351         {
2352           *err = REG_ESPACE;
2353           return NULL;
2354         }
2355       break;
2356
2357     case ANCHOR:
2358       if ((token->opr.ctx_type
2359            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360           && dfa->word_ops_used == 0)
2361         init_word_char (dfa);
2362       if (token->opr.ctx_type == WORD_DELIM
2363           || token->opr.ctx_type == NOT_WORD_DELIM)
2364         {
2365           bin_tree_t *tree_first, *tree_last;
2366           if (token->opr.ctx_type == WORD_DELIM)
2367             {
2368               token->opr.ctx_type = WORD_FIRST;
2369               tree_first = create_token_tree (dfa, NULL, NULL, token);
2370               token->opr.ctx_type = WORD_LAST;
2371             }
2372           else
2373             {
2374               token->opr.ctx_type = INSIDE_WORD;
2375               tree_first = create_token_tree (dfa, NULL, NULL, token);
2376               token->opr.ctx_type = INSIDE_NOTWORD;
2377             }
2378           tree_last = create_token_tree (dfa, NULL, NULL, token);
2379           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380           if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2381                                 || tree == NULL))
2382             {
2383               *err = REG_ESPACE;
2384               return NULL;
2385             }
2386         }
2387       else
2388         {
2389           tree = create_token_tree (dfa, NULL, NULL, token);
2390           if (__glibc_unlikely (tree == NULL))
2391             {
2392               *err = REG_ESPACE;
2393               return NULL;
2394             }
2395         }
2396       /* We must return here, since ANCHORs can't be followed
2397          by repetition operators.
2398          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2399              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2400       fetch_token (token, regexp, syntax);
2401       return tree;
2402
2403     case OP_PERIOD:
2404       tree = create_token_tree (dfa, NULL, NULL, token);
2405       if (__glibc_unlikely (tree == NULL))
2406         {
2407           *err = REG_ESPACE;
2408           return NULL;
2409         }
2410       if (dfa->mb_cur_max > 1)
2411         dfa->has_mb_node = 1;
2412       break;
2413
2414     case OP_WORD:
2415     case OP_NOTWORD:
2416       tree = build_charclass_op (dfa, regexp->trans,
2417                                  "alnum",
2418                                  "_",
2419                                  token->type == OP_NOTWORD, err);
2420       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2421         return NULL;
2422       break;
2423
2424     case OP_SPACE:
2425     case OP_NOTSPACE:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427                                  "space",
2428                                  "",
2429                                  token->type == OP_NOTSPACE, err);
2430       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2431         return NULL;
2432       break;
2433
2434     case OP_ALT:
2435     case END_OF_RE:
2436       return NULL;
2437
2438     case BACK_SLASH:
2439       *err = REG_EESCAPE;
2440       return NULL;
2441
2442     default:
2443       /* Must not happen?  */
2444       DEBUG_ASSERT (false);
2445       return NULL;
2446     }
2447   fetch_token (token, regexp, syntax);
2448
2449   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2450          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2451     {
2452       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2453                                            syntax, err);
2454       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2455         {
2456           if (tree != NULL)
2457             postorder (tree, free_tree, NULL);
2458           return NULL;
2459         }
2460       tree = dup_tree;
2461       /* In BRE consecutive duplications are not allowed.  */
2462       if ((syntax & RE_CONTEXT_INVALID_DUP)
2463           && (token->type == OP_DUP_ASTERISK
2464               || token->type == OP_OPEN_DUP_NUM))
2465         {
2466           if (tree != NULL)
2467             postorder (tree, free_tree, NULL);
2468           *err = REG_BADRPT;
2469           return NULL;
2470         }
2471     }
2472
2473   return tree;
2474 }
2475
2476 /* This function build the following tree, from regular expression
2477    (<reg_exp>):
2478          SUBEXP
2479             |
2480         <reg_exp>
2481 */
2482
2483 static bin_tree_t *
2484 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2485                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2486 {
2487   re_dfa_t *dfa = preg->buffer;
2488   bin_tree_t *tree;
2489   size_t cur_nsub;
2490   cur_nsub = preg->re_nsub++;
2491
2492   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2493
2494   /* The subexpression may be a null string.  */
2495   if (token->type == OP_CLOSE_SUBEXP)
2496     tree = NULL;
2497   else
2498     {
2499       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2500       if (__glibc_unlikely (*err == REG_NOERROR
2501                             && token->type != OP_CLOSE_SUBEXP))
2502         {
2503           if (tree != NULL)
2504             postorder (tree, free_tree, NULL);
2505           *err = REG_EPAREN;
2506         }
2507       if (__glibc_unlikely (*err != REG_NOERROR))
2508         return NULL;
2509     }
2510
2511   if (cur_nsub <= '9' - '1')
2512     dfa->completed_bkref_map |= 1 << cur_nsub;
2513
2514   tree = create_tree (dfa, tree, NULL, SUBEXP);
2515   if (__glibc_unlikely (tree == NULL))
2516     {
2517       *err = REG_ESPACE;
2518       return NULL;
2519     }
2520   tree->token.opr.idx = cur_nsub;
2521   return tree;
2522 }
2523
2524 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2525
2526 static bin_tree_t *
2527 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2528               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2529 {
2530   bin_tree_t *tree = NULL, *old_tree = NULL;
2531   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2532   re_token_t start_token = *token;
2533
2534   if (token->type == OP_OPEN_DUP_NUM)
2535     {
2536       end = 0;
2537       start = fetch_number (regexp, token, syntax);
2538       if (start == -1)
2539         {
2540           if (token->type == CHARACTER && token->opr.c == ',')
2541             start = 0; /* We treat "{,m}" as "{0,m}".  */
2542           else
2543             {
2544               *err = REG_BADBR; /* <re>{} is invalid.  */
2545               return NULL;
2546             }
2547         }
2548       if (__glibc_likely (start != -2))
2549         {
2550           /* We treat "{n}" as "{n,n}".  */
2551           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2552                  : ((token->type == CHARACTER && token->opr.c == ',')
2553                     ? fetch_number (regexp, token, syntax) : -2));
2554         }
2555       if (__glibc_unlikely (start == -2 || end == -2))
2556         {
2557           /* Invalid sequence.  */
2558           if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2559             {
2560               if (token->type == END_OF_RE)
2561                 *err = REG_EBRACE;
2562               else
2563                 *err = REG_BADBR;
2564
2565               return NULL;
2566             }
2567
2568           /* If the syntax bit is set, rollback.  */
2569           re_string_set_index (regexp, start_idx);
2570           *token = start_token;
2571           token->type = CHARACTER;
2572           /* mb_partial and word_char bits should be already initialized by
2573              peek_token.  */
2574           return elem;
2575         }
2576
2577       if (__glibc_unlikely ((end != -1 && start > end)
2578                             || token->type != OP_CLOSE_DUP_NUM))
2579         {
2580           /* First number greater than second.  */
2581           *err = REG_BADBR;
2582           return NULL;
2583         }
2584
2585       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2586         {
2587           *err = REG_ESIZE;
2588           return NULL;
2589         }
2590     }
2591   else
2592     {
2593       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2594       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2595     }
2596
2597   fetch_token (token, regexp, syntax);
2598
2599   if (__glibc_unlikely (elem == NULL))
2600     return NULL;
2601   if (__glibc_unlikely (start == 0 && end == 0))
2602     {
2603       postorder (elem, free_tree, NULL);
2604       return NULL;
2605     }
2606
2607   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2608   if (__glibc_unlikely (start > 0))
2609     {
2610       tree = elem;
2611       for (i = 2; i <= start; ++i)
2612         {
2613           elem = duplicate_tree (elem, dfa);
2614           tree = create_tree (dfa, tree, elem, CONCAT);
2615           if (__glibc_unlikely (elem == NULL || tree == NULL))
2616             goto parse_dup_op_espace;
2617         }
2618
2619       if (start == end)
2620         return tree;
2621
2622       /* Duplicate ELEM before it is marked optional.  */
2623       elem = duplicate_tree (elem, dfa);
2624       if (__glibc_unlikely (elem == NULL))
2625         goto parse_dup_op_espace;
2626       old_tree = tree;
2627     }
2628   else
2629     old_tree = NULL;
2630
2631   if (elem->token.type == SUBEXP)
2632     {
2633       uintptr_t subidx = elem->token.opr.idx;
2634       postorder (elem, mark_opt_subexp, (void *) subidx);
2635     }
2636
2637   tree = create_tree (dfa, elem, NULL,
2638                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2639   if (__glibc_unlikely (tree == NULL))
2640     goto parse_dup_op_espace;
2641
2642   /* This loop is actually executed only when end != -1,
2643      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2644      already created the start+1-th copy.  */
2645   if (TYPE_SIGNED (Idx) || end != -1)
2646     for (i = start + 2; i <= end; ++i)
2647       {
2648         elem = duplicate_tree (elem, dfa);
2649         tree = create_tree (dfa, tree, elem, CONCAT);
2650         if (__glibc_unlikely (elem == NULL || tree == NULL))
2651           goto parse_dup_op_espace;
2652
2653         tree = create_tree (dfa, tree, NULL, OP_ALT);
2654         if (__glibc_unlikely (tree == NULL))
2655           goto parse_dup_op_espace;
2656       }
2657
2658   if (old_tree)
2659     tree = create_tree (dfa, old_tree, tree, CONCAT);
2660
2661   return tree;
2662
2663  parse_dup_op_espace:
2664   *err = REG_ESPACE;
2665   return NULL;
2666 }
2667
2668 /* Size of the names for collating symbol/equivalence_class/character_class.
2669    I'm not sure, but maybe enough.  */
2670 #define BRACKET_NAME_BUF_SIZE 32
2671
2672 #ifndef _LIBC
2673
2674 # ifdef RE_ENABLE_I18N
2675 /* Convert the byte B to the corresponding wide character.  In a
2676    unibyte locale, treat B as itself.  In a multibyte locale, return
2677    WEOF if B is an encoding error.  */
2678 static wint_t
2679 parse_byte (unsigned char b, re_charset_t *mbcset)
2680 {
2681   return mbcset == NULL ? b : __btowc (b);
2682 }
2683 # endif
2684
2685   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2686      Build the range expression which starts from START_ELEM, and ends
2687      at END_ELEM.  The result are written to MBCSET and SBCSET.
2688      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2689      mbcset->range_ends, is a pointer argument since we may
2690      update it.  */
2691
2692 static reg_errcode_t
2693 # ifdef RE_ENABLE_I18N
2694 build_range_exp (const reg_syntax_t syntax,
2695                  bitset_t sbcset,
2696                  re_charset_t *mbcset,
2697                  Idx *range_alloc,
2698                  const bracket_elem_t *start_elem,
2699                  const bracket_elem_t *end_elem)
2700 # else /* not RE_ENABLE_I18N */
2701 build_range_exp (const reg_syntax_t syntax,
2702                  bitset_t sbcset,
2703                  const bracket_elem_t *start_elem,
2704                  const bracket_elem_t *end_elem)
2705 # endif /* not RE_ENABLE_I18N */
2706 {
2707   unsigned int start_ch, end_ch;
2708   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2709   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2710                         || start_elem->type == CHAR_CLASS
2711                         || end_elem->type == EQUIV_CLASS
2712                         || end_elem->type == CHAR_CLASS))
2713     return REG_ERANGE;
2714
2715   /* We can handle no multi character collating elements without libc
2716      support.  */
2717   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2718                          && strlen ((char *) start_elem->opr.name) > 1)
2719                         || (end_elem->type == COLL_SYM
2720                             && strlen ((char *) end_elem->opr.name) > 1)))
2721     return REG_ECOLLATE;
2722
2723 # ifdef RE_ENABLE_I18N
2724   {
2725     wchar_t wc;
2726     wint_t start_wc;
2727     wint_t end_wc;
2728
2729     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2730                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2731                    : 0));
2732     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2733               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2734                  : 0));
2735     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2736                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2737     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2738               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2739     if (start_wc == WEOF || end_wc == WEOF)
2740       return REG_ECOLLATE;
2741     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2742                                && start_wc > end_wc))
2743       return REG_ERANGE;
2744
2745     /* Got valid collation sequence values, add them as a new entry.
2746        However, for !_LIBC we have no collation elements: if the
2747        character set is single byte, the single byte character set
2748        that we build below suffices.  parse_bracket_exp passes
2749        no MBCSET if dfa->mb_cur_max == 1.  */
2750     if (mbcset)
2751       {
2752         /* Check the space of the arrays.  */
2753         if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2754           {
2755             /* There is not enough space, need realloc.  */
2756             wchar_t *new_array_start, *new_array_end;
2757             Idx new_nranges;
2758
2759             /* +1 in case of mbcset->nranges is 0.  */
2760             new_nranges = 2 * mbcset->nranges + 1;
2761             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2762                are NULL if *range_alloc == 0.  */
2763             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2764                                           new_nranges);
2765             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2766                                         new_nranges);
2767
2768             if (__glibc_unlikely (new_array_start == NULL
2769                                   || new_array_end == NULL))
2770               {
2771                 re_free (new_array_start);
2772                 re_free (new_array_end);
2773                 return REG_ESPACE;
2774               }
2775
2776             mbcset->range_starts = new_array_start;
2777             mbcset->range_ends = new_array_end;
2778             *range_alloc = new_nranges;
2779           }
2780
2781         mbcset->range_starts[mbcset->nranges] = start_wc;
2782         mbcset->range_ends[mbcset->nranges++] = end_wc;
2783       }
2784
2785     /* Build the table for single byte characters.  */
2786     for (wc = 0; wc < SBC_MAX; ++wc)
2787       {
2788         if (start_wc <= wc && wc <= end_wc)
2789           bitset_set (sbcset, wc);
2790       }
2791   }
2792 # else /* not RE_ENABLE_I18N */
2793   {
2794     unsigned int ch;
2795     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2796                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2797                    : 0));
2798     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2799               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2800                  : 0));
2801     if (start_ch > end_ch)
2802       return REG_ERANGE;
2803     /* Build the table for single byte characters.  */
2804     for (ch = 0; ch < SBC_MAX; ++ch)
2805       if (start_ch <= ch  && ch <= end_ch)
2806         bitset_set (sbcset, ch);
2807   }
2808 # endif /* not RE_ENABLE_I18N */
2809   return REG_NOERROR;
2810 }
2811 #endif /* not _LIBC */
2812
2813 #ifndef _LIBC
2814 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2815    Build the collating element which is represented by NAME.
2816    The result are written to MBCSET and SBCSET.
2817    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2818    pointer argument since we may update it.  */
2819
2820 static reg_errcode_t
2821 # ifdef RE_ENABLE_I18N
2822 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2823                         Idx *coll_sym_alloc, const unsigned char *name)
2824 # else /* not RE_ENABLE_I18N */
2825 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2826 # endif /* not RE_ENABLE_I18N */
2827 {
2828   size_t name_len = strlen ((const char *) name);
2829   if (__glibc_unlikely (name_len != 1))
2830     return REG_ECOLLATE;
2831   else
2832     {
2833       bitset_set (sbcset, name[0]);
2834       return REG_NOERROR;
2835     }
2836 }
2837 #endif /* not _LIBC */
2838
2839 /* This function parse bracket expression like "[abc]", "[a-c]",
2840    "[[.a-a.]]" etc.  */
2841
2842 static bin_tree_t *
2843 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2844                    reg_syntax_t syntax, reg_errcode_t *err)
2845 {
2846 #ifdef _LIBC
2847   const unsigned char *collseqmb;
2848   const char *collseqwc;
2849   uint32_t nrules;
2850   int32_t table_size;
2851   const int32_t *symb_table;
2852   const unsigned char *extra;
2853
2854   /* Local function for parse_bracket_exp used in _LIBC environment.
2855      Seek the collating symbol entry corresponding to NAME.
2856      Return the index of the symbol in the SYMB_TABLE,
2857      or -1 if not found.  */
2858
2859   auto inline int32_t
2860   __attribute__ ((always_inline))
2861   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2862     {
2863       int32_t elem;
2864
2865       for (elem = 0; elem < table_size; elem++)
2866         if (symb_table[2 * elem] != 0)
2867           {
2868             int32_t idx = symb_table[2 * elem + 1];
2869             /* Skip the name of collating element name.  */
2870             idx += 1 + extra[idx];
2871             if (/* Compare the length of the name.  */
2872                 name_len == extra[idx]
2873                 /* Compare the name.  */
2874                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2875               /* Yep, this is the entry.  */
2876               return elem;
2877           }
2878       return -1;
2879     }
2880
2881   /* Local function for parse_bracket_exp used in _LIBC environment.
2882      Look up the collation sequence value of BR_ELEM.
2883      Return the value if succeeded, UINT_MAX otherwise.  */
2884
2885   auto inline unsigned int
2886   __attribute__ ((always_inline))
2887   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2888     {
2889       if (br_elem->type == SB_CHAR)
2890         {
2891           /*
2892           if (MB_CUR_MAX == 1)
2893           */
2894           if (nrules == 0)
2895             return collseqmb[br_elem->opr.ch];
2896           else
2897             {
2898               wint_t wc = __btowc (br_elem->opr.ch);
2899               return __collseq_table_lookup (collseqwc, wc);
2900             }
2901         }
2902       else if (br_elem->type == MB_CHAR)
2903         {
2904           if (nrules != 0)
2905             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2906         }
2907       else if (br_elem->type == COLL_SYM)
2908         {
2909           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2910           if (nrules != 0)
2911             {
2912               int32_t elem, idx;
2913               elem = seek_collating_symbol_entry (br_elem->opr.name,
2914                                                   sym_name_len);
2915               if (elem != -1)
2916                 {
2917                   /* We found the entry.  */
2918                   idx = symb_table[2 * elem + 1];
2919                   /* Skip the name of collating element name.  */
2920                   idx += 1 + extra[idx];
2921                   /* Skip the byte sequence of the collating element.  */
2922                   idx += 1 + extra[idx];
2923                   /* Adjust for the alignment.  */
2924                   idx = (idx + 3) & ~3;
2925                   /* Skip the multibyte collation sequence value.  */
2926                   idx += sizeof (unsigned int);
2927                   /* Skip the wide char sequence of the collating element.  */
2928                   idx += sizeof (unsigned int) *
2929                     (1 + *(unsigned int *) (extra + idx));
2930                   /* Return the collation sequence value.  */
2931                   return *(unsigned int *) (extra + idx);
2932                 }
2933               else if (sym_name_len == 1)
2934                 {
2935                   /* No valid character.  Match it as a single byte
2936                      character.  */
2937                   return collseqmb[br_elem->opr.name[0]];
2938                 }
2939             }
2940           else if (sym_name_len == 1)
2941             return collseqmb[br_elem->opr.name[0]];
2942         }
2943       return UINT_MAX;
2944     }
2945
2946   /* Local function for parse_bracket_exp used in _LIBC environment.
2947      Build the range expression which starts from START_ELEM, and ends
2948      at END_ELEM.  The result are written to MBCSET and SBCSET.
2949      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2950      mbcset->range_ends, is a pointer argument since we may
2951      update it.  */
2952
2953   auto inline reg_errcode_t
2954   __attribute__ ((always_inline))
2955   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2956                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2957     {
2958       unsigned int ch;
2959       uint32_t start_collseq;
2960       uint32_t end_collseq;
2961
2962       /* Equivalence Classes and Character Classes can't be a range
2963          start/end.  */
2964       if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2965                             || start_elem->type == CHAR_CLASS
2966                             || end_elem->type == EQUIV_CLASS
2967                             || end_elem->type == CHAR_CLASS))
2968         return REG_ERANGE;
2969
2970       /* FIXME: Implement rational ranges here, too.  */
2971       start_collseq = lookup_collation_sequence_value (start_elem);
2972       end_collseq = lookup_collation_sequence_value (end_elem);
2973       /* Check start/end collation sequence values.  */
2974       if (__glibc_unlikely (start_collseq == UINT_MAX
2975                             || end_collseq == UINT_MAX))
2976         return REG_ECOLLATE;
2977       if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2978                             && start_collseq > end_collseq))
2979         return REG_ERANGE;
2980
2981       /* Got valid collation sequence values, add them as a new entry.
2982          However, if we have no collation elements, and the character set
2983          is single byte, the single byte character set that we
2984          build below suffices. */
2985       if (nrules > 0 || dfa->mb_cur_max > 1)
2986         {
2987           /* Check the space of the arrays.  */
2988           if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2989             {
2990               /* There is not enough space, need realloc.  */
2991               uint32_t *new_array_start;
2992               uint32_t *new_array_end;
2993               Idx new_nranges;
2994
2995               /* +1 in case of mbcset->nranges is 0.  */
2996               new_nranges = 2 * mbcset->nranges + 1;
2997               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2998                                             new_nranges);
2999               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3000                                           new_nranges);
3001
3002               if (__glibc_unlikely (new_array_start == NULL
3003                                     || new_array_end == NULL))
3004                 return REG_ESPACE;
3005
3006               mbcset->range_starts = new_array_start;
3007               mbcset->range_ends = new_array_end;
3008               *range_alloc = new_nranges;
3009             }
3010
3011           mbcset->range_starts[mbcset->nranges] = start_collseq;
3012           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3013         }
3014
3015       /* Build the table for single byte characters.  */
3016       for (ch = 0; ch < SBC_MAX; ch++)
3017         {
3018           uint32_t ch_collseq;
3019           /*
3020           if (MB_CUR_MAX == 1)
3021           */
3022           if (nrules == 0)
3023             ch_collseq = collseqmb[ch];
3024           else
3025             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3026           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3027             bitset_set (sbcset, ch);
3028         }
3029       return REG_NOERROR;
3030     }
3031
3032   /* Local function for parse_bracket_exp used in _LIBC environment.
3033      Build the collating element which is represented by NAME.
3034      The result are written to MBCSET and SBCSET.
3035      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3036      pointer argument since we may update it.  */
3037
3038   auto inline reg_errcode_t
3039   __attribute__ ((always_inline))
3040   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3041                           Idx *coll_sym_alloc, const unsigned char *name)
3042     {
3043       int32_t elem, idx;
3044       size_t name_len = strlen ((const char *) name);
3045       if (nrules != 0)
3046         {
3047           elem = seek_collating_symbol_entry (name, name_len);
3048           if (elem != -1)
3049             {
3050               /* We found the entry.  */
3051               idx = symb_table[2 * elem + 1];
3052               /* Skip the name of collating element name.  */
3053               idx += 1 + extra[idx];
3054             }
3055           else if (name_len == 1)
3056             {
3057               /* No valid character, treat it as a normal
3058                  character.  */
3059               bitset_set (sbcset, name[0]);
3060               return REG_NOERROR;
3061             }
3062           else
3063             return REG_ECOLLATE;
3064
3065           /* Got valid collation sequence, add it as a new entry.  */
3066           /* Check the space of the arrays.  */
3067           if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3068             {
3069               /* Not enough, realloc it.  */
3070               /* +1 in case of mbcset->ncoll_syms is 0.  */
3071               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3072               /* Use realloc since mbcset->coll_syms is NULL
3073                  if *alloc == 0.  */
3074               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3075                                                    new_coll_sym_alloc);
3076               if (__glibc_unlikely (new_coll_syms == NULL))
3077                 return REG_ESPACE;
3078               mbcset->coll_syms = new_coll_syms;
3079               *coll_sym_alloc = new_coll_sym_alloc;
3080             }
3081           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3082           return REG_NOERROR;
3083         }
3084       else
3085         {
3086           if (__glibc_unlikely (name_len != 1))
3087             return REG_ECOLLATE;
3088           else
3089             {
3090               bitset_set (sbcset, name[0]);
3091               return REG_NOERROR;
3092             }
3093         }
3094     }
3095 #endif
3096
3097   re_token_t br_token;
3098   re_bitset_ptr_t sbcset;
3099 #ifdef RE_ENABLE_I18N
3100   re_charset_t *mbcset;
3101   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3102   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3103 #endif /* not RE_ENABLE_I18N */
3104   bool non_match = false;
3105   bin_tree_t *work_tree;
3106   int token_len;
3107   bool first_round = true;
3108 #ifdef _LIBC
3109   collseqmb = (const unsigned char *)
3110     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3111   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3112   if (nrules)
3113     {
3114       /*
3115       if (MB_CUR_MAX > 1)
3116       */
3117       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3118       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3119       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3120                                                   _NL_COLLATE_SYMB_TABLEMB);
3121       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3122                                                    _NL_COLLATE_SYMB_EXTRAMB);
3123     }
3124 #endif
3125   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3126 #ifdef RE_ENABLE_I18N
3127   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3128 #endif /* RE_ENABLE_I18N */
3129 #ifdef RE_ENABLE_I18N
3130   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3131 #else
3132   if (__glibc_unlikely (sbcset == NULL))
3133 #endif /* RE_ENABLE_I18N */
3134     {
3135       re_free (sbcset);
3136 #ifdef RE_ENABLE_I18N
3137       re_free (mbcset);
3138 #endif
3139       *err = REG_ESPACE;
3140       return NULL;
3141     }
3142
3143   token_len = peek_token_bracket (token, regexp, syntax);
3144   if (__glibc_unlikely (token->type == END_OF_RE))
3145     {
3146       *err = REG_BADPAT;
3147       goto parse_bracket_exp_free_return;
3148     }
3149   if (token->type == OP_NON_MATCH_LIST)
3150     {
3151 #ifdef RE_ENABLE_I18N
3152       mbcset->non_match = 1;
3153 #endif /* not RE_ENABLE_I18N */
3154       non_match = true;
3155       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3156         bitset_set (sbcset, '\n');
3157       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3158       token_len = peek_token_bracket (token, regexp, syntax);
3159       if (__glibc_unlikely (token->type == END_OF_RE))
3160         {
3161           *err = REG_BADPAT;
3162           goto parse_bracket_exp_free_return;
3163         }
3164     }
3165
3166   /* We treat the first ']' as a normal character.  */
3167   if (token->type == OP_CLOSE_BRACKET)
3168     token->type = CHARACTER;
3169
3170   while (1)
3171     {
3172       bracket_elem_t start_elem, end_elem;
3173       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3174       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3175       reg_errcode_t ret;
3176       int token_len2 = 0;
3177       bool is_range_exp = false;
3178       re_token_t token2;
3179
3180       start_elem.opr.name = start_name_buf;
3181       start_elem.type = COLL_SYM;
3182       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3183                                    syntax, first_round);
3184       if (__glibc_unlikely (ret != REG_NOERROR))
3185         {
3186           *err = ret;
3187           goto parse_bracket_exp_free_return;
3188         }
3189       first_round = false;
3190
3191       /* Get information about the next token.  We need it in any case.  */
3192       token_len = peek_token_bracket (token, regexp, syntax);
3193
3194       /* Do not check for ranges if we know they are not allowed.  */
3195       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3196         {
3197           if (__glibc_unlikely (token->type == END_OF_RE))
3198             {
3199               *err = REG_EBRACK;
3200               goto parse_bracket_exp_free_return;
3201             }
3202           if (token->type == OP_CHARSET_RANGE)
3203             {
3204               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3205               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3206               if (__glibc_unlikely (token2.type == END_OF_RE))
3207                 {
3208                   *err = REG_EBRACK;
3209                   goto parse_bracket_exp_free_return;
3210                 }
3211               if (token2.type == OP_CLOSE_BRACKET)
3212                 {
3213                   /* We treat the last '-' as a normal character.  */
3214                   re_string_skip_bytes (regexp, -token_len);
3215                   token->type = CHARACTER;
3216                 }
3217               else
3218                 is_range_exp = true;
3219             }
3220         }
3221
3222       if (is_range_exp == true)
3223         {
3224           end_elem.opr.name = end_name_buf;
3225           end_elem.type = COLL_SYM;
3226           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3227                                        dfa, syntax, true);
3228           if (__glibc_unlikely (ret != REG_NOERROR))
3229             {
3230               *err = ret;
3231               goto parse_bracket_exp_free_return;
3232             }
3233
3234           token_len = peek_token_bracket (token, regexp, syntax);
3235
3236 #ifdef _LIBC
3237           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3238                                   &start_elem, &end_elem);
3239 #else
3240 # ifdef RE_ENABLE_I18N
3241           *err = build_range_exp (syntax, sbcset,
3242                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3243                                   &range_alloc, &start_elem, &end_elem);
3244 # else
3245           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3246 # endif
3247 #endif /* RE_ENABLE_I18N */
3248           if (__glibc_unlikely (*err != REG_NOERROR))
3249             goto parse_bracket_exp_free_return;
3250         }
3251       else
3252         {
3253           switch (start_elem.type)
3254             {
3255             case SB_CHAR:
3256               bitset_set (sbcset, start_elem.opr.ch);
3257               break;
3258 #ifdef RE_ENABLE_I18N
3259             case MB_CHAR:
3260               /* Check whether the array has enough space.  */
3261               if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3262                 {
3263                   wchar_t *new_mbchars;
3264                   /* Not enough, realloc it.  */
3265                   /* +1 in case of mbcset->nmbchars is 0.  */
3266                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3267                   /* Use realloc since array is NULL if *alloc == 0.  */
3268                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3269                                             mbchar_alloc);
3270                   if (__glibc_unlikely (new_mbchars == NULL))
3271                     goto parse_bracket_exp_espace;
3272                   mbcset->mbchars = new_mbchars;
3273                 }
3274               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3275               break;
3276 #endif /* RE_ENABLE_I18N */
3277             case EQUIV_CLASS:
3278               *err = build_equiv_class (sbcset,
3279 #ifdef RE_ENABLE_I18N
3280                                         mbcset, &equiv_class_alloc,
3281 #endif /* RE_ENABLE_I18N */
3282                                         start_elem.opr.name);
3283               if (__glibc_unlikely (*err != REG_NOERROR))
3284                 goto parse_bracket_exp_free_return;
3285               break;
3286             case COLL_SYM:
3287               *err = build_collating_symbol (sbcset,
3288 #ifdef RE_ENABLE_I18N
3289                                              mbcset, &coll_sym_alloc,
3290 #endif /* RE_ENABLE_I18N */
3291                                              start_elem.opr.name);
3292               if (__glibc_unlikely (*err != REG_NOERROR))
3293                 goto parse_bracket_exp_free_return;
3294               break;
3295             case CHAR_CLASS:
3296               *err = build_charclass (regexp->trans, sbcset,
3297 #ifdef RE_ENABLE_I18N
3298                                       mbcset, &char_class_alloc,
3299 #endif /* RE_ENABLE_I18N */
3300                                       (const char *) start_elem.opr.name,
3301                                       syntax);
3302               if (__glibc_unlikely (*err != REG_NOERROR))
3303                goto parse_bracket_exp_free_return;
3304               break;
3305             default:
3306               DEBUG_ASSERT (false);
3307               break;
3308             }
3309         }
3310       if (__glibc_unlikely (token->type == END_OF_RE))
3311         {
3312           *err = REG_EBRACK;
3313           goto parse_bracket_exp_free_return;
3314         }
3315       if (token->type == OP_CLOSE_BRACKET)
3316         break;
3317     }
3318
3319   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3320
3321   /* If it is non-matching list.  */
3322   if (non_match)
3323     bitset_not (sbcset);
3324
3325 #ifdef RE_ENABLE_I18N
3326   /* Ensure only single byte characters are set.  */
3327   if (dfa->mb_cur_max > 1)
3328     bitset_mask (sbcset, dfa->sb_char);
3329
3330   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3331       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3332                                                      || mbcset->non_match)))
3333     {
3334       bin_tree_t *mbc_tree;
3335       int sbc_idx;
3336       /* Build a tree for complex bracket.  */
3337       dfa->has_mb_node = 1;
3338       br_token.type = COMPLEX_BRACKET;
3339       br_token.opr.mbcset = mbcset;
3340       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341       if (__glibc_unlikely (mbc_tree == NULL))
3342         goto parse_bracket_exp_espace;
3343       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3344         if (sbcset[sbc_idx])
3345           break;
3346       /* If there are no bits set in sbcset, there is no point
3347          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3348       if (sbc_idx < BITSET_WORDS)
3349         {
3350           /* Build a tree for simple bracket.  */
3351           br_token.type = SIMPLE_BRACKET;
3352           br_token.opr.sbcset = sbcset;
3353           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354           if (__glibc_unlikely (work_tree == NULL))
3355             goto parse_bracket_exp_espace;
3356
3357           /* Then join them by ALT node.  */
3358           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3359           if (__glibc_unlikely (work_tree == NULL))
3360             goto parse_bracket_exp_espace;
3361         }
3362       else
3363         {
3364           re_free (sbcset);
3365           work_tree = mbc_tree;
3366         }
3367     }
3368   else
3369 #endif /* not RE_ENABLE_I18N */
3370     {
3371 #ifdef RE_ENABLE_I18N
3372       free_charset (mbcset);
3373 #endif
3374       /* Build a tree for simple bracket.  */
3375       br_token.type = SIMPLE_BRACKET;
3376       br_token.opr.sbcset = sbcset;
3377       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3378       if (__glibc_unlikely (work_tree == NULL))
3379         goto parse_bracket_exp_espace;
3380     }
3381   return work_tree;
3382
3383  parse_bracket_exp_espace:
3384   *err = REG_ESPACE;
3385  parse_bracket_exp_free_return:
3386   re_free (sbcset);
3387 #ifdef RE_ENABLE_I18N
3388   free_charset (mbcset);
3389 #endif /* RE_ENABLE_I18N */
3390   return NULL;
3391 }
3392
3393 /* Parse an element in the bracket expression.  */
3394
3395 static reg_errcode_t
3396 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3397                        re_token_t *token, int token_len, re_dfa_t *dfa,
3398                        reg_syntax_t syntax, bool accept_hyphen)
3399 {
3400 #ifdef RE_ENABLE_I18N
3401   int cur_char_size;
3402   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3403   if (cur_char_size > 1)
3404     {
3405       elem->type = MB_CHAR;
3406       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3407       re_string_skip_bytes (regexp, cur_char_size);
3408       return REG_NOERROR;
3409     }
3410 #endif /* RE_ENABLE_I18N */
3411   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3412   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3413       || token->type == OP_OPEN_EQUIV_CLASS)
3414     return parse_bracket_symbol (elem, regexp, token);
3415   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3416     {
3417       /* A '-' must only appear as anything but a range indicator before
3418          the closing bracket.  Everything else is an error.  */
3419       re_token_t token2;
3420       (void) peek_token_bracket (&token2, regexp, syntax);
3421       if (token2.type != OP_CLOSE_BRACKET)
3422         /* The actual error value is not standardized since this whole
3423            case is undefined.  But ERANGE makes good sense.  */
3424         return REG_ERANGE;
3425     }
3426   elem->type = SB_CHAR;
3427   elem->opr.ch = token->opr.c;
3428   return REG_NOERROR;
3429 }
3430
3431 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3432    such as [:<character_class>:], [.<collating_element>.], and
3433    [=<equivalent_class>=].  */
3434
3435 static reg_errcode_t
3436 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3437                       re_token_t *token)
3438 {
3439   unsigned char ch, delim = token->opr.c;
3440   int i = 0;
3441   if (re_string_eoi(regexp))
3442     return REG_EBRACK;
3443   for (;; ++i)
3444     {
3445       if (i >= BRACKET_NAME_BUF_SIZE)
3446         return REG_EBRACK;
3447       if (token->type == OP_OPEN_CHAR_CLASS)
3448         ch = re_string_fetch_byte_case (regexp);
3449       else
3450         ch = re_string_fetch_byte (regexp);
3451       if (re_string_eoi(regexp))
3452         return REG_EBRACK;
3453       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3454         break;
3455       elem->opr.name[i] = ch;
3456     }
3457   re_string_skip_bytes (regexp, 1);
3458   elem->opr.name[i] = '\0';
3459   switch (token->type)
3460     {
3461     case OP_OPEN_COLL_ELEM:
3462       elem->type = COLL_SYM;
3463       break;
3464     case OP_OPEN_EQUIV_CLASS:
3465       elem->type = EQUIV_CLASS;
3466       break;
3467     case OP_OPEN_CHAR_CLASS:
3468       elem->type = CHAR_CLASS;
3469       break;
3470     default:
3471       break;
3472     }
3473   return REG_NOERROR;
3474 }
3475
3476   /* Helper function for parse_bracket_exp.
3477      Build the equivalence class which is represented by NAME.
3478      The result are written to MBCSET and SBCSET.
3479      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3480      is a pointer argument since we may update it.  */
3481
3482 static reg_errcode_t
3483 #ifdef RE_ENABLE_I18N
3484 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3485                    Idx *equiv_class_alloc, const unsigned char *name)
3486 #else /* not RE_ENABLE_I18N */
3487 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3488 #endif /* not RE_ENABLE_I18N */
3489 {
3490 #ifdef _LIBC
3491   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3492   if (nrules != 0)
3493     {
3494       const int32_t *table, *indirect;
3495       const unsigned char *weights, *extra, *cp;
3496       unsigned char char_buf[2];
3497       int32_t idx1, idx2;
3498       unsigned int ch;
3499       size_t len;
3500       /* Calculate the index for equivalence class.  */
3501       cp = name;
3502       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3503       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504                                                _NL_COLLATE_WEIGHTMB);
3505       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3506                                                    _NL_COLLATE_EXTRAMB);
3507       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3508                                                 _NL_COLLATE_INDIRECTMB);
3509       idx1 = findidx (table, indirect, extra, &cp, -1);
3510       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3511         /* This isn't a valid character.  */
3512         return REG_ECOLLATE;
3513
3514       /* Build single byte matching table for this equivalence class.  */
3515       len = weights[idx1 & 0xffffff];
3516       for (ch = 0; ch < SBC_MAX; ++ch)
3517         {
3518           char_buf[0] = ch;
3519           cp = char_buf;
3520           idx2 = findidx (table, indirect, extra, &cp, 1);
3521 /*
3522           idx2 = table[ch];
3523 */
3524           if (idx2 == 0)
3525             /* This isn't a valid character.  */
3526             continue;
3527           /* Compare only if the length matches and the collation rule
3528              index is the same.  */
3529           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3530               && memcmp (weights + (idx1 & 0xffffff) + 1,
3531                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3532             bitset_set (sbcset, ch);
3533         }
3534       /* Check whether the array has enough space.  */
3535       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3536         {
3537           /* Not enough, realloc it.  */
3538           /* +1 in case of mbcset->nequiv_classes is 0.  */
3539           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3540           /* Use realloc since the array is NULL if *alloc == 0.  */
3541           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3542                                                    int32_t,
3543                                                    new_equiv_class_alloc);
3544           if (__glibc_unlikely (new_equiv_classes == NULL))
3545             return REG_ESPACE;
3546           mbcset->equiv_classes = new_equiv_classes;
3547           *equiv_class_alloc = new_equiv_class_alloc;
3548         }
3549       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3550     }
3551   else
3552 #endif /* _LIBC */
3553     {
3554       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3555         return REG_ECOLLATE;
3556       bitset_set (sbcset, *name);
3557     }
3558   return REG_NOERROR;
3559 }
3560
3561   /* Helper function for parse_bracket_exp.
3562      Build the character class which is represented by NAME.
3563      The result are written to MBCSET and SBCSET.
3564      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3565      is a pointer argument since we may update it.  */
3566
3567 static reg_errcode_t
3568 #ifdef RE_ENABLE_I18N
3569 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3570                  re_charset_t *mbcset, Idx *char_class_alloc,
3571                  const char *class_name, reg_syntax_t syntax)
3572 #else /* not RE_ENABLE_I18N */
3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574                  const char *class_name, reg_syntax_t syntax)
3575 #endif /* not RE_ENABLE_I18N */
3576 {
3577   int i;
3578   const char *name = class_name;
3579
3580   /* In case of REG_ICASE "upper" and "lower" match the both of
3581      upper and lower cases.  */
3582   if ((syntax & RE_ICASE)
3583       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3584     name = "alpha";
3585
3586 #ifdef RE_ENABLE_I18N
3587   /* Check the space of the arrays.  */
3588   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3589     {
3590       /* Not enough, realloc it.  */
3591       /* +1 in case of mbcset->nchar_classes is 0.  */
3592       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3593       /* Use realloc since array is NULL if *alloc == 0.  */
3594       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3595                                                new_char_class_alloc);
3596       if (__glibc_unlikely (new_char_classes == NULL))
3597         return REG_ESPACE;
3598       mbcset->char_classes = new_char_classes;
3599       *char_class_alloc = new_char_class_alloc;
3600     }
3601   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3602 #endif /* RE_ENABLE_I18N */
3603
3604 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3605   do {                                          \
3606     if (__glibc_unlikely (trans != NULL))                       \
3607       {                                         \
3608         for (i = 0; i < SBC_MAX; ++i)           \
3609           if (ctype_func (i))                   \
3610             bitset_set (sbcset, trans[i]);      \
3611       }                                         \
3612     else                                        \
3613       {                                         \
3614         for (i = 0; i < SBC_MAX; ++i)           \
3615           if (ctype_func (i))                   \
3616             bitset_set (sbcset, i);             \
3617       }                                         \
3618   } while (0)
3619
3620   if (strcmp (name, "alnum") == 0)
3621     BUILD_CHARCLASS_LOOP (isalnum);
3622   else if (strcmp (name, "cntrl") == 0)
3623     BUILD_CHARCLASS_LOOP (iscntrl);
3624   else if (strcmp (name, "lower") == 0)
3625     BUILD_CHARCLASS_LOOP (islower);
3626   else if (strcmp (name, "space") == 0)
3627     BUILD_CHARCLASS_LOOP (isspace);
3628   else if (strcmp (name, "alpha") == 0)
3629     BUILD_CHARCLASS_LOOP (isalpha);
3630   else if (strcmp (name, "digit") == 0)
3631     BUILD_CHARCLASS_LOOP (isdigit);
3632   else if (strcmp (name, "print") == 0)
3633     BUILD_CHARCLASS_LOOP (isprint);
3634   else if (strcmp (name, "upper") == 0)
3635     BUILD_CHARCLASS_LOOP (isupper);
3636   else if (strcmp (name, "blank") == 0)
3637     BUILD_CHARCLASS_LOOP (isblank);
3638   else if (strcmp (name, "graph") == 0)
3639     BUILD_CHARCLASS_LOOP (isgraph);
3640   else if (strcmp (name, "punct") == 0)
3641     BUILD_CHARCLASS_LOOP (ispunct);
3642   else if (strcmp (name, "xdigit") == 0)
3643     BUILD_CHARCLASS_LOOP (isxdigit);
3644   else
3645     return REG_ECTYPE;
3646
3647   return REG_NOERROR;
3648 }
3649
3650 static bin_tree_t *
3651 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3652                     const char *class_name,
3653                     const char *extra, bool non_match,
3654                     reg_errcode_t *err)
3655 {
3656   re_bitset_ptr_t sbcset;
3657 #ifdef RE_ENABLE_I18N
3658   re_charset_t *mbcset;
3659   Idx alloc = 0;
3660 #endif /* not RE_ENABLE_I18N */
3661   reg_errcode_t ret;
3662   bin_tree_t *tree;
3663
3664   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3665   if (__glibc_unlikely (sbcset == NULL))
3666     {
3667       *err = REG_ESPACE;
3668       return NULL;
3669     }
3670 #ifdef RE_ENABLE_I18N
3671   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3672   if (__glibc_unlikely (mbcset == NULL))
3673     {
3674       re_free (sbcset);
3675       *err = REG_ESPACE;
3676       return NULL;
3677     }
3678   mbcset->non_match = non_match;
3679 #endif /* RE_ENABLE_I18N */
3680
3681   /* We don't care the syntax in this case.  */
3682   ret = build_charclass (trans, sbcset,
3683 #ifdef RE_ENABLE_I18N
3684                          mbcset, &alloc,
3685 #endif /* RE_ENABLE_I18N */
3686                          class_name, 0);
3687
3688   if (__glibc_unlikely (ret != REG_NOERROR))
3689     {
3690       re_free (sbcset);
3691 #ifdef RE_ENABLE_I18N
3692       free_charset (mbcset);
3693 #endif /* RE_ENABLE_I18N */
3694       *err = ret;
3695       return NULL;
3696     }
3697   /* \w match '_' also.  */
3698   for (; *extra; extra++)
3699     bitset_set (sbcset, *extra);
3700
3701   /* If it is non-matching list.  */
3702   if (non_match)
3703     bitset_not (sbcset);
3704
3705 #ifdef RE_ENABLE_I18N
3706   /* Ensure only single byte characters are set.  */
3707   if (dfa->mb_cur_max > 1)
3708     bitset_mask (sbcset, dfa->sb_char);
3709 #endif
3710
3711   /* Build a tree for simple bracket.  */
3712   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3713   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3714   if (__glibc_unlikely (tree == NULL))
3715     goto build_word_op_espace;
3716
3717 #ifdef RE_ENABLE_I18N
3718   if (dfa->mb_cur_max > 1)
3719     {
3720       bin_tree_t *mbc_tree;
3721       /* Build a tree for complex bracket.  */
3722       br_token.type = COMPLEX_BRACKET;
3723       br_token.opr.mbcset = mbcset;
3724       dfa->has_mb_node = 1;
3725       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726       if (__glibc_unlikely (mbc_tree == NULL))
3727         goto build_word_op_espace;
3728       /* Then join them by ALT node.  */
3729       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3730       if (__glibc_likely (mbc_tree != NULL))
3731         return tree;
3732     }
3733   else
3734     {
3735       free_charset (mbcset);
3736       return tree;
3737     }
3738 #else /* not RE_ENABLE_I18N */
3739   return tree;
3740 #endif /* not RE_ENABLE_I18N */
3741
3742  build_word_op_espace:
3743   re_free (sbcset);
3744 #ifdef RE_ENABLE_I18N
3745   free_charset (mbcset);
3746 #endif /* RE_ENABLE_I18N */
3747   *err = REG_ESPACE;
3748   return NULL;
3749 }
3750
3751 /* This is intended for the expressions like "a{1,3}".
3752    Fetch a number from 'input', and return the number.
3753    Return -1 if the number field is empty like "{,1}".
3754    Return RE_DUP_MAX + 1 if the number field is too large.
3755    Return -2 if an error occurred.  */
3756
3757 static Idx
3758 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3759 {
3760   Idx num = -1;
3761   unsigned char c;
3762   while (1)
3763     {
3764       fetch_token (token, input, syntax);
3765       c = token->opr.c;
3766       if (__glibc_unlikely (token->type == END_OF_RE))
3767         return -2;
3768       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769         break;
3770       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3771              ? -2
3772              : num == -1
3773              ? c - '0'
3774              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3775     }
3776   return num;
3777 }
3778 \f
3779 #ifdef RE_ENABLE_I18N
3780 static void
3781 free_charset (re_charset_t *cset)
3782 {
3783   re_free (cset->mbchars);
3784 # ifdef _LIBC
3785   re_free (cset->coll_syms);
3786   re_free (cset->equiv_classes);
3787 # endif
3788   re_free (cset->range_starts);
3789   re_free (cset->range_ends);
3790   re_free (cset->char_classes);
3791   re_free (cset);
3792 }
3793 #endif /* RE_ENABLE_I18N */
3794 \f
3795 /* Functions for binary tree operation.  */
3796
3797 /* Create a tree node.  */
3798
3799 static bin_tree_t *
3800 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3801              re_token_type_t type)
3802 {
3803   re_token_t t = { .type = type };
3804   return create_token_tree (dfa, left, right, &t);
3805 }
3806
3807 static bin_tree_t *
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809                    const re_token_t *token)
3810 {
3811   bin_tree_t *tree;
3812   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3813     {
3814       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816       if (storage == NULL)
3817         return NULL;
3818       storage->next = dfa->str_tree_storage;
3819       dfa->str_tree_storage = storage;
3820       dfa->str_tree_storage_idx = 0;
3821     }
3822   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824   tree->parent = NULL;
3825   tree->left = left;
3826   tree->right = right;
3827   tree->token = *token;
3828   tree->token.duplicated = 0;
3829   tree->token.opt_subexp = 0;
3830   tree->first = NULL;
3831   tree->next = NULL;
3832   tree->node_idx = -1;
3833
3834   if (left != NULL)
3835     left->parent = tree;
3836   if (right != NULL)
3837     right->parent = tree;
3838   return tree;
3839 }
3840
3841 /* Mark the tree SRC as an optional subexpression.
3842    To be called from preorder or postorder.  */
3843
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3846 {
3847   Idx idx = (uintptr_t) extra;
3848   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849     node->token.opt_subexp = 1;
3850
3851   return REG_NOERROR;
3852 }
3853
3854 /* Free the allocated memory inside NODE. */
3855
3856 static void
3857 free_token (re_token_t *node)
3858 {
3859 #ifdef RE_ENABLE_I18N
3860   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861     free_charset (node->opr.mbcset);
3862   else
3863 #endif /* RE_ENABLE_I18N */
3864     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865       re_free (node->opr.sbcset);
3866 }
3867
3868 /* Worker function for tree walking.  Free the allocated memory inside NODE
3869    and its children. */
3870
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3873 {
3874   free_token (&node->token);
3875   return REG_NOERROR;
3876 }
3877
3878
3879 /* Duplicate the node SRC, and return new node.  This is a preorder
3880    visit similar to the one implemented by the generic visitor, but
3881    we need more infrastructure to maintain two parallel trees --- so,
3882    it's easier to duplicate.  */
3883
3884 static bin_tree_t *
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886 {
3887   const bin_tree_t *node;
3888   bin_tree_t *dup_root;
3889   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891   for (node = root; ; )
3892     {
3893       /* Create a new tree and link it back to the current parent.  */
3894       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895       if (*p_new == NULL)
3896         return NULL;
3897       (*p_new)->parent = dup_node;
3898       (*p_new)->token.duplicated = 1;
3899       dup_node = *p_new;
3900
3901       /* Go to the left node, or up and to the right.  */
3902       if (node->left)
3903         {
3904           node = node->left;
3905           p_new = &dup_node->left;
3906         }
3907       else
3908         {
3909           const bin_tree_t *prev = NULL;
3910           while (node->right == prev || node->right == NULL)
3911             {
3912               prev = node;
3913               node = node->parent;
3914               dup_node = dup_node->parent;
3915               if (!node)
3916                 return dup_root;
3917             }
3918           node = node->right;
3919           p_new = &dup_node->right;
3920         }
3921     }
3922 }