posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax);
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
  132 static const char __re_error_msgid[] =
  133   {
          /* All messages are stored back to back in this one char array:
             the adjacent string literals below are concatenated, with an
             explicit "\0" separating each message from the next one (the
             last message is terminated by the array's own trailing null).
             Each *_IDX macro is the byte offset of its message, computed as
             the previous offset plus sizeof of the previous message text;
             sizeof counts the terminating null, which accounts for the
             explicit "\0" separator.  The text inside every sizeof must
             therefore stay identical to the message it measures.  */
  134 #define REG_NOERROR_IDX 0
  135     gettext_noop ("Success")    /* REG_NOERROR */
  136     "\0"
  137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
  138     gettext_noop ("No match")   /* REG_NOMATCH */
  139     "\0"
  140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
  141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
  142     "\0"
  143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
  144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
  145     "\0"
  146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
  147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
  148     "\0"
  149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
  150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
  151     "\0"
  152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
  153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
  154     "\0"
  155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
  156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
  157     "\0"
  158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
  159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
  160     "\0"
  161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
  162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
  163     "\0"
  164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
  165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
  166     "\0"
  167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
  168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
  169     "\0"
  170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
  171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
  172     "\0"
  173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
  174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
  175     "\0"
  176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
  177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
  178     "\0"
  179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
  180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
  181     "\0"
  182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
  183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  184   };
 185
  186 static const size_t __re_error_msgid_idx[] =
  187   {
          /* One entry per error code, listed in the same order as the
             messages in __re_error_msgid.  Entry N holds the offset of a
             message inside that array; callers in this file fetch the text
             as __re_error_msgid + __re_error_msgid_idx[N], where N is the
             numeric value of the error code.  regerror also uses the
             number of entries here as the count of valid error codes.  */
  188     REG_NOERROR_IDX,
  189     REG_NOMATCH_IDX,
  190     REG_BADPAT_IDX,
  191     REG_ECOLLATE_IDX,
  192     REG_ECTYPE_IDX,
  193     REG_EESCAPE_IDX,
  194     REG_ESUBREG_IDX,
  195     REG_EBRACK_IDX,
  196     REG_EPAREN_IDX,
  197     REG_EBRACE_IDX,
  198     REG_BADBR_IDX,
  199     REG_ERANGE_IDX,
  200     REG_ESPACE_IDX,
  201     REG_BADRPT_IDX,
  202     REG_EEND_IDX,
  203     REG_ESIZE_IDX,
  204     REG_ERPAREN_IDX
  205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237
  238 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
  239    also be assigned to arbitrarily: each pattern buffer stores its own
  240    syntax, so it can be changed between regex compilations.  */
  241 /* This has no initializer because initialized variables in Emacs
  242    become read-only after dumping.  */
      /* re_compile_pattern and re_comp read this value on every call and
         hand it to re_compile_internal, which records it in the 'syntax'
         field of the pattern buffer being compiled.  */
  243 reg_syntax_t re_syntax_options;
 244
 245
 246 /* Specify the precise syntax of regexps for compilation.  This provides
 247    for compatibility for various utilities which historically have
 248    different, incompatible syntaxes.
 249
 250    The argument SYNTAX is a bit mask comprised of the various bits
 251    defined in regex.h.  We return the old syntax.  */
 252
 253 reg_syntax_t
 254 re_set_syntax (reg_syntax_t syntax)
 255 {
 256   reg_syntax_t ret = re_syntax_options;
 257
 258   re_syntax_options = syntax;
 259   return ret;
 260 }
 261 weak_alias (__re_set_syntax, re_set_syntax)
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = bufp->buffer;
 267   char *fastmap = bufp->fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->fastmap_accurate = 1;
 278   return 0;
 279 }
 280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 281
 282 static inline void
 283 __attribute__ ((always_inline))
 284 re_set_fastmap (char *fastmap, bool icase, int ch)
 285 {
 286   fastmap[ch] = 1;
 287   if (icase)
 288     fastmap[tolower (ch)] = 1;
 289 }
 290
 291 /* Helper function for re_compile_fastmap.
 292    Compile fastmap for the initial_state INIT_STATE.  */
 293
 294 static void
 295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 296                          char *fastmap)
 297 {
 298   re_dfa_t *dfa = bufp->buffer;
 299   Idx node_cnt;
 300   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 301   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 302     {
 303       Idx node = init_state->nodes.elems[node_cnt];
 304       re_token_type_t type = dfa->nodes[node].type;
 305
 306       if (type == CHARACTER)
 307         {
 308           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 309 #ifdef RE_ENABLE_I18N
 310           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 311             {
 312               unsigned char buf[MB_LEN_MAX];
 313               unsigned char *p;
 314               wchar_t wc;
 315               mbstate_t state;
 316
 317               p = buf;
 318               *p++ = dfa->nodes[node].opr.c;
 319               while (++node < dfa->nodes_len
 320                      && dfa->nodes[node].type == CHARACTER
 321                      && dfa->nodes[node].mb_partial)
 322                 *p++ = dfa->nodes[node].opr.c;
 323               memset (&state, '\0', sizeof (state));
 324               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 325                              &state) == p - buf
 326                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 327                       != (size_t) -1))
 328                 re_set_fastmap (fastmap, false, buf[0]);
 329             }
 330 #endif
 331         }
 332       else if (type == SIMPLE_BRACKET)
 333         {
 334           int i, ch;
 335           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 336             {
 337               int j;
 338               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 339               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 340                 if (w & ((bitset_word_t) 1 << j))
 341                   re_set_fastmap (fastmap, icase, ch);
 342             }
 343         }
 344 #ifdef RE_ENABLE_I18N
 345       else if (type == COMPLEX_BRACKET)
 346         {
 347           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 348           Idx i;
 349
 350 # ifdef _LIBC
 351           /* See if we have to try all bytes which start multiple collation
 352              elements.
 353              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 354                   collation element, and don't catch 'b' since 'b' is
 355                   the only collation element which starts from 'b' (and
 356                   it is caught by SIMPLE_BRACKET).  */
 357               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 358                   && (cset->ncoll_syms || cset->nranges))
 359                 {
 360                   const int32_t *table = (const int32_t *)
 361                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 362                   for (i = 0; i < SBC_MAX; ++i)
 363                     if (table[i] < 0)
 364                       re_set_fastmap (fastmap, icase, i);
 365                 }
 366 # endif /* _LIBC */
 367
 368           /* See if we have to start the match at all multibyte characters,
 369              i.e. where we would not find an invalid sequence.  This only
 370              applies to multibyte character sets; for single byte character
 371              sets, the SIMPLE_BRACKET again suffices.  */
 372           if (dfa->mb_cur_max > 1
 373               && (cset->nchar_classes || cset->non_match || cset->nranges
 374 # ifdef _LIBC
 375                   || cset->nequiv_classes
 376 # endif /* _LIBC */
 377                  ))
 378             {
 379               unsigned char c = 0;
 380               do
 381                 {
 382                   mbstate_t mbs;
 383                   memset (&mbs, 0, sizeof (mbs));
 384                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 385                     re_set_fastmap (fastmap, false, (int) c);
 386                 }
 387               while (++c != 0);
 388             }
 389
 390           else
 391             {
 392               /* ... Else catch all bytes which can start the mbchars.  */
 393               for (i = 0; i < cset->nmbchars; ++i)
 394                 {
 395                   char buf[256];
 396                   mbstate_t state;
 397                   memset (&state, '\0', sizeof (state));
 398                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 399                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 400                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 401                     {
 402                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 403                           != (size_t) -1)
 404                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 405                     }
 406                 }
 407             }
 408         }
 409 #endif /* RE_ENABLE_I18N */
 410       else if (type == OP_PERIOD
 411 #ifdef RE_ENABLE_I18N
 412                || type == OP_UTF8_PERIOD
 413 #endif /* RE_ENABLE_I18N */
 414                || type == END_OF_RE)
 415         {
 416           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 417           if (type == END_OF_RE)
 418             bufp->can_be_null = 1;
 419           return;
 420         }
 421     }
 422 }
 423 \f
 424 /* Entry point for POSIX code.  */
 425 /* regcomp takes a regular expression as a string and compiles it.
 426
 427    PREG is a regex_t *.  We do not expect any fields to be initialized,
 428    since POSIX says we shouldn't.  Thus, we set
 429
 430      'buffer' to the compiled pattern;
 431      'used' to the length of the compiled pattern;
 432      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 434        RE_SYNTAX_POSIX_BASIC;
 435      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 436      'fastmap' to an allocated space for the fastmap;
 437      'fastmap_accurate' to zero;
 438      're_nsub' to the number of subexpressions in PATTERN.
 439
 440    PATTERN is the address of the pattern string.
 441
 442    CFLAGS is a series of bits which affect compilation.
 443
 444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 445      use POSIX basic syntax.
 446
 447      If REG_NEWLINE is set, then . and [^...] don't match newline.
 448      Also, regexec will try a match beginning after every newline.
 449
  450      If REG_ICASE is set, then we consider upper- and lowercase
 451      versions of letters to be equivalent when matching.
 452
 453      If REG_NOSUB is set, then when PREG is passed to regexec, that
 454      routine will report only success or failure, and nothing about the
 455      registers.
 456
 457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 458    the return codes and their meanings.)  */
 459
 460 int
 461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 462 {
 463   reg_errcode_t ret;
 464   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 465                          : RE_SYNTAX_POSIX_BASIC);
 466
 467   preg->buffer = NULL;
 468   preg->allocated = 0;
 469   preg->used = 0;
 470
 471   /* Try to allocate space for the fastmap.  */
 472   preg->fastmap = re_malloc (char, SBC_MAX);
 473   if (__glibc_unlikely (preg->fastmap == NULL))
 474     return REG_ESPACE;
 475
 476   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 477
 478   /* If REG_NEWLINE is set, newlines are treated differently.  */
 479   if (cflags & REG_NEWLINE)
 480     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 481       syntax &= ~RE_DOT_NEWLINE;
 482       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 483       /* It also changes the matching behavior.  */
 484       preg->newline_anchor = 1;
 485     }
 486   else
 487     preg->newline_anchor = 0;
 488   preg->no_sub = !!(cflags & REG_NOSUB);
 489   preg->translate = NULL;
 490
 491   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 492
 493   /* POSIX doesn't distinguish between an unmatched open-group and an
 494      unmatched close-group: both are REG_EPAREN.  */
 495   if (ret == REG_ERPAREN)
 496     ret = REG_EPAREN;
 497
 498   /* We have already checked preg->fastmap != NULL.  */
 499   if (__glibc_likely (ret == REG_NOERROR))
 500     /* Compute the fastmap now, since regexec cannot modify the pattern
 501        buffer.  This function never fails in this implementation.  */
 502     (void) re_compile_fastmap (preg);
 503   else
 504     {
 505       /* Some error occurred while compiling the expression.  */
 506       re_free (preg->fastmap);
 507       preg->fastmap = NULL;
 508     }
 509
 510   return (int) ret;
 511 }
 512 libc_hidden_def (__regcomp)
 513 weak_alias (__regcomp, regcomp)
 514
 515 /* Returns a message corresponding to an error code, ERRCODE, returned
 516    from either regcomp or regexec.   We don't use PREG here.  */
 517
 518 size_t
 519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 520           size_t errbuf_size)
 521 {
 522   const char *msg;
 523   size_t msg_size;
 524   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
 525
 526   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
 527     /* Only error codes returned by the rest of the code should be passed
 528        to this routine.  If we are given anything else, or if other regex
 529        code generates an invalid error code, then the program has a bug.
 530        Dump core so we can fix it.  */
 531     abort ();
 532
 533   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 534
 535   msg_size = strlen (msg) + 1; /* Includes the null.  */
 536
 537   if (__glibc_likely (errbuf_size != 0))
 538     {
 539       size_t cpy_size = msg_size;
 540       if (__glibc_unlikely (msg_size > errbuf_size))
 541         {
 542           cpy_size = errbuf_size - 1;
 543           errbuf[cpy_size] = '\0';
 544         }
 545       memcpy (errbuf, msg, cpy_size);
 546     }
 547
 548   return msg_size;
 549 }
 550 weak_alias (__regerror, regerror)
 551
 552
  553 #ifdef RE_ENABLE_I18N
  554 /* This static array is used for the map to single-byte characters when
  555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
  556    it the same all the time.  UTF-8 is the preferred encoding so this is
  557    a worthwhile optimization.  */
      /* Because this object is static, free_dfa_content compares
         dfa->sb_char against it and skips re_free when they are equal.  */
  558 static const bitset_t utf8_sb_map =
  559 {
  560   /* Set the first 128 bits.  */
  561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
        /* GNU range designator: one initializer covers every word that
           holds bits 0 .. 0x7f.  */
  562   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
  563 # else
        /* Without range designators the all-ones words are listed
           explicitly; the #if ladder picks how many leading words are
           needed from the size of BITSET_WORD_BITS relative to
           ASCII_CHARS.  */
  564 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
  565 #   error "bitset_word_t is narrower than 32 bits"
  566 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
  567   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
  568 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
  569   BITSET_WORD_MAX, BITSET_WORD_MAX,
  570 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
  571   BITSET_WORD_MAX,
  572 #  endif
        /* Last explicitly initialized word.  It is BITSET_WORD_MAX shifted
           right by BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS, or left
           unshifted when SBC_MAX is a multiple of BITSET_WORD_BITS.  */
  573   (BITSET_WORD_MAX
  574    >> (SBC_MAX % BITSET_WORD_BITS == 0
  575        ? 0
  576        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
  577 # endif
  578 };
  579 #endif
 580
 581
 582 static void
 583 free_dfa_content (re_dfa_t *dfa)
 584 {
 585   Idx i, j;
 586
 587   if (dfa->nodes)
 588     for (i = 0; i < dfa->nodes_len; ++i)
 589       free_token (dfa->nodes + i);
 590   re_free (dfa->nexts);
 591   for (i = 0; i < dfa->nodes_len; ++i)
 592     {
 593       if (dfa->eclosures != NULL)
 594         re_node_set_free (dfa->eclosures + i);
 595       if (dfa->inveclosures != NULL)
 596         re_node_set_free (dfa->inveclosures + i);
 597       if (dfa->edests != NULL)
 598         re_node_set_free (dfa->edests + i);
 599     }
 600   re_free (dfa->edests);
 601   re_free (dfa->eclosures);
 602   re_free (dfa->inveclosures);
 603   re_free (dfa->nodes);
 604
 605   if (dfa->state_table)
 606     for (i = 0; i <= dfa->state_hash_mask; ++i)
 607       {
 608         struct re_state_table_entry *entry = dfa->state_table + i;
 609         for (j = 0; j < entry->num; ++j)
 610           {
 611             re_dfastate_t *state = entry->array[j];
 612             free_state (state);
 613           }
 614         re_free (entry->array);
 615       }
 616   re_free (dfa->state_table);
 617 #ifdef RE_ENABLE_I18N
 618   if (dfa->sb_char != utf8_sb_map)
 619     re_free (dfa->sb_char);
 620 #endif
 621   re_free (dfa->subexp_map);
 622 #ifdef DEBUG
 623   re_free (dfa->re_str);
 624 #endif
 625
 626   re_free (dfa);
 627 }
 628
 629
 630 /* Free dynamically allocated space used by PREG.  */
 631
 632 void
 633 regfree (regex_t *preg)
 634 {
 635   re_dfa_t *dfa = preg->buffer;
 636   if (__glibc_likely (dfa != NULL))
 637     {
 638       lock_fini (dfa->lock);
 639       free_dfa_content (dfa);
 640     }
 641   preg->buffer = NULL;
 642   preg->allocated = 0;
 643
 644   re_free (preg->fastmap);
 645   preg->fastmap = NULL;
 646
 647   re_free (preg->translate);
 648   preg->translate = NULL;
 649 }
 650 libc_hidden_def (__regfree)
 651 weak_alias (__regfree, regfree)
 652 \f
 653 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 654    them unless specifically requested.  */
 655
 656 #if defined _REGEX_RE_COMP || defined _LIBC
 657
 658 /* BSD has one and only one pattern buffer.  */
 659 static struct re_pattern_buffer re_comp_buf;
 660
 661 char *
 662 # ifdef _LIBC
 663 /* Make these definitions weak in libc, so POSIX programs can redefine
 664    these names if they don't use our functions, and still use
 665    regcomp/regexec above without link errors.  */
 666 weak_function
 667 # endif
 668 re_comp (const char *s)
 669 {
 670   reg_errcode_t ret;
 671   char *fastmap;
 672
 673   if (!s)
 674     {
 675       if (!re_comp_buf.buffer)
 676         return gettext ("No previous regular expression");
 677       return 0;
 678     }
 679
 680   if (re_comp_buf.buffer)
 681     {
 682       fastmap = re_comp_buf.fastmap;
 683       re_comp_buf.fastmap = NULL;
 684       __regfree (&re_comp_buf);
 685       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 686       re_comp_buf.fastmap = fastmap;
 687     }
 688
 689   if (re_comp_buf.fastmap == NULL)
 690     {
 691       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
 692       if (re_comp_buf.fastmap == NULL)
 693         return (char *) gettext (__re_error_msgid
 694                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 695     }
 696
 697   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 698      don't need to initialize the pattern buffer fields which affect it.  */
 699
 700   /* Match anchors at newlines.  */
 701   re_comp_buf.newline_anchor = 1;
 702
 703   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 704
 705   if (!ret)
 706     return NULL;
 707
 708   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 709   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 710 }
 711
  712 #ifdef _LIBC
      /* Release, via __regfree, the storage held by re_comp_buf, the single
         pattern buffer used by the BSD entry points.
         NOTE(review): presumably called from libc's resource-release path;
         confirm against the callers, which are not visible in this file.  */
  713 void
  714 __libc_regcomp_freemem (void)
  715 {
  716   __regfree (&re_comp_buf);
  717 }
  718 #endif
 719
 720 #endif /* _REGEX_RE_COMP */
 721 \f
 722 /* Internal entry point.
 723    Compile the regular expression PATTERN, whose length is LENGTH.
 724    SYNTAX indicate regular expression's syntax.  */
 725
 726 static reg_errcode_t
 727 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 728                      reg_syntax_t syntax)
 729 {
 730   reg_errcode_t err = REG_NOERROR;
 731   re_dfa_t *dfa;
 732   re_string_t regexp;
 733
 734   /* Initialize the pattern buffer.  */
 735   preg->fastmap_accurate = 0;
 736   preg->syntax = syntax;
 737   preg->not_bol = preg->not_eol = 0;
 738   preg->used = 0;
 739   preg->re_nsub = 0;
 740   preg->can_be_null = 0;
 741   preg->regs_allocated = REGS_UNALLOCATED;
 742
 743   /* Initialize the dfa.  */
 744   dfa = preg->buffer;
 745   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
 746     {
 747       /* If zero allocated, but buffer is non-null, try to realloc
 748          enough space.  This loses if buffer's address is bogus, but
 749          that is the user's responsibility.  If ->buffer is NULL this
 750          is a simple allocation.  */
 751       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 752       if (dfa == NULL)
 753         return REG_ESPACE;
 754       preg->allocated = sizeof (re_dfa_t);
 755       preg->buffer = dfa;
 756     }
 757   preg->used = sizeof (re_dfa_t);
 758
 759   err = init_dfa (dfa, length);
 760   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
 761     err = REG_ESPACE;
 762   if (__glibc_unlikely (err != REG_NOERROR))
 763     {
 764       free_dfa_content (dfa);
 765       preg->buffer = NULL;
 766       preg->allocated = 0;
 767       return err;
 768     }
 769 #ifdef DEBUG
 770   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 771   dfa->re_str = re_malloc (char, length + 1);
 772   strncpy (dfa->re_str, pattern, length + 1);
 773 #endif
 774
 775   err = re_string_construct (&regexp, pattern, length, preg->translate,
 776                              (syntax & RE_ICASE) != 0, dfa);
 777   if (__glibc_unlikely (err != REG_NOERROR))
 778     {
 779     re_compile_internal_free_return:
 780       free_workarea_compile (preg);
 781       re_string_destruct (&regexp);
 782       lock_fini (dfa->lock);
 783       free_dfa_content (dfa);
 784       preg->buffer = NULL;
 785       preg->allocated = 0;
 786       return err;
 787     }
 788
 789   /* Parse the regular expression, and build a structure tree.  */
 790   preg->re_nsub = 0;
 791   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 792   if (__glibc_unlikely (dfa->str_tree == NULL))
 793     goto re_compile_internal_free_return;
 794
 795   /* Analyze the tree and create the nfa.  */
 796   err = analyze (preg);
 797   if (__glibc_unlikely (err != REG_NOERROR))
 798     goto re_compile_internal_free_return;
 799
 800 #ifdef RE_ENABLE_I18N
 801   /* If possible, do searching in single byte encoding to speed things up.  */
 802   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 803     optimize_utf8 (dfa);
 804 #endif
 805
 806   /* Then create the initial state of the dfa.  */
 807   err = create_initial_state (dfa);
 808
 809   /* Release work areas.  */
 810   free_workarea_compile (preg);
 811   re_string_destruct (&regexp);
 812
 813   if (__glibc_unlikely (err != REG_NOERROR))
 814     {
 815       lock_fini (dfa->lock);
 816       free_dfa_content (dfa);
 817       preg->buffer = NULL;
 818       preg->allocated = 0;
 819     }
 820
 821   return err;
 822 }
 823
 824 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 825    as the initial length of some arrays.  */
 826
 827 static reg_errcode_t
 828 init_dfa (re_dfa_t *dfa, size_t pat_len)
 829 {
 830   __re_size_t table_size;
 831 #ifndef _LIBC
 832   const char *codeset_name;
 833 #endif
 834 #ifdef RE_ENABLE_I18N
 835   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 836 #else
 837   size_t max_i18n_object_size = 0;
 838 #endif
 839   size_t max_object_size =
 840     MAX (sizeof (struct re_state_table_entry),
 841          MAX (sizeof (re_token_t),
 842               MAX (sizeof (re_node_set),
 843                    MAX (sizeof (regmatch_t),
 844                         max_i18n_object_size))));
 845
 846   memset (dfa, '\0', sizeof (re_dfa_t));
 847
 848   /* Force allocation of str_tree_storage the first time.  */
 849   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 850
 851   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 852      calculation below, and for similar doubling calculations
 853      elsewhere.  And it's <= rather than <, because some of the
 854      doubling calculations add 1 afterwards.  */
 855   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
 856                         <= pat_len))
 857     return REG_ESPACE;
 858
 859   dfa->nodes_alloc = pat_len + 1;
 860   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 861
 862   /*  table_size = 2 ^ ceil(log pat_len) */
 863   for (table_size = 1; ; table_size <<= 1)
 864     if (table_size > pat_len)
 865       break;
 866
 867   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 868   dfa->state_hash_mask = table_size - 1;
 869
 870   dfa->mb_cur_max = MB_CUR_MAX;
 871 #ifdef _LIBC
 872   if (dfa->mb_cur_max == 6
 873       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 874     dfa->is_utf8 = 1;
 875   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 876                        != 0);
 877 #else
 878   codeset_name = nl_langinfo (CODESET);
 879   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 880       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 881       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 882       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 883     dfa->is_utf8 = 1;
 884
 885   /* We check exhaustively in the loop below if this charset is a
 886      superset of ASCII.  */
 887   dfa->map_notascii = 0;
 888 #endif
 889
 890 #ifdef RE_ENABLE_I18N
 891   if (dfa->mb_cur_max > 1)
 892     {
 893       if (dfa->is_utf8)
 894         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 895       else
 896         {
 897           int i, j, ch;
 898
 899           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 900           if (__glibc_unlikely (dfa->sb_char == NULL))
 901             return REG_ESPACE;
 902
 903           /* Set the bits corresponding to single byte chars.  */
 904           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 905             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 906               {
 907                 wint_t wch = __btowc (ch);
 908                 if (wch != WEOF)
 909                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 910 # ifndef _LIBC
 911                 if (isascii (ch) && wch != ch)
 912                   dfa->map_notascii = 1;
 913 # endif
 914               }
 915         }
 916     }
 917 #endif
 918
 919   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
 920     return REG_ESPACE;
 921   return REG_NOERROR;
 922 }
 923
 924 /* Initialize WORD_CHAR table, which indicate which character is
 925    "word".  In this case "word" means that it is the word construction
 926    character used by some operators like "\<", "\>", etc.  */
 927
 928 static void
 929 init_word_char (re_dfa_t *dfa)
 930 {
 931   int i = 0;
 932   int j;
 933   int ch = 0;
 934   dfa->word_ops_used = 1;
 935   if (__glibc_likely (dfa->map_notascii == 0))
 936     {
 937       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 938          them, an issue when this code is used in Gnulib.  */
 939       bitset_word_t bits0 = 0x00000000;
 940       bitset_word_t bits1 = 0x03ff0000;
 941       bitset_word_t bits2 = 0x87fffffe;
 942       bitset_word_t bits3 = 0x07fffffe;
 943       if (BITSET_WORD_BITS == 64)
 944         {
 945           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 946           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 947           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 948           i = 2;
 949         }
 950       else if (BITSET_WORD_BITS == 32)
 951         {
 952           dfa->word_char[0] = bits0;
 953           dfa->word_char[1] = bits1;
 954           dfa->word_char[2] = bits2;
 955           dfa->word_char[3] = bits3;
 956           i = 4;
 957         }
 958       else
 959         goto general_case;
 960       ch = 128;
 961
 962       if (__glibc_likely (dfa->is_utf8))
 963         {
 964           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 965           return;
 966         }
 967     }
 968
 969  general_case:
 970   for (; i < BITSET_WORDS; ++i)
 971     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 972       if (isalnum (ch) || ch == '_')
 973         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 974 }
 975
 976 /* Free the work area which are only used while compiling.  */
 977
 978 static void
 979 free_workarea_compile (regex_t *preg)
 980 {
 981   re_dfa_t *dfa = preg->buffer;
 982   bin_tree_storage_t *storage, *next;
 983   for (storage = dfa->str_tree_storage; storage; storage = next)
 984     {
 985       next = storage->next;
 986       re_free (storage);
 987     }
 988   dfa->str_tree_storage = NULL;
 989   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 990   dfa->str_tree = NULL;
 991   re_free (dfa->org_indices);
 992   dfa->org_indices = NULL;
 993 }
 994
 995 /* Create initial states for all contexts.  */
 996
 997 static reg_errcode_t
 998 create_initial_state (re_dfa_t *dfa)
 999 {
1000   Idx first, i;
1001   reg_errcode_t err;
1002   re_node_set init_nodes;
1003
1004   /* Initial states have the epsilon closure of the node which is
1005      the first node of the regular expression.  */
1006   first = dfa->str_tree->first->node_idx;
1007   dfa->init_node = first;
1008   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1009   if (__glibc_unlikely (err != REG_NOERROR))
1010     return err;
1011
1012   /* The back-references which are in initial states can epsilon transit,
1013      since in this case all of the subexpressions can be null.
1014      Then we add epsilon closures of the nodes which are the next nodes of
1015      the back-references.  */
1016   if (dfa->nbackref > 0)
1017     for (i = 0; i < init_nodes.nelem; ++i)
1018       {
1019         Idx node_idx = init_nodes.elems[i];
1020         re_token_type_t type = dfa->nodes[node_idx].type;
1021
1022         Idx clexp_idx;
1023         if (type != OP_BACK_REF)
1024           continue;
1025         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1026           {
1027             re_token_t *clexp_node;
1028             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1029             if (clexp_node->type == OP_CLOSE_SUBEXP
1030                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1031               break;
1032           }
1033         if (clexp_idx == init_nodes.nelem)
1034           continue;
1035
1036         if (type == OP_BACK_REF)
1037           {
1038             Idx dest_idx = dfa->edests[node_idx].elems[0];
1039             if (!re_node_set_contains (&init_nodes, dest_idx))
1040               {
1041                 reg_errcode_t merge_err
1042                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1043                 if (merge_err != REG_NOERROR)
1044                   return merge_err;
1045                 i = 0;
1046               }
1047           }
1048       }
1049
1050   /* It must be the first time to invoke acquire_state.  */
1051   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1052   /* We don't check ERR here, since the initial state must not be NULL.  */
1053   if (__glibc_unlikely (dfa->init_state == NULL))
1054     return err;
1055   if (dfa->init_state->has_constraint)
1056     {
1057       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058                                                        CONTEXT_WORD);
1059       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060                                                      CONTEXT_NEWLINE);
1061       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1062                                                          &init_nodes,
1063                                                          CONTEXT_NEWLINE
1064                                                          | CONTEXT_BEGBUF);
1065       if (__glibc_unlikely (dfa->init_state_word == NULL
1066                             || dfa->init_state_nl == NULL
1067                             || dfa->init_state_begbuf == NULL))
1068         return err;
1069     }
1070   else
1071     dfa->init_state_word = dfa->init_state_nl
1072       = dfa->init_state_begbuf = dfa->init_state;
1073
1074   re_node_set_free (&init_nodes);
1075   return REG_NOERROR;
1076 }
1077 \f
1078 #ifdef RE_ENABLE_I18N
1079 /* If it is possible to do searching in single byte encoding instead of UTF-8
1080    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1081    DFA nodes where needed.  */
1082
1083 static void
1084 optimize_utf8 (re_dfa_t *dfa)
1085 {
1086   Idx node;
1087   int i;
1088   bool mb_chars = false;
1089   bool has_period = false;
1090
1091   for (node = 0; node < dfa->nodes_len; ++node)
1092     switch (dfa->nodes[node].type)
1093       {
1094       case CHARACTER:
1095         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1096           mb_chars = true;
1097         break;
1098       case ANCHOR:
1099         switch (dfa->nodes[node].opr.ctx_type)
1100           {
1101           case LINE_FIRST:
1102           case LINE_LAST:
1103           case BUF_FIRST:
1104           case BUF_LAST:
1105             break;
1106           default:
1107             /* Word anchors etc. cannot be handled.  It's okay to test
1108                opr.ctx_type since constraints (for all DFA nodes) are
1109                created by ORing one or more opr.ctx_type values.  */
1110             return;
1111           }
1112         break;
1113       case OP_PERIOD:
1114         has_period = true;
1115         break;
1116       case OP_BACK_REF:
1117       case OP_ALT:
1118       case END_OF_RE:
1119       case OP_DUP_ASTERISK:
1120       case OP_OPEN_SUBEXP:
1121       case OP_CLOSE_SUBEXP:
1122         break;
1123       case COMPLEX_BRACKET:
1124         return;
1125       case SIMPLE_BRACKET:
1126         /* Just double check.  */
1127         {
1128           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1129                         ? 0
1130                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1131           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1132             {
1133               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1134                 return;
1135               rshift = 0;
1136             }
1137         }
1138         break;
1139       default:
1140         abort ();
1141       }
1142
1143   if (mb_chars || has_period)
1144     for (node = 0; node < dfa->nodes_len; ++node)
1145       {
1146         if (dfa->nodes[node].type == CHARACTER
1147             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1148           dfa->nodes[node].mb_partial = 0;
1149         else if (dfa->nodes[node].type == OP_PERIOD)
1150           dfa->nodes[node].type = OP_UTF8_PERIOD;
1151       }
1152
1153   /* The search can be in single byte locale.  */
1154   dfa->mb_cur_max = 1;
1155   dfa->is_utf8 = 0;
1156   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1157 }
1158 #endif
1159 \f
1160 /* Analyze the structure tree, and calculate "first", "next", "edest",
1161    "eclosure", and "inveclosure".  */
1162
1163 static reg_errcode_t
1164 analyze (regex_t *preg)
1165 {
1166   re_dfa_t *dfa = preg->buffer;
1167   reg_errcode_t ret;
1168
1169   /* Allocate arrays.  */
1170   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1171   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1172   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1173   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1174   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1175                         || dfa->edests == NULL || dfa->eclosures == NULL))
1176     return REG_ESPACE;
1177
1178   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1179   if (dfa->subexp_map != NULL)
1180     {
1181       Idx i;
1182       for (i = 0; i < preg->re_nsub; i++)
1183         dfa->subexp_map[i] = i;
1184       preorder (dfa->str_tree, optimize_subexps, dfa);
1185       for (i = 0; i < preg->re_nsub; i++)
1186         if (dfa->subexp_map[i] != i)
1187           break;
1188       if (i == preg->re_nsub)
1189         {
1190           re_free (dfa->subexp_map);
1191           dfa->subexp_map = NULL;
1192         }
1193     }
1194
1195   ret = postorder (dfa->str_tree, lower_subexps, preg);
1196   if (__glibc_unlikely (ret != REG_NOERROR))
1197     return ret;
1198   ret = postorder (dfa->str_tree, calc_first, dfa);
1199   if (__glibc_unlikely (ret != REG_NOERROR))
1200     return ret;
1201   preorder (dfa->str_tree, calc_next, dfa);
1202   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1203   if (__glibc_unlikely (ret != REG_NOERROR))
1204     return ret;
1205   ret = calc_eclosure (dfa);
1206   if (__glibc_unlikely (ret != REG_NOERROR))
1207     return ret;
1208
1209   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1210      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1211   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1212       || dfa->nbackref)
1213     {
1214       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1215       if (__glibc_unlikely (dfa->inveclosures == NULL))
1216         return REG_ESPACE;
1217       ret = calc_inveclosure (dfa);
1218     }
1219
1220   return ret;
1221 }
1222
1223 /* Our parse trees are very unbalanced, so we cannot use a stack to
1224    implement parse tree visits.  Instead, we use parent pointers and
1225    some hairy code in these two functions.  */
1226 static reg_errcode_t
1227 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1228            void *extra)
1229 {
1230   bin_tree_t *node, *prev;
1231
1232   for (node = root; ; )
1233     {
1234       /* Descend down the tree, preferably to the left (or to the right
1235          if that's the only child).  */
1236       while (node->left || node->right)
1237         if (node->left)
1238           node = node->left;
1239         else
1240           node = node->right;
1241
1242       do
1243         {
1244           reg_errcode_t err = fn (extra, node);
1245           if (__glibc_unlikely (err != REG_NOERROR))
1246             return err;
1247           if (node->parent == NULL)
1248             return REG_NOERROR;
1249           prev = node;
1250           node = node->parent;
1251         }
1252       /* Go up while we have a node that is reached from the right.  */
1253       while (node->right == prev || node->right == NULL);
1254       node = node->right;
1255     }
1256 }
1257
1258 static reg_errcode_t
1259 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1260           void *extra)
1261 {
1262   bin_tree_t *node;
1263
1264   for (node = root; ; )
1265     {
1266       reg_errcode_t err = fn (extra, node);
1267       if (__glibc_unlikely (err != REG_NOERROR))
1268         return err;
1269
1270       /* Go to the left node, or up and to the right.  */
1271       if (node->left)
1272         node = node->left;
1273       else
1274         {
1275           bin_tree_t *prev = NULL;
1276           while (node->right == prev || node->right == NULL)
1277             {
1278               prev = node;
1279               node = node->parent;
1280               if (!node)
1281                 return REG_NOERROR;
1282             }
1283           node = node->right;
1284         }
1285     }
1286 }
1287
1288 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1289    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1290    backreferences as well.  Requires a preorder visit.  */
1291 static reg_errcode_t
1292 optimize_subexps (void *extra, bin_tree_t *node)
1293 {
1294   re_dfa_t *dfa = (re_dfa_t *) extra;
1295
1296   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1297     {
1298       int idx = node->token.opr.idx;
1299       node->token.opr.idx = dfa->subexp_map[idx];
1300       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1301     }
1302
1303   else if (node->token.type == SUBEXP
1304            && node->left && node->left->token.type == SUBEXP)
1305     {
1306       Idx other_idx = node->left->token.opr.idx;
1307
1308       node->left = node->left->left;
1309       if (node->left)
1310         node->left->parent = node;
1311
1312       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1313       if (other_idx < BITSET_WORD_BITS)
1314         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1315     }
1316
1317   return REG_NOERROR;
1318 }
1319
1320 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1321    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1322 static reg_errcode_t
1323 lower_subexps (void *extra, bin_tree_t *node)
1324 {
1325   regex_t *preg = (regex_t *) extra;
1326   reg_errcode_t err = REG_NOERROR;
1327
1328   if (node->left && node->left->token.type == SUBEXP)
1329     {
1330       node->left = lower_subexp (&err, preg, node->left);
1331       if (node->left)
1332         node->left->parent = node;
1333     }
1334   if (node->right && node->right->token.type == SUBEXP)
1335     {
1336       node->right = lower_subexp (&err, preg, node->right);
1337       if (node->right)
1338         node->right->parent = node;
1339     }
1340
1341   return err;
1342 }
1343
1344 static bin_tree_t *
1345 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1346 {
1347   re_dfa_t *dfa = preg->buffer;
1348   bin_tree_t *body = node->left;
1349   bin_tree_t *op, *cls, *tree1, *tree;
1350
1351   if (preg->no_sub
1352       /* We do not optimize empty subexpressions, because otherwise we may
1353          have bad CONCAT nodes with NULL children.  This is obviously not
1354          very common, so we do not lose much.  An example that triggers
1355          this case is the sed "script" /\(\)/x.  */
1356       && node->left != NULL
1357       && (node->token.opr.idx >= BITSET_WORD_BITS
1358           || !(dfa->used_bkref_map
1359                & ((bitset_word_t) 1 << node->token.opr.idx))))
1360     return node->left;
1361
1362   /* Convert the SUBEXP node to the concatenation of an
1363      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1364   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1365   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1366   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1367   tree = create_tree (dfa, op, tree1, CONCAT);
1368   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1369                         || op == NULL || cls == NULL))
1370     {
1371       *err = REG_ESPACE;
1372       return NULL;
1373     }
1374
1375   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1376   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1377   return tree;
1378 }
1379
1380 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1381    nodes.  Requires a postorder visit.  */
1382 static reg_errcode_t
1383 calc_first (void *extra, bin_tree_t *node)
1384 {
1385   re_dfa_t *dfa = (re_dfa_t *) extra;
1386   if (node->token.type == CONCAT)
1387     {
1388       node->first = node->left->first;
1389       node->node_idx = node->left->node_idx;
1390     }
1391   else
1392     {
1393       node->first = node;
1394       node->node_idx = re_dfa_add_node (dfa, node->token);
1395       if (__glibc_unlikely (node->node_idx == -1))
1396         return REG_ESPACE;
1397       if (node->token.type == ANCHOR)
1398         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1399     }
1400   return REG_NOERROR;
1401 }
1402
1403 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1404 static reg_errcode_t
1405 calc_next (void *extra, bin_tree_t *node)
1406 {
1407   switch (node->token.type)
1408     {
1409     case OP_DUP_ASTERISK:
1410       node->left->next = node;
1411       break;
1412     case CONCAT:
1413       node->left->next = node->right->first;
1414       node->right->next = node->next;
1415       break;
1416     default:
1417       if (node->left)
1418         node->left->next = node->next;
1419       if (node->right)
1420         node->right->next = node->next;
1421       break;
1422     }
1423   return REG_NOERROR;
1424 }
1425
1426 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1427 static reg_errcode_t
1428 link_nfa_nodes (void *extra, bin_tree_t *node)
1429 {
1430   re_dfa_t *dfa = (re_dfa_t *) extra;
1431   Idx idx = node->node_idx;
1432   reg_errcode_t err = REG_NOERROR;
1433
1434   switch (node->token.type)
1435     {
1436     case CONCAT:
1437       break;
1438
1439     case END_OF_RE:
1440       DEBUG_ASSERT (node->next == NULL);
1441       break;
1442
1443     case OP_DUP_ASTERISK:
1444     case OP_ALT:
1445       {
1446         Idx left, right;
1447         dfa->has_plural_match = 1;
1448         if (node->left != NULL)
1449           left = node->left->first->node_idx;
1450         else
1451           left = node->next->node_idx;
1452         if (node->right != NULL)
1453           right = node->right->first->node_idx;
1454         else
1455           right = node->next->node_idx;
1456         DEBUG_ASSERT (left > -1);
1457         DEBUG_ASSERT (right > -1);
1458         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1459       }
1460       break;
1461
1462     case ANCHOR:
1463     case OP_OPEN_SUBEXP:
1464     case OP_CLOSE_SUBEXP:
1465       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1466       break;
1467
1468     case OP_BACK_REF:
1469       dfa->nexts[idx] = node->next->node_idx;
1470       if (node->token.type == OP_BACK_REF)
1471         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1472       break;
1473
1474     default:
1475       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1476       dfa->nexts[idx] = node->next->node_idx;
1477       break;
1478     }
1479
1480   return err;
1481 }
1482
1483 /* Duplicate the epsilon closure of the node ROOT_NODE.
1484    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1485    to their own constraint.  */
1486
1487 static reg_errcode_t
1488 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1489                         Idx root_node, unsigned int init_constraint)
1490 {
1491   Idx org_node, clone_node;
1492   bool ok;
1493   unsigned int constraint = init_constraint;
1494   for (org_node = top_org_node, clone_node = top_clone_node;;)
1495     {
1496       Idx org_dest, clone_dest;
1497       if (dfa->nodes[org_node].type == OP_BACK_REF)
1498         {
1499           /* If the back reference epsilon-transit, its destination must
1500              also have the constraint.  Then duplicate the epsilon closure
1501              of the destination of the back reference, and store it in
1502              edests of the back reference.  */
1503           org_dest = dfa->nexts[org_node];
1504           re_node_set_empty (dfa->edests + clone_node);
1505           clone_dest = duplicate_node (dfa, org_dest, constraint);
1506           if (__glibc_unlikely (clone_dest == -1))
1507             return REG_ESPACE;
1508           dfa->nexts[clone_node] = dfa->nexts[org_node];
1509           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1510           if (__glibc_unlikely (! ok))
1511             return REG_ESPACE;
1512         }
1513       else if (dfa->edests[org_node].nelem == 0)
1514         {
1515           /* In case of the node can't epsilon-transit, don't duplicate the
1516              destination and store the original destination as the
1517              destination of the node.  */
1518           dfa->nexts[clone_node] = dfa->nexts[org_node];
1519           break;
1520         }
1521       else if (dfa->edests[org_node].nelem == 1)
1522         {
1523           /* In case of the node can epsilon-transit, and it has only one
1524              destination.  */
1525           org_dest = dfa->edests[org_node].elems[0];
1526           re_node_set_empty (dfa->edests + clone_node);
1527           /* If the node is root_node itself, it means the epsilon closure
1528              has a loop.  Then tie it to the destination of the root_node.  */
1529           if (org_node == root_node && clone_node != org_node)
1530             {
1531               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1532               if (__glibc_unlikely (! ok))
1533                 return REG_ESPACE;
1534               break;
1535             }
1536           /* In case the node has another constraint, append it.  */
1537           constraint |= dfa->nodes[org_node].constraint;
1538           clone_dest = duplicate_node (dfa, org_dest, constraint);
1539           if (__glibc_unlikely (clone_dest == -1))
1540             return REG_ESPACE;
1541           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542           if (__glibc_unlikely (! ok))
1543             return REG_ESPACE;
1544         }
1545       else /* dfa->edests[org_node].nelem == 2 */
1546         {
1547           /* In case of the node can epsilon-transit, and it has two
1548              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1549           org_dest = dfa->edests[org_node].elems[0];
1550           re_node_set_empty (dfa->edests + clone_node);
1551           /* Search for a duplicated node which satisfies the constraint.  */
1552           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1553           if (clone_dest == -1)
1554             {
1555               /* There is no such duplicated node, create a new one.  */
1556               reg_errcode_t err;
1557               clone_dest = duplicate_node (dfa, org_dest, constraint);
1558               if (__glibc_unlikely (clone_dest == -1))
1559                 return REG_ESPACE;
1560               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561               if (__glibc_unlikely (! ok))
1562                 return REG_ESPACE;
1563               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1564                                             root_node, constraint);
1565               if (__glibc_unlikely (err != REG_NOERROR))
1566                 return err;
1567             }
1568           else
1569             {
1570               /* There is a duplicated node which satisfies the constraint,
1571                  use it to avoid infinite loop.  */
1572               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573               if (__glibc_unlikely (! ok))
1574                 return REG_ESPACE;
1575             }
1576
1577           org_dest = dfa->edests[org_node].elems[1];
1578           clone_dest = duplicate_node (dfa, org_dest, constraint);
1579           if (__glibc_unlikely (clone_dest == -1))
1580             return REG_ESPACE;
1581           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582           if (__glibc_unlikely (! ok))
1583             return REG_ESPACE;
1584         }
1585       org_node = org_dest;
1586       clone_node = clone_dest;
1587     }
1588   return REG_NOERROR;
1589 }
1590
1591 /* Search for a node which is duplicated from the node ORG_NODE, and
1592    satisfies the constraint CONSTRAINT.  */
1593
1594 static Idx
1595 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1596                         unsigned int constraint)
1597 {
1598   Idx idx;
1599   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1600     {
1601       if (org_node == dfa->org_indices[idx]
1602           && constraint == dfa->nodes[idx].constraint)
1603         return idx; /* Found.  */
1604     }
1605   return -1; /* Not found.  */
1606 }
1607
1608 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1609    Return the index of the new node, or -1 if insufficient storage is
1610    available.  */
1611
1612 static Idx
1613 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1614 {
1615   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1616   if (__glibc_likely (dup_idx != -1))
1617     {
1618       dfa->nodes[dup_idx].constraint = constraint;
1619       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1620       dfa->nodes[dup_idx].duplicated = 1;
1621
1622       /* Store the index of the original node.  */
1623       dfa->org_indices[dup_idx] = org_idx;
1624     }
1625   return dup_idx;
1626 }
1627
1628 static reg_errcode_t
1629 calc_inveclosure (re_dfa_t *dfa)
1630 {
1631   Idx src, idx;
1632   bool ok;
1633   for (idx = 0; idx < dfa->nodes_len; ++idx)
1634     re_node_set_init_empty (dfa->inveclosures + idx);
1635
1636   for (src = 0; src < dfa->nodes_len; ++src)
1637     {
1638       Idx *elems = dfa->eclosures[src].elems;
1639       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1640         {
1641           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1642           if (__glibc_unlikely (! ok))
1643             return REG_ESPACE;
1644         }
1645     }
1646
1647   return REG_NOERROR;
1648 }
1649
1650 /* Calculate "eclosure" for all the node in DFA.  */
1651
1652 static reg_errcode_t
1653 calc_eclosure (re_dfa_t *dfa)
1654 {
1655   Idx node_idx;
1656   bool incomplete;
1657   DEBUG_ASSERT (dfa->nodes_len > 0);
1658   incomplete = false;
1659   /* For each nodes, calculate epsilon closure.  */
1660   for (node_idx = 0; ; ++node_idx)
1661     {
1662       reg_errcode_t err;
1663       re_node_set eclosure_elem;
1664       if (node_idx == dfa->nodes_len)
1665         {
1666           if (!incomplete)
1667             break;
1668           incomplete = false;
1669           node_idx = 0;
1670         }
1671
1672       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1673
1674       /* If we have already calculated, skip it.  */
1675       if (dfa->eclosures[node_idx].nelem != 0)
1676         continue;
1677       /* Calculate epsilon closure of 'node_idx'.  */
1678       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1679       if (__glibc_unlikely (err != REG_NOERROR))
1680         return err;
1681
1682       if (dfa->eclosures[node_idx].nelem == 0)
1683         {
1684           incomplete = true;
1685           re_node_set_free (&eclosure_elem);
1686         }
1687     }
1688   return REG_NOERROR;
1689 }
1690
1691 /* Calculate epsilon closure of NODE.  */
1692
1693 static reg_errcode_t
1694 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1695 {
1696   reg_errcode_t err;
1697   Idx i;
1698   re_node_set eclosure;
1699   bool incomplete = false;
1700   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701   if (__glibc_unlikely (err != REG_NOERROR))
1702     return err;
1703
1704   /* An epsilon closure includes itself.  */
1705   eclosure.elems[eclosure.nelem++] = node;
1706
1707   /* This indicates that we are calculating this node now.
1708      We reference this value to avoid infinite loop.  */
1709   dfa->eclosures[node].nelem = -1;
1710
1711   /* If the current node has constraints, duplicate all nodes
1712      since they must inherit the constraints.  */
1713   if (dfa->nodes[node].constraint
1714       && dfa->edests[node].nelem
1715       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1716     {
1717       err = duplicate_node_closure (dfa, node, node, node,
1718                                     dfa->nodes[node].constraint);
1719       if (__glibc_unlikely (err != REG_NOERROR))
1720         return err;
1721     }
1722
1723   /* Expand each epsilon destination nodes.  */
1724   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1725     for (i = 0; i < dfa->edests[node].nelem; ++i)
1726       {
1727         re_node_set eclosure_elem;
1728         Idx edest = dfa->edests[node].elems[i];
1729         /* If calculating the epsilon closure of 'edest' is in progress,
1730            return intermediate result.  */
1731         if (dfa->eclosures[edest].nelem == -1)
1732           {
1733             incomplete = true;
1734             continue;
1735           }
1736         /* If we haven't calculated the epsilon closure of 'edest' yet,
1737            calculate now. Otherwise use calculated epsilon closure.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1741             if (__glibc_unlikely (err != REG_NOERROR))
1742               return err;
1743           }
1744         else
1745           eclosure_elem = dfa->eclosures[edest];
1746         /* Merge the epsilon closure of 'edest'.  */
1747         err = re_node_set_merge (&eclosure, &eclosure_elem);
1748         if (__glibc_unlikely (err != REG_NOERROR))
1749           return err;
1750         /* If the epsilon closure of 'edest' is incomplete,
1751            the epsilon closure of this node is also incomplete.  */
1752         if (dfa->eclosures[edest].nelem == 0)
1753           {
1754             incomplete = true;
1755             re_node_set_free (&eclosure_elem);
1756           }
1757       }
1758
1759   if (incomplete && !root)
1760     dfa->eclosures[node].nelem = 0;
1761   else
1762     dfa->eclosures[node] = eclosure;
1763   *new_set = eclosure;
1764   return REG_NOERROR;
1765 }
1766 \f
1767 /* Token-handling functions used by the parser.  */
1768
1769 /* Fetch a token from INPUT.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static void
1773 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1774 {
1775   re_string_skip_bytes (input, peek_token (result, input, syntax));
1776 }
1777
1778 /* Peek a token from INPUT, and return the length of the token.
1779    We must not use this function inside bracket expressions.  */
1780
1781 static int
1782 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783 {
1784   unsigned char c;
1785
1786   if (re_string_eoi (input))
1787     {
1788       token->type = END_OF_RE;
1789       return 0;
1790     }
1791
1792   c = re_string_peek_byte (input, 0);
1793   token->opr.c = c;
1794
1795   token->word_char = 0;
1796 #ifdef RE_ENABLE_I18N
1797   token->mb_partial = 0;
1798   if (input->mb_cur_max > 1
1799       && !re_string_first_byte (input, re_string_cur_idx (input)))
1800     {
1801       token->type = CHARACTER;
1802       token->mb_partial = 1;
1803       return 1;
1804     }
1805 #endif
1806   if (c == '\\')
1807     {
1808       unsigned char c2;
1809       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810         {
1811           token->type = BACK_SLASH;
1812           return 1;
1813         }
1814
1815       c2 = re_string_peek_byte_case (input, 1);
1816       token->opr.c = c2;
1817       token->type = CHARACTER;
1818 #ifdef RE_ENABLE_I18N
1819       if (input->mb_cur_max > 1)
1820         {
1821           wint_t wc = re_string_wchar_at (input,
1822                                           re_string_cur_idx (input) + 1);
1823           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824         }
1825       else
1826 #endif
1827         token->word_char = IS_WORD_CHAR (c2) != 0;
1828
1829       switch (c2)
1830         {
1831         case '|':
1832           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833             token->type = OP_ALT;
1834           break;
1835         case '1': case '2': case '3': case '4': case '5':
1836         case '6': case '7': case '8': case '9':
1837           if (!(syntax & RE_NO_BK_REFS))
1838             {
1839               token->type = OP_BACK_REF;
1840               token->opr.idx = c2 - '1';
1841             }
1842           break;
1843         case '<':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = WORD_FIRST;
1848             }
1849           break;
1850         case '>':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             {
1853               token->type = ANCHOR;
1854               token->opr.ctx_type = WORD_LAST;
1855             }
1856           break;
1857         case 'b':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = WORD_DELIM;
1862             }
1863           break;
1864         case 'B':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             {
1867               token->type = ANCHOR;
1868               token->opr.ctx_type = NOT_WORD_DELIM;
1869             }
1870           break;
1871         case 'w':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_WORD;
1874           break;
1875         case 'W':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_NOTWORD;
1878           break;
1879         case 's':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_SPACE;
1882           break;
1883         case 'S':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             token->type = OP_NOTSPACE;
1886           break;
1887         case '`':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             {
1890               token->type = ANCHOR;
1891               token->opr.ctx_type = BUF_FIRST;
1892             }
1893           break;
1894         case '\'':
1895           if (!(syntax & RE_NO_GNU_OPS))
1896             {
1897               token->type = ANCHOR;
1898               token->opr.ctx_type = BUF_LAST;
1899             }
1900           break;
1901         case '(':
1902           if (!(syntax & RE_NO_BK_PARENS))
1903             token->type = OP_OPEN_SUBEXP;
1904           break;
1905         case ')':
1906           if (!(syntax & RE_NO_BK_PARENS))
1907             token->type = OP_CLOSE_SUBEXP;
1908           break;
1909         case '+':
1910           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911             token->type = OP_DUP_PLUS;
1912           break;
1913         case '?':
1914           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915             token->type = OP_DUP_QUESTION;
1916           break;
1917         case '{':
1918           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919             token->type = OP_OPEN_DUP_NUM;
1920           break;
1921         case '}':
1922           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923             token->type = OP_CLOSE_DUP_NUM;
1924           break;
1925         default:
1926           break;
1927         }
1928       return 2;
1929     }
1930
1931   token->type = CHARACTER;
1932 #ifdef RE_ENABLE_I18N
1933   if (input->mb_cur_max > 1)
1934     {
1935       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937     }
1938   else
1939 #endif
1940     token->word_char = IS_WORD_CHAR (token->opr.c);
1941
1942   switch (c)
1943     {
1944     case '\n':
1945       if (syntax & RE_NEWLINE_ALT)
1946         token->type = OP_ALT;
1947       break;
1948     case '|':
1949       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950         token->type = OP_ALT;
1951       break;
1952     case '*':
1953       token->type = OP_DUP_ASTERISK;
1954       break;
1955     case '+':
1956       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957         token->type = OP_DUP_PLUS;
1958       break;
1959     case '?':
1960       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961         token->type = OP_DUP_QUESTION;
1962       break;
1963     case '{':
1964       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965         token->type = OP_OPEN_DUP_NUM;
1966       break;
1967     case '}':
1968       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969         token->type = OP_CLOSE_DUP_NUM;
1970       break;
1971     case '(':
1972       if (syntax & RE_NO_BK_PARENS)
1973         token->type = OP_OPEN_SUBEXP;
1974       break;
1975     case ')':
1976       if (syntax & RE_NO_BK_PARENS)
1977         token->type = OP_CLOSE_SUBEXP;
1978       break;
1979     case '[':
1980       token->type = OP_OPEN_BRACKET;
1981       break;
1982     case '.':
1983       token->type = OP_PERIOD;
1984       break;
1985     case '^':
1986       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1987           && re_string_cur_idx (input) != 0)
1988         {
1989           char prev = re_string_peek_byte (input, -1);
1990           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1991             break;
1992         }
1993       token->type = ANCHOR;
1994       token->opr.ctx_type = LINE_FIRST;
1995       break;
1996     case '$':
1997       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1998           && re_string_cur_idx (input) + 1 != re_string_length (input))
1999         {
2000           re_token_t next;
2001           re_string_skip_bytes (input, 1);
2002           peek_token (&next, input, syntax);
2003           re_string_skip_bytes (input, -1);
2004           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005             break;
2006         }
2007       token->type = ANCHOR;
2008       token->opr.ctx_type = LINE_LAST;
2009       break;
2010     default:
2011       break;
2012     }
2013   return 1;
2014 }
2015
2016 /* Peek a token from INPUT, and return the length of the token.
2017    We must not use this function out of bracket expressions.  */
2018
2019 static int
2020 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2021 {
2022   unsigned char c;
2023   if (re_string_eoi (input))
2024     {
2025       token->type = END_OF_RE;
2026       return 0;
2027     }
2028   c = re_string_peek_byte (input, 0);
2029   token->opr.c = c;
2030
2031 #ifdef RE_ENABLE_I18N
2032   if (input->mb_cur_max > 1
2033       && !re_string_first_byte (input, re_string_cur_idx (input)))
2034     {
2035       token->type = CHARACTER;
2036       return 1;
2037     }
2038 #endif /* RE_ENABLE_I18N */
2039
2040   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041       && re_string_cur_idx (input) + 1 < re_string_length (input))
2042     {
2043       /* In this case, '\' escape a character.  */
2044       unsigned char c2;
2045       re_string_skip_bytes (input, 1);
2046       c2 = re_string_peek_byte (input, 0);
2047       token->opr.c = c2;
2048       token->type = CHARACTER;
2049       return 1;
2050     }
2051   if (c == '[') /* '[' is a special char in a bracket exps.  */
2052     {
2053       unsigned char c2;
2054       int token_len;
2055       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056         c2 = re_string_peek_byte (input, 1);
2057       else
2058         c2 = 0;
2059       token->opr.c = c2;
2060       token_len = 2;
2061       switch (c2)
2062         {
2063         case '.':
2064           token->type = OP_OPEN_COLL_ELEM;
2065           break;
2066
2067         case '=':
2068           token->type = OP_OPEN_EQUIV_CLASS;
2069           break;
2070
2071         case ':':
2072           if (syntax & RE_CHAR_CLASSES)
2073             {
2074               token->type = OP_OPEN_CHAR_CLASS;
2075               break;
2076             }
2077           FALLTHROUGH;
2078         default:
2079           token->type = CHARACTER;
2080           token->opr.c = c;
2081           token_len = 1;
2082           break;
2083         }
2084       return token_len;
2085     }
2086   switch (c)
2087     {
2088     case '-':
2089       token->type = OP_CHARSET_RANGE;
2090       break;
2091     case ']':
2092       token->type = OP_CLOSE_BRACKET;
2093       break;
2094     case '^':
2095       token->type = OP_NON_MATCH_LIST;
2096       break;
2097     default:
2098       token->type = CHARACTER;
2099     }
2100   return 1;
2101 }
2102 \f
2103 /* Functions for parser.  */
2104
2105 /* Entry point of the parser.
2106    Parse the regular expression REGEXP and return the structure tree.
2107    If an error occurs, ERR is set by error code, and return NULL.
2108    This function build the following tree, from regular expression <reg_exp>:
2109            CAT
2110            / \
2111           /   \
2112    <reg_exp>  EOR
2113
2114    CAT means concatenation.
2115    EOR means end of regular expression.  */
2116
2117 static bin_tree_t *
2118 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2119        reg_errcode_t *err)
2120 {
2121   re_dfa_t *dfa = preg->buffer;
2122   bin_tree_t *tree, *eor, *root;
2123   re_token_t current_token;
2124   dfa->syntax = syntax;
2125   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2128     return NULL;
2129   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130   if (tree != NULL)
2131     root = create_tree (dfa, tree, eor, CONCAT);
2132   else
2133     root = eor;
2134   if (__glibc_unlikely (eor == NULL || root == NULL))
2135     {
2136       *err = REG_ESPACE;
2137       return NULL;
2138     }
2139   return root;
2140 }
2141
2142 /* This function build the following tree, from regular expression
2143    <branch1>|<branch2>:
2144            ALT
2145            / \
2146           /   \
2147    <branch1> <branch2>
2148
2149    ALT means alternative, which represents the operator '|'.  */
2150
2151 static bin_tree_t *
2152 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2154 {
2155   re_dfa_t *dfa = preg->buffer;
2156   bin_tree_t *tree, *branch = NULL;
2157   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2158   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2159   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2160     return NULL;
2161
2162   while (token->type == OP_ALT)
2163     {
2164       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2165       if (token->type != OP_ALT && token->type != END_OF_RE
2166           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2167         {
2168           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2169           dfa->completed_bkref_map = initial_bkref_map;
2170           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2171           if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2172             {
2173               if (tree != NULL)
2174                 postorder (tree, free_tree, NULL);
2175               return NULL;
2176             }
2177           dfa->completed_bkref_map |= accumulated_bkref_map;
2178         }
2179       else
2180         branch = NULL;
2181       tree = create_tree (dfa, tree, branch, OP_ALT);
2182       if (__glibc_unlikely (tree == NULL))
2183         {
2184           *err = REG_ESPACE;
2185           return NULL;
2186         }
2187     }
2188   return tree;
2189 }
2190
2191 /* This function build the following tree, from regular expression
2192    <exp1><exp2>:
2193         CAT
2194         / \
2195        /   \
2196    <exp1> <exp2>
2197
2198    CAT means concatenation.  */
2199
2200 static bin_tree_t *
2201 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2202               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2203 {
2204   bin_tree_t *tree, *expr;
2205   re_dfa_t *dfa = preg->buffer;
2206   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2207   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2208     return NULL;
2209
2210   while (token->type != OP_ALT && token->type != END_OF_RE
2211          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2212     {
2213       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2214       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2215         {
2216           if (tree != NULL)
2217             postorder (tree, free_tree, NULL);
2218           return NULL;
2219         }
2220       if (tree != NULL && expr != NULL)
2221         {
2222           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2223           if (newtree == NULL)
2224             {
2225               postorder (expr, free_tree, NULL);
2226               postorder (tree, free_tree, NULL);
2227               *err = REG_ESPACE;
2228               return NULL;
2229             }
2230           tree = newtree;
2231         }
2232       else if (tree == NULL)
2233         tree = expr;
2234       /* Otherwise expr == NULL, we don't need to create new tree.  */
2235     }
2236   return tree;
2237 }
2238
2239 /* This function build the following tree, from regular expression a*:
2240          *
2241          |
2242          a
2243 */
2244
2245 static bin_tree_t *
2246 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2247                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2248 {
2249   re_dfa_t *dfa = preg->buffer;
2250   bin_tree_t *tree;
2251   switch (token->type)
2252     {
2253     case CHARACTER:
2254       tree = create_token_tree (dfa, NULL, NULL, token);
2255       if (__glibc_unlikely (tree == NULL))
2256         {
2257           *err = REG_ESPACE;
2258           return NULL;
2259         }
2260 #ifdef RE_ENABLE_I18N
2261       if (dfa->mb_cur_max > 1)
2262         {
2263           while (!re_string_eoi (regexp)
2264                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2265             {
2266               bin_tree_t *mbc_remain;
2267               fetch_token (token, regexp, syntax);
2268               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2269               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2270               if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2271                 {
2272                   *err = REG_ESPACE;
2273                   return NULL;
2274                 }
2275             }
2276         }
2277 #endif
2278       break;
2279
2280     case OP_OPEN_SUBEXP:
2281       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2282       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2283         return NULL;
2284       break;
2285
2286     case OP_OPEN_BRACKET:
2287       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2289         return NULL;
2290       break;
2291
2292     case OP_BACK_REF:
2293       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2294         {
2295           *err = REG_ESUBREG;
2296           return NULL;
2297         }
2298       dfa->used_bkref_map |= 1 << token->opr.idx;
2299       tree = create_token_tree (dfa, NULL, NULL, token);
2300       if (__glibc_unlikely (tree == NULL))
2301         {
2302           *err = REG_ESPACE;
2303           return NULL;
2304         }
2305       ++dfa->nbackref;
2306       dfa->has_mb_node = 1;
2307       break;
2308
2309     case OP_OPEN_DUP_NUM:
2310       if (syntax & RE_CONTEXT_INVALID_DUP)
2311         {
2312           *err = REG_BADRPT;
2313           return NULL;
2314         }
2315       FALLTHROUGH;
2316     case OP_DUP_ASTERISK:
2317     case OP_DUP_PLUS:
2318     case OP_DUP_QUESTION:
2319       if (syntax & RE_CONTEXT_INVALID_OPS)
2320         {
2321           *err = REG_BADRPT;
2322           return NULL;
2323         }
2324       else if (syntax & RE_CONTEXT_INDEP_OPS)
2325         {
2326           fetch_token (token, regexp, syntax);
2327           return parse_expression (regexp, preg, token, syntax, nest, err);
2328         }
2329       FALLTHROUGH;
2330     case OP_CLOSE_SUBEXP:
2331       if ((token->type == OP_CLOSE_SUBEXP)
2332           && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2333         {
2334           *err = REG_ERPAREN;
2335           return NULL;
2336         }
2337       FALLTHROUGH;
2338     case OP_CLOSE_DUP_NUM:
2339       /* We treat it as a normal character.  */
2340
2341       /* Then we can these characters as normal characters.  */
2342       token->type = CHARACTER;
2343       /* mb_partial and word_char bits should be initialized already
2344          by peek_token.  */
2345       tree = create_token_tree (dfa, NULL, NULL, token);
2346       if (__glibc_unlikely (tree == NULL))
2347         {
2348           *err = REG_ESPACE;
2349           return NULL;
2350         }
2351       break;
2352
2353     case ANCHOR:
2354       if ((token->opr.ctx_type
2355            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2356           && dfa->word_ops_used == 0)
2357         init_word_char (dfa);
2358       if (token->opr.ctx_type == WORD_DELIM
2359           || token->opr.ctx_type == NOT_WORD_DELIM)
2360         {
2361           bin_tree_t *tree_first, *tree_last;
2362           if (token->opr.ctx_type == WORD_DELIM)
2363             {
2364               token->opr.ctx_type = WORD_FIRST;
2365               tree_first = create_token_tree (dfa, NULL, NULL, token);
2366               token->opr.ctx_type = WORD_LAST;
2367             }
2368           else
2369             {
2370               token->opr.ctx_type = INSIDE_WORD;
2371               tree_first = create_token_tree (dfa, NULL, NULL, token);
2372               token->opr.ctx_type = INSIDE_NOTWORD;
2373             }
2374           tree_last = create_token_tree (dfa, NULL, NULL, token);
2375           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2376           if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2377                                 || tree == NULL))
2378             {
2379               *err = REG_ESPACE;
2380               return NULL;
2381             }
2382         }
2383       else
2384         {
2385           tree = create_token_tree (dfa, NULL, NULL, token);
2386           if (__glibc_unlikely (tree == NULL))
2387             {
2388               *err = REG_ESPACE;
2389               return NULL;
2390             }
2391         }
2392       /* We must return here, since ANCHORs can't be followed
2393          by repetition operators.
2394          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2395              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2396       fetch_token (token, regexp, syntax);
2397       return tree;
2398
2399     case OP_PERIOD:
2400       tree = create_token_tree (dfa, NULL, NULL, token);
2401       if (__glibc_unlikely (tree == NULL))
2402         {
2403           *err = REG_ESPACE;
2404           return NULL;
2405         }
2406       if (dfa->mb_cur_max > 1)
2407         dfa->has_mb_node = 1;
2408       break;
2409
2410     case OP_WORD:
2411     case OP_NOTWORD:
2412       tree = build_charclass_op (dfa, regexp->trans,
2413                                  "alnum",
2414                                  "_",
2415                                  token->type == OP_NOTWORD, err);
2416       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2417         return NULL;
2418       break;
2419
2420     case OP_SPACE:
2421     case OP_NOTSPACE:
2422       tree = build_charclass_op (dfa, regexp->trans,
2423                                  "space",
2424                                  "",
2425                                  token->type == OP_NOTSPACE, err);
2426       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2427         return NULL;
2428       break;
2429
2430     case OP_ALT:
2431     case END_OF_RE:
2432       return NULL;
2433
2434     case BACK_SLASH:
2435       *err = REG_EESCAPE;
2436       return NULL;
2437
2438     default:
2439       /* Must not happen?  */
2440       DEBUG_ASSERT (false);
2441       return NULL;
2442     }
2443   fetch_token (token, regexp, syntax);
2444
2445   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2446          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2447     {
2448       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2449                                            syntax, err);
2450       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2451         {
2452           if (tree != NULL)
2453             postorder (tree, free_tree, NULL);
2454           return NULL;
2455         }
2456       tree = dup_tree;
2457       /* In BRE consecutive duplications are not allowed.  */
2458       if ((syntax & RE_CONTEXT_INVALID_DUP)
2459           && (token->type == OP_DUP_ASTERISK
2460               || token->type == OP_OPEN_DUP_NUM))
2461         {
2462           if (tree != NULL)
2463             postorder (tree, free_tree, NULL);
2464           *err = REG_BADRPT;
2465           return NULL;
2466         }
2467     }
2468
2469   return tree;
2470 }
2471
2472 /* This function build the following tree, from regular expression
2473    (<reg_exp>):
2474          SUBEXP
2475             |
2476         <reg_exp>
2477 */
2478
2479 static bin_tree_t *
2480 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2481                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2482 {
2483   re_dfa_t *dfa = preg->buffer;
2484   bin_tree_t *tree;
2485   size_t cur_nsub;
2486   cur_nsub = preg->re_nsub++;
2487
2488   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2489
2490   /* The subexpression may be a null string.  */
2491   if (token->type == OP_CLOSE_SUBEXP)
2492     tree = NULL;
2493   else
2494     {
2495       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2496       if (__glibc_unlikely (*err == REG_NOERROR
2497                             && token->type != OP_CLOSE_SUBEXP))
2498         {
2499           if (tree != NULL)
2500             postorder (tree, free_tree, NULL);
2501           *err = REG_EPAREN;
2502         }
2503       if (__glibc_unlikely (*err != REG_NOERROR))
2504         return NULL;
2505     }
2506
2507   if (cur_nsub <= '9' - '1')
2508     dfa->completed_bkref_map |= 1 << cur_nsub;
2509
2510   tree = create_tree (dfa, tree, NULL, SUBEXP);
2511   if (__glibc_unlikely (tree == NULL))
2512     {
2513       *err = REG_ESPACE;
2514       return NULL;
2515     }
2516   tree->token.opr.idx = cur_nsub;
2517   return tree;
2518 }
2519
2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2521
2522 static bin_tree_t *
2523 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2525 {
2526   bin_tree_t *tree = NULL, *old_tree = NULL;
2527   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528   re_token_t start_token = *token;
2529
2530   if (token->type == OP_OPEN_DUP_NUM)
2531     {
2532       end = 0;
2533       start = fetch_number (regexp, token, syntax);
2534       if (start == -1)
2535         {
2536           if (token->type == CHARACTER && token->opr.c == ',')
2537             start = 0; /* We treat "{,m}" as "{0,m}".  */
2538           else
2539             {
2540               *err = REG_BADBR; /* <re>{} is invalid.  */
2541               return NULL;
2542             }
2543         }
2544       if (__glibc_likely (start != -2))
2545         {
2546           /* We treat "{n}" as "{n,n}".  */
2547           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548                  : ((token->type == CHARACTER && token->opr.c == ',')
2549                     ? fetch_number (regexp, token, syntax) : -2));
2550         }
2551       if (__glibc_unlikely (start == -2 || end == -2))
2552         {
2553           /* Invalid sequence.  */
2554           if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2555             {
2556               if (token->type == END_OF_RE)
2557                 *err = REG_EBRACE;
2558               else
2559                 *err = REG_BADBR;
2560
2561               return NULL;
2562             }
2563
2564           /* If the syntax bit is set, rollback.  */
2565           re_string_set_index (regexp, start_idx);
2566           *token = start_token;
2567           token->type = CHARACTER;
2568           /* mb_partial and word_char bits should be already initialized by
2569              peek_token.  */
2570           return elem;
2571         }
2572
2573       if (__glibc_unlikely ((end != -1 && start > end)
2574                             || token->type != OP_CLOSE_DUP_NUM))
2575         {
2576           /* First number greater than second.  */
2577           *err = REG_BADBR;
2578           return NULL;
2579         }
2580
2581       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2582         {
2583           *err = REG_ESIZE;
2584           return NULL;
2585         }
2586     }
2587   else
2588     {
2589       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2591     }
2592
2593   fetch_token (token, regexp, syntax);
2594
2595   if (__glibc_unlikely (elem == NULL))
2596     return NULL;
2597   if (__glibc_unlikely (start == 0 && end == 0))
2598     {
2599       postorder (elem, free_tree, NULL);
2600       return NULL;
2601     }
2602
2603   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2604   if (__glibc_unlikely (start > 0))
2605     {
2606       tree = elem;
2607       for (i = 2; i <= start; ++i)
2608         {
2609           elem = duplicate_tree (elem, dfa);
2610           tree = create_tree (dfa, tree, elem, CONCAT);
2611           if (__glibc_unlikely (elem == NULL || tree == NULL))
2612             goto parse_dup_op_espace;
2613         }
2614
2615       if (start == end)
2616         return tree;
2617
2618       /* Duplicate ELEM before it is marked optional.  */
2619       elem = duplicate_tree (elem, dfa);
2620       if (__glibc_unlikely (elem == NULL))
2621         goto parse_dup_op_espace;
2622       old_tree = tree;
2623     }
2624   else
2625     old_tree = NULL;
2626
2627   if (elem->token.type == SUBEXP)
2628     {
2629       uintptr_t subidx = elem->token.opr.idx;
2630       postorder (elem, mark_opt_subexp, (void *) subidx);
2631     }
2632
2633   tree = create_tree (dfa, elem, NULL,
2634                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2635   if (__glibc_unlikely (tree == NULL))
2636     goto parse_dup_op_espace;
2637
2638   /* This loop is actually executed only when end != -1,
2639      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2640      already created the start+1-th copy.  */
2641   if (TYPE_SIGNED (Idx) || end != -1)
2642     for (i = start + 2; i <= end; ++i)
2643       {
2644         elem = duplicate_tree (elem, dfa);
2645         tree = create_tree (dfa, tree, elem, CONCAT);
2646         if (__glibc_unlikely (elem == NULL || tree == NULL))
2647           goto parse_dup_op_espace;
2648
2649         tree = create_tree (dfa, tree, NULL, OP_ALT);
2650         if (__glibc_unlikely (tree == NULL))
2651           goto parse_dup_op_espace;
2652       }
2653
2654   if (old_tree)
2655     tree = create_tree (dfa, old_tree, tree, CONCAT);
2656
2657   return tree;
2658
2659  parse_dup_op_espace:
2660   *err = REG_ESPACE;
2661   return NULL;
2662 }
2663
/* Size of the buffers that hold the name of a collating symbol,
   equivalence class or character class.  The value is a guess that is
   believed to be large enough.  */
2666 #define BRACKET_NAME_BUF_SIZE 32
2667
2668 #ifndef _LIBC
2669
2670 # ifdef RE_ENABLE_I18N
2671 /* Convert the byte B to the corresponding wide character.  In a
2672    unibyte locale, treat B as itself.  In a multibyte locale, return
2673    WEOF if B is an encoding error.  */
2674 static wint_t
2675 parse_byte (unsigned char b, re_charset_t *mbcset)
2676 {
2677   return mbcset == NULL ? b : __btowc (b);
2678 }
2679 # endif
2680
2681   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2682      Build the range expression which starts from START_ELEM, and ends
2683      at END_ELEM.  The result are written to MBCSET and SBCSET.
2684      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2685      mbcset->range_ends, is a pointer argument since we may
2686      update it.  */
2687
2688 static reg_errcode_t
2689 # ifdef RE_ENABLE_I18N
2690 build_range_exp (const reg_syntax_t syntax,
2691                  bitset_t sbcset,
2692                  re_charset_t *mbcset,
2693                  Idx *range_alloc,
2694                  const bracket_elem_t *start_elem,
2695                  const bracket_elem_t *end_elem)
2696 # else /* not RE_ENABLE_I18N */
2697 build_range_exp (const reg_syntax_t syntax,
2698                  bitset_t sbcset,
2699                  const bracket_elem_t *start_elem,
2700                  const bracket_elem_t *end_elem)
2701 # endif /* not RE_ENABLE_I18N */
2702 {
2703   unsigned int start_ch, end_ch;
2704   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2705   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2706                         || start_elem->type == CHAR_CLASS
2707                         || end_elem->type == EQUIV_CLASS
2708                         || end_elem->type == CHAR_CLASS))
2709     return REG_ERANGE;
2710
2711   /* We can handle no multi character collating elements without libc
2712      support.  */
2713   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2714                          && strlen ((char *) start_elem->opr.name) > 1)
2715                         || (end_elem->type == COLL_SYM
2716                             && strlen ((char *) end_elem->opr.name) > 1)))
2717     return REG_ECOLLATE;
2718
2719 # ifdef RE_ENABLE_I18N
2720   {
2721     wchar_t wc;
2722     wint_t start_wc;
2723     wint_t end_wc;
2724
2725     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2726                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2727                    : 0));
2728     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2729               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2730                  : 0));
2731     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2732                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2733     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2734               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2735     if (start_wc == WEOF || end_wc == WEOF)
2736       return REG_ECOLLATE;
2737     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2738                                && start_wc > end_wc))
2739       return REG_ERANGE;
2740
2741     /* Got valid collation sequence values, add them as a new entry.
2742        However, for !_LIBC we have no collation elements: if the
2743        character set is single byte, the single byte character set
2744        that we build below suffices.  parse_bracket_exp passes
2745        no MBCSET if dfa->mb_cur_max == 1.  */
2746     if (mbcset)
2747       {
2748         /* Check the space of the arrays.  */
2749         if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2750           {
2751             /* There is not enough space, need realloc.  */
2752             wchar_t *new_array_start, *new_array_end;
2753             Idx new_nranges;
2754
2755             /* +1 in case of mbcset->nranges is 0.  */
2756             new_nranges = 2 * mbcset->nranges + 1;
2757             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2758                are NULL if *range_alloc == 0.  */
2759             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2760                                           new_nranges);
2761             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2762                                         new_nranges);
2763
2764             if (__glibc_unlikely (new_array_start == NULL
2765                                   || new_array_end == NULL))
2766               {
2767                 re_free (new_array_start);
2768                 re_free (new_array_end);
2769                 return REG_ESPACE;
2770               }
2771
2772             mbcset->range_starts = new_array_start;
2773             mbcset->range_ends = new_array_end;
2774             *range_alloc = new_nranges;
2775           }
2776
2777         mbcset->range_starts[mbcset->nranges] = start_wc;
2778         mbcset->range_ends[mbcset->nranges++] = end_wc;
2779       }
2780
2781     /* Build the table for single byte characters.  */
2782     for (wc = 0; wc < SBC_MAX; ++wc)
2783       {
2784         if (start_wc <= wc && wc <= end_wc)
2785           bitset_set (sbcset, wc);
2786       }
2787   }
2788 # else /* not RE_ENABLE_I18N */
2789   {
2790     unsigned int ch;
2791     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2792                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2793                    : 0));
2794     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2795               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2796                  : 0));
2797     if (start_ch > end_ch)
2798       return REG_ERANGE;
2799     /* Build the table for single byte characters.  */
2800     for (ch = 0; ch < SBC_MAX; ++ch)
2801       if (start_ch <= ch  && ch <= end_ch)
2802         bitset_set (sbcset, ch);
2803   }
2804 # endif /* not RE_ENABLE_I18N */
2805   return REG_NOERROR;
2806 }
2807 #endif /* not _LIBC */
2808
2809 #ifndef _LIBC
2810 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2811    Build the collating element which is represented by NAME.
2812    The result are written to MBCSET and SBCSET.
2813    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2814    pointer argument since we may update it.  */
2815
2816 static reg_errcode_t
2817 # ifdef RE_ENABLE_I18N
2818 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2819                         Idx *coll_sym_alloc, const unsigned char *name)
2820 # else /* not RE_ENABLE_I18N */
2821 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2822 # endif /* not RE_ENABLE_I18N */
2823 {
2824   size_t name_len = strlen ((const char *) name);
2825   if (__glibc_unlikely (name_len != 1))
2826     return REG_ECOLLATE;
2827   else
2828     {
2829       bitset_set (sbcset, name[0]);
2830       return REG_NOERROR;
2831     }
2832 }
2833 #endif /* not _LIBC */
2834
2835 #ifdef _LIBC
/* Helper for parse_bracket_exp, used in the _LIBC environment.
   Scan the TABLE_SIZE slots of SYMB_TABLE, two words per slot, for the
   collating symbol NAME of length NAME_LEN.  A slot whose first word is
   zero is skipped.  The second word is an offset into EXTRA, where a
   length-prefixed string is followed by a second length-prefixed string;
   NAME is compared against the second one.
   Return the index of the matching slot, or -1 if there is none.  */

static inline int32_t
__attribute__ ((always_inline))
seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
                             const int32_t *symb_table, int32_t table_size,
                             const unsigned char *extra)
{
  int32_t slot;

  for (slot = 0; slot < table_size; slot++)
    {
      const unsigned char *entry;

      if (symb_table[2 * slot] == 0)
        continue;

      /* Step over the first length-prefixed string of the entry.  */
      entry = extra + symb_table[2 * slot + 1];
      entry += 1 + entry[0];

      /* ENTRY now points at the length byte of the string to compare;
         check the length first, then the bytes.  */
      if (entry[0] == name_len && memcmp (name, entry + 1, name_len) == 0)
        return slot;
    }

  return -1;
}
2864
2865 /* Local function for parse_bracket_exp used in _LIBC environment.
2866    Look up the collation sequence value of BR_ELEM.
2867    Return the value if succeeded, UINT_MAX otherwise.  */
2868
2869 static inline unsigned int
2870 __attribute__ ((always_inline))
2871 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2872                                  const unsigned char *collseqmb,
2873                                  const char *collseqwc, int32_t table_size,
2874                                  const int32_t *symb_table,
2875                                  const unsigned char *extra)
2876 {
2877   if (br_elem->type == SB_CHAR)
2878     {
2879       /* if (MB_CUR_MAX == 1) */
2880       if (nrules == 0)
2881         return collseqmb[br_elem->opr.ch];
2882       else
2883         {
2884           wint_t wc = __btowc (br_elem->opr.ch);
2885           return __collseq_table_lookup (collseqwc, wc);
2886         }
2887     }
2888   else if (br_elem->type == MB_CHAR)
2889     {
2890       if (nrules != 0)
2891         return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2892     }
2893   else if (br_elem->type == COLL_SYM)
2894     {
2895       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2896       if (nrules != 0)
2897         {
2898           int32_t elem, idx;
2899           elem = seek_collating_symbol_entry (br_elem->opr.name,
2900                                               sym_name_len,
2901                                               symb_table, table_size,
2902                                               extra);
2903           if (elem != -1)
2904             {
2905               /* We found the entry.  */
2906               idx = symb_table[2 * elem + 1];
2907               /* Skip the name of collating element name.  */
2908               idx += 1 + extra[idx];
2909               /* Skip the byte sequence of the collating element.  */
2910               idx += 1 + extra[idx];
2911               /* Adjust for the alignment.  */
2912               idx = (idx + 3) & ~3;
2913               /* Skip the multibyte collation sequence value.  */
2914               idx += sizeof (unsigned int);
2915               /* Skip the wide char sequence of the collating element.  */
2916               idx += sizeof (unsigned int) *
2917                 (1 + *(unsigned int *) (extra + idx));
2918               /* Return the collation sequence value.  */
2919               return *(unsigned int *) (extra + idx);
2920             }
2921           else if (sym_name_len == 1)
2922             {
2923               /* No valid character.  Match it as a single byte
2924                  character.  */
2925               return collseqmb[br_elem->opr.name[0]];
2926             }
2927         }
2928       else if (sym_name_len == 1)
2929         return collseqmb[br_elem->opr.name[0]];
2930     }
2931   return UINT_MAX;
2932 }
2933
2934 /* Local function for parse_bracket_exp used in _LIBC environment.
2935    Build the range expression which starts from START_ELEM, and ends
2936    at END_ELEM.  The result are written to MBCSET and SBCSET.
2937    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2938    mbcset->range_ends, is a pointer argument since we may
2939    update it.  */
2940
2941 static inline reg_errcode_t
2942 __attribute__ ((always_inline))
2943 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2944                  bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2945                  re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2946                  const unsigned char *collseqmb, const char *collseqwc,
2947                  int32_t table_size, const int32_t *symb_table,
2948                  const unsigned char *extra)
2949 {
2950   unsigned int ch;
2951   uint32_t start_collseq;
2952   uint32_t end_collseq;
2953
2954   /* Equivalence Classes and Character Classes can't be a range
2955      start/end.  */
2956   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2957                         || start_elem->type == CHAR_CLASS
2958                         || end_elem->type == EQUIV_CLASS
2959                         || end_elem->type == CHAR_CLASS))
2960     return REG_ERANGE;
2961
2962   /* FIXME: Implement rational ranges here, too.  */
2963   start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2964                                                    table_size, symb_table, extra);
2965   end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2966                                                  table_size, symb_table, extra);
2967   /* Check start/end collation sequence values.  */
2968   if (__glibc_unlikely (start_collseq == UINT_MAX
2969                         || end_collseq == UINT_MAX))
2970     return REG_ECOLLATE;
2971   if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2972                         && start_collseq > end_collseq))
2973     return REG_ERANGE;
2974
2975   /* Got valid collation sequence values, add them as a new entry.
2976      However, if we have no collation elements, and the character set
2977      is single byte, the single byte character set that we
2978      build below suffices. */
2979   if (nrules > 0 || dfa->mb_cur_max > 1)
2980     {
2981       /* Check the space of the arrays.  */
2982       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2983         {
2984           /* There is not enough space, need realloc.  */
2985           uint32_t *new_array_start;
2986           uint32_t *new_array_end;
2987           Idx new_nranges;
2988
2989           /* +1 in case of mbcset->nranges is 0.  */
2990           new_nranges = 2 * mbcset->nranges + 1;
2991           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2992                                         new_nranges);
2993           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994                                       new_nranges);
2995
2996           if (__glibc_unlikely (new_array_start == NULL
2997                                 || new_array_end == NULL))
2998             return REG_ESPACE;
2999
3000           mbcset->range_starts = new_array_start;
3001           mbcset->range_ends = new_array_end;
3002           *range_alloc = new_nranges;
3003         }
3004
3005       mbcset->range_starts[mbcset->nranges] = start_collseq;
3006       mbcset->range_ends[mbcset->nranges++] = end_collseq;
3007     }
3008
3009   /* Build the table for single byte characters.  */
3010   for (ch = 0; ch < SBC_MAX; ch++)
3011     {
3012       uint32_t ch_collseq;
3013       /* if (MB_CUR_MAX == 1) */
3014       if (nrules == 0)
3015         ch_collseq = collseqmb[ch];
3016       else
3017         ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3018       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3019         bitset_set (sbcset, ch);
3020     }
3021   return REG_NOERROR;
3022 }
3023
3024 /* Local function for parse_bracket_exp used in _LIBC environment.
3025    Build the collating element which is represented by NAME.
3026    The result are written to MBCSET and SBCSET.
3027    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3028    pointer argument since we may update it.  */
3029
3030 static inline reg_errcode_t
3031 __attribute__ ((always_inline))
3032 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3033                         Idx *coll_sym_alloc, const unsigned char *name,
3034                         uint32_t nrules, int32_t table_size,
3035                         const int32_t *symb_table, const unsigned char *extra)
3036 {
3037   int32_t elem, idx;
3038   size_t name_len = strlen ((const char *) name);
3039   if (nrules != 0)
3040     {
3041       elem = seek_collating_symbol_entry (name, name_len, symb_table,
3042                                           table_size, extra);
3043       if (elem != -1)
3044         {
3045           /* We found the entry.  */
3046           idx = symb_table[2 * elem + 1];
3047           /* Skip the name of collating element name.  */
3048           idx += 1 + extra[idx];
3049         }
3050       else if (name_len == 1)
3051         {
3052           /* No valid character, treat it as a normal
3053              character.  */
3054           bitset_set (sbcset, name[0]);
3055           return REG_NOERROR;
3056         }
3057       else
3058         return REG_ECOLLATE;
3059
3060       /* Got valid collation sequence, add it as a new entry.  */
3061       /* Check the space of the arrays.  */
3062       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3063         {
3064           /* Not enough, realloc it.  */
3065           /* +1 in case of mbcset->ncoll_syms is 0.  */
3066           Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3067           /* Use realloc since mbcset->coll_syms is NULL
3068              if *alloc == 0.  */
3069           int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070                                                new_coll_sym_alloc);
3071           if (__glibc_unlikely (new_coll_syms == NULL))
3072             return REG_ESPACE;
3073           mbcset->coll_syms = new_coll_syms;
3074           *coll_sym_alloc = new_coll_sym_alloc;
3075         }
3076       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3077       return REG_NOERROR;
3078     }
3079   else
3080     {
3081       if (__glibc_unlikely (name_len != 1))
3082         return REG_ECOLLATE;
3083       else
3084         {
3085           bitset_set (sbcset, name[0]);
3086           return REG_NOERROR;
3087         }
3088     }
3089 }
3090 #endif /* _LIBC */
3091
3092 /* This function parse bracket expression like "[abc]", "[a-c]",
3093    "[[.a-a.]]" etc.  */
3094
3095 static bin_tree_t *
3096 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3097                    reg_syntax_t syntax, reg_errcode_t *err)
3098 {
3099 #ifdef _LIBC
3100   const unsigned char *collseqmb;
3101   const char *collseqwc = NULL;
3102   uint32_t nrules;
3103   int32_t table_size = 0;
3104   const int32_t *symb_table = NULL;
3105   const unsigned char *extra = NULL;
3106 #endif
3107
3108   re_token_t br_token;
3109   re_bitset_ptr_t sbcset;
3110 #ifdef RE_ENABLE_I18N
3111   re_charset_t *mbcset;
3112   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3113   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3114 #endif /* not RE_ENABLE_I18N */
3115   bool non_match = false;
3116   bin_tree_t *work_tree;
3117   int token_len;
3118   bool first_round = true;
3119 #ifdef _LIBC
3120   collseqmb = (const unsigned char *)
3121     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3122   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123   if (nrules)
3124     {
3125       /*
3126       if (MB_CUR_MAX > 1)
3127       */
3128       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3129       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3130       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3131                                                   _NL_COLLATE_SYMB_TABLEMB);
3132       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3133                                                    _NL_COLLATE_SYMB_EXTRAMB);
3134     }
3135 #endif
3136   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3137 #ifdef RE_ENABLE_I18N
3138   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3139 #endif /* RE_ENABLE_I18N */
3140 #ifdef RE_ENABLE_I18N
3141   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3142 #else
3143   if (__glibc_unlikely (sbcset == NULL))
3144 #endif /* RE_ENABLE_I18N */
3145     {
3146       re_free (sbcset);
3147 #ifdef RE_ENABLE_I18N
3148       re_free (mbcset);
3149 #endif
3150       *err = REG_ESPACE;
3151       return NULL;
3152     }
3153
3154   token_len = peek_token_bracket (token, regexp, syntax);
3155   if (__glibc_unlikely (token->type == END_OF_RE))
3156     {
3157       *err = REG_BADPAT;
3158       goto parse_bracket_exp_free_return;
3159     }
3160   if (token->type == OP_NON_MATCH_LIST)
3161     {
3162 #ifdef RE_ENABLE_I18N
3163       mbcset->non_match = 1;
3164 #endif /* not RE_ENABLE_I18N */
3165       non_match = true;
3166       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3167         bitset_set (sbcset, '\n');
3168       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3169       token_len = peek_token_bracket (token, regexp, syntax);
3170       if (__glibc_unlikely (token->type == END_OF_RE))
3171         {
3172           *err = REG_BADPAT;
3173           goto parse_bracket_exp_free_return;
3174         }
3175     }
3176
3177   /* We treat the first ']' as a normal character.  */
3178   if (token->type == OP_CLOSE_BRACKET)
3179     token->type = CHARACTER;
3180
3181   while (1)
3182     {
3183       bracket_elem_t start_elem, end_elem;
3184       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3185       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3186       reg_errcode_t ret;
3187       int token_len2 = 0;
3188       bool is_range_exp = false;
3189       re_token_t token2;
3190
3191       start_elem.opr.name = start_name_buf;
3192       start_elem.type = COLL_SYM;
3193       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3194                                    syntax, first_round);
3195       if (__glibc_unlikely (ret != REG_NOERROR))
3196         {
3197           *err = ret;
3198           goto parse_bracket_exp_free_return;
3199         }
3200       first_round = false;
3201
3202       /* Get information about the next token.  We need it in any case.  */
3203       token_len = peek_token_bracket (token, regexp, syntax);
3204
3205       /* Do not check for ranges if we know they are not allowed.  */
3206       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3207         {
3208           if (__glibc_unlikely (token->type == END_OF_RE))
3209             {
3210               *err = REG_EBRACK;
3211               goto parse_bracket_exp_free_return;
3212             }
3213           if (token->type == OP_CHARSET_RANGE)
3214             {
3215               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3216               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3217               if (__glibc_unlikely (token2.type == END_OF_RE))
3218                 {
3219                   *err = REG_EBRACK;
3220                   goto parse_bracket_exp_free_return;
3221                 }
3222               if (token2.type == OP_CLOSE_BRACKET)
3223                 {
3224                   /* We treat the last '-' as a normal character.  */
3225                   re_string_skip_bytes (regexp, -token_len);
3226                   token->type = CHARACTER;
3227                 }
3228               else
3229                 is_range_exp = true;
3230             }
3231         }
3232
3233       if (is_range_exp == true)
3234         {
3235           end_elem.opr.name = end_name_buf;
3236           end_elem.type = COLL_SYM;
3237           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3238                                        dfa, syntax, true);
3239           if (__glibc_unlikely (ret != REG_NOERROR))
3240             {
3241               *err = ret;
3242               goto parse_bracket_exp_free_return;
3243             }
3244
3245           token_len = peek_token_bracket (token, regexp, syntax);
3246
3247 #ifdef _LIBC
3248           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3249                                   &start_elem, &end_elem,
3250                                   dfa, syntax, nrules, collseqmb, collseqwc,
3251                                   table_size, symb_table, extra);
3252 #else
3253 # ifdef RE_ENABLE_I18N
3254           *err = build_range_exp (syntax, sbcset,
3255                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3256                                   &range_alloc, &start_elem, &end_elem);
3257 # else
3258           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3259 # endif
3260 #endif /* RE_ENABLE_I18N */
3261           if (__glibc_unlikely (*err != REG_NOERROR))
3262             goto parse_bracket_exp_free_return;
3263         }
3264       else
3265         {
3266           switch (start_elem.type)
3267             {
3268             case SB_CHAR:
3269               bitset_set (sbcset, start_elem.opr.ch);
3270               break;
3271 #ifdef RE_ENABLE_I18N
3272             case MB_CHAR:
3273               /* Check whether the array has enough space.  */
3274               if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3275                 {
3276                   wchar_t *new_mbchars;
3277                   /* Not enough, realloc it.  */
3278                   /* +1 in case of mbcset->nmbchars is 0.  */
3279                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3280                   /* Use realloc since array is NULL if *alloc == 0.  */
3281                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3282                                             mbchar_alloc);
3283                   if (__glibc_unlikely (new_mbchars == NULL))
3284                     goto parse_bracket_exp_espace;
3285                   mbcset->mbchars = new_mbchars;
3286                 }
3287               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3288               break;
3289 #endif /* RE_ENABLE_I18N */
3290             case EQUIV_CLASS:
3291               *err = build_equiv_class (sbcset,
3292 #ifdef RE_ENABLE_I18N
3293                                         mbcset, &equiv_class_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295                                         start_elem.opr.name);
3296               if (__glibc_unlikely (*err != REG_NOERROR))
3297                 goto parse_bracket_exp_free_return;
3298               break;
3299             case COLL_SYM:
3300               *err = build_collating_symbol (sbcset,
3301 #ifdef RE_ENABLE_I18N
3302                                              mbcset, &coll_sym_alloc,
3303 #endif /* RE_ENABLE_I18N */
3304                                              start_elem.opr.name,
3305                                              nrules, table_size, symb_table, extra);
3306               if (__glibc_unlikely (*err != REG_NOERROR))
3307                 goto parse_bracket_exp_free_return;
3308               break;
3309             case CHAR_CLASS:
3310               *err = build_charclass (regexp->trans, sbcset,
3311 #ifdef RE_ENABLE_I18N
3312                                       mbcset, &char_class_alloc,
3313 #endif /* RE_ENABLE_I18N */
3314                                       (const char *) start_elem.opr.name,
3315                                       syntax);
3316               if (__glibc_unlikely (*err != REG_NOERROR))
3317                goto parse_bracket_exp_free_return;
3318               break;
3319             default:
3320               DEBUG_ASSERT (false);
3321               break;
3322             }
3323         }
3324       if (__glibc_unlikely (token->type == END_OF_RE))
3325         {
3326           *err = REG_EBRACK;
3327           goto parse_bracket_exp_free_return;
3328         }
3329       if (token->type == OP_CLOSE_BRACKET)
3330         break;
3331     }
3332
3333   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3334
3335   /* If it is non-matching list.  */
3336   if (non_match)
3337     bitset_not (sbcset);
3338
3339 #ifdef RE_ENABLE_I18N
3340   /* Ensure only single byte characters are set.  */
3341   if (dfa->mb_cur_max > 1)
3342     bitset_mask (sbcset, dfa->sb_char);
3343
3344   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3345       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3346                                                      || mbcset->non_match)))
3347     {
3348       bin_tree_t *mbc_tree;
3349       int sbc_idx;
3350       /* Build a tree for complex bracket.  */
3351       dfa->has_mb_node = 1;
3352       br_token.type = COMPLEX_BRACKET;
3353       br_token.opr.mbcset = mbcset;
3354       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3355       if (__glibc_unlikely (mbc_tree == NULL))
3356         goto parse_bracket_exp_espace;
3357       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3358         if (sbcset[sbc_idx])
3359           break;
3360       /* If there are no bits set in sbcset, there is no point
3361          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3362       if (sbc_idx < BITSET_WORDS)
3363         {
3364           /* Build a tree for simple bracket.  */
3365           br_token.type = SIMPLE_BRACKET;
3366           br_token.opr.sbcset = sbcset;
3367           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3368           if (__glibc_unlikely (work_tree == NULL))
3369             goto parse_bracket_exp_espace;
3370
3371           /* Then join them by ALT node.  */
3372           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3373           if (__glibc_unlikely (work_tree == NULL))
3374             goto parse_bracket_exp_espace;
3375         }
3376       else
3377         {
3378           re_free (sbcset);
3379           work_tree = mbc_tree;
3380         }
3381     }
3382   else
3383 #endif /* not RE_ENABLE_I18N */
3384     {
3385 #ifdef RE_ENABLE_I18N
3386       free_charset (mbcset);
3387 #endif
3388       /* Build a tree for simple bracket.  */
3389       br_token.type = SIMPLE_BRACKET;
3390       br_token.opr.sbcset = sbcset;
3391       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3392       if (__glibc_unlikely (work_tree == NULL))
3393         goto parse_bracket_exp_espace;
3394     }
3395   return work_tree;
3396
3397  parse_bracket_exp_espace:
3398   *err = REG_ESPACE;
3399  parse_bracket_exp_free_return:
3400   re_free (sbcset);
3401 #ifdef RE_ENABLE_I18N
3402   free_charset (mbcset);
3403 #endif /* RE_ENABLE_I18N */
3404   return NULL;
3405 }
3406
3407 /* Parse an element in the bracket expression.  */
3408
3409 static reg_errcode_t
3410 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3411                        re_token_t *token, int token_len, re_dfa_t *dfa,
3412                        reg_syntax_t syntax, bool accept_hyphen)
3413 {
3414 #ifdef RE_ENABLE_I18N
3415   int cur_char_size;
3416   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3417   if (cur_char_size > 1)
3418     {
3419       elem->type = MB_CHAR;
3420       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3421       re_string_skip_bytes (regexp, cur_char_size);
3422       return REG_NOERROR;
3423     }
3424 #endif /* RE_ENABLE_I18N */
3425   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3426   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3427       || token->type == OP_OPEN_EQUIV_CLASS)
3428     return parse_bracket_symbol (elem, regexp, token);
3429   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3430     {
3431       /* A '-' must only appear as anything but a range indicator before
3432          the closing bracket.  Everything else is an error.  */
3433       re_token_t token2;
3434       (void) peek_token_bracket (&token2, regexp, syntax);
3435       if (token2.type != OP_CLOSE_BRACKET)
3436         /* The actual error value is not standardized since this whole
3437            case is undefined.  But ERANGE makes good sense.  */
3438         return REG_ERANGE;
3439     }
3440   elem->type = SB_CHAR;
3441   elem->opr.ch = token->opr.c;
3442   return REG_NOERROR;
3443 }
3444
3445 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3446    such as [:<character_class>:], [.<collating_element>.], and
3447    [=<equivalent_class>=].  */
3448
3449 static reg_errcode_t
3450 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3451                       re_token_t *token)
3452 {
3453   unsigned char ch, delim = token->opr.c;
3454   int i = 0;
3455   if (re_string_eoi(regexp))
3456     return REG_EBRACK;
3457   for (;; ++i)
3458     {
3459       if (i >= BRACKET_NAME_BUF_SIZE)
3460         return REG_EBRACK;
3461       if (token->type == OP_OPEN_CHAR_CLASS)
3462         ch = re_string_fetch_byte_case (regexp);
3463       else
3464         ch = re_string_fetch_byte (regexp);
3465       if (re_string_eoi(regexp))
3466         return REG_EBRACK;
3467       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3468         break;
3469       elem->opr.name[i] = ch;
3470     }
3471   re_string_skip_bytes (regexp, 1);
3472   elem->opr.name[i] = '\0';
3473   switch (token->type)
3474     {
3475     case OP_OPEN_COLL_ELEM:
3476       elem->type = COLL_SYM;
3477       break;
3478     case OP_OPEN_EQUIV_CLASS:
3479       elem->type = EQUIV_CLASS;
3480       break;
3481     case OP_OPEN_CHAR_CLASS:
3482       elem->type = CHAR_CLASS;
3483       break;
3484     default:
3485       break;
3486     }
3487   return REG_NOERROR;
3488 }
3489
3490   /* Helper function for parse_bracket_exp.
3491      Build the equivalence class which is represented by NAME.
3492      The result are written to MBCSET and SBCSET.
3493      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3494      is a pointer argument since we may update it.  */
3495
3496 static reg_errcode_t
3497 #ifdef RE_ENABLE_I18N
3498 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3499                    Idx *equiv_class_alloc, const unsigned char *name)
3500 #else /* not RE_ENABLE_I18N */
3501 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3502 #endif /* not RE_ENABLE_I18N */
3503 {
3504 #ifdef _LIBC
3505   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3506   if (nrules != 0)
3507     {
3508       const int32_t *table, *indirect;
3509       const unsigned char *weights, *extra, *cp;
3510       unsigned char char_buf[2];
3511       int32_t idx1, idx2;
3512       unsigned int ch;
3513       size_t len;
3514       /* Calculate the index for equivalence class.  */
3515       cp = name;
3516       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3517       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3518                                                _NL_COLLATE_WEIGHTMB);
3519       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3520                                                    _NL_COLLATE_EXTRAMB);
3521       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3522                                                 _NL_COLLATE_INDIRECTMB);
3523       idx1 = findidx (table, indirect, extra, &cp, -1);
3524       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3525         /* This isn't a valid character.  */
3526         return REG_ECOLLATE;
3527
3528       /* Build single byte matching table for this equivalence class.  */
3529       len = weights[idx1 & 0xffffff];
3530       for (ch = 0; ch < SBC_MAX; ++ch)
3531         {
3532           char_buf[0] = ch;
3533           cp = char_buf;
3534           idx2 = findidx (table, indirect, extra, &cp, 1);
3535 /*
3536           idx2 = table[ch];
3537 */
3538           if (idx2 == 0)
3539             /* This isn't a valid character.  */
3540             continue;
3541           /* Compare only if the length matches and the collation rule
3542              index is the same.  */
3543           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3544               && memcmp (weights + (idx1 & 0xffffff) + 1,
3545                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3546             bitset_set (sbcset, ch);
3547         }
3548       /* Check whether the array has enough space.  */
3549       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3550         {
3551           /* Not enough, realloc it.  */
3552           /* +1 in case of mbcset->nequiv_classes is 0.  */
3553           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3554           /* Use realloc since the array is NULL if *alloc == 0.  */
3555           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3556                                                    int32_t,
3557                                                    new_equiv_class_alloc);
3558           if (__glibc_unlikely (new_equiv_classes == NULL))
3559             return REG_ESPACE;
3560           mbcset->equiv_classes = new_equiv_classes;
3561           *equiv_class_alloc = new_equiv_class_alloc;
3562         }
3563       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3564     }
3565   else
3566 #endif /* _LIBC */
3567     {
3568       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3569         return REG_ECOLLATE;
3570       bitset_set (sbcset, *name);
3571     }
3572   return REG_NOERROR;
3573 }
3574
3575   /* Helper function for parse_bracket_exp.
3576      Build the character class which is represented by NAME.
3577      The result are written to MBCSET and SBCSET.
3578      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3579      is a pointer argument since we may update it.  */
3580
3581 static reg_errcode_t
3582 #ifdef RE_ENABLE_I18N
3583 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3584                  re_charset_t *mbcset, Idx *char_class_alloc,
3585                  const char *class_name, reg_syntax_t syntax)
3586 #else /* not RE_ENABLE_I18N */
3587 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3588                  const char *class_name, reg_syntax_t syntax)
3589 #endif /* not RE_ENABLE_I18N */
3590 {
3591   int i;
3592   const char *name = class_name;
3593
3594   /* In case of REG_ICASE "upper" and "lower" match the both of
3595      upper and lower cases.  */
3596   if ((syntax & RE_ICASE)
3597       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3598     name = "alpha";
3599
3600 #ifdef RE_ENABLE_I18N
3601   /* Check the space of the arrays.  */
3602   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3603     {
3604       /* Not enough, realloc it.  */
3605       /* +1 in case of mbcset->nchar_classes is 0.  */
3606       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3607       /* Use realloc since array is NULL if *alloc == 0.  */
3608       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3609                                                new_char_class_alloc);
3610       if (__glibc_unlikely (new_char_classes == NULL))
3611         return REG_ESPACE;
3612       mbcset->char_classes = new_char_classes;
3613       *char_class_alloc = new_char_class_alloc;
3614     }
3615   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3616 #endif /* RE_ENABLE_I18N */
3617
3618 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3619   do {                                          \
3620     if (__glibc_unlikely (trans != NULL))                       \
3621       {                                         \
3622         for (i = 0; i < SBC_MAX; ++i)           \
3623           if (ctype_func (i))                   \
3624             bitset_set (sbcset, trans[i]);      \
3625       }                                         \
3626     else                                        \
3627       {                                         \
3628         for (i = 0; i < SBC_MAX; ++i)           \
3629           if (ctype_func (i))                   \
3630             bitset_set (sbcset, i);             \
3631       }                                         \
3632   } while (0)
3633
3634   if (strcmp (name, "alnum") == 0)
3635     BUILD_CHARCLASS_LOOP (isalnum);
3636   else if (strcmp (name, "cntrl") == 0)
3637     BUILD_CHARCLASS_LOOP (iscntrl);
3638   else if (strcmp (name, "lower") == 0)
3639     BUILD_CHARCLASS_LOOP (islower);
3640   else if (strcmp (name, "space") == 0)
3641     BUILD_CHARCLASS_LOOP (isspace);
3642   else if (strcmp (name, "alpha") == 0)
3643     BUILD_CHARCLASS_LOOP (isalpha);
3644   else if (strcmp (name, "digit") == 0)
3645     BUILD_CHARCLASS_LOOP (isdigit);
3646   else if (strcmp (name, "print") == 0)
3647     BUILD_CHARCLASS_LOOP (isprint);
3648   else if (strcmp (name, "upper") == 0)
3649     BUILD_CHARCLASS_LOOP (isupper);
3650   else if (strcmp (name, "blank") == 0)
3651     BUILD_CHARCLASS_LOOP (isblank);
3652   else if (strcmp (name, "graph") == 0)
3653     BUILD_CHARCLASS_LOOP (isgraph);
3654   else if (strcmp (name, "punct") == 0)
3655     BUILD_CHARCLASS_LOOP (ispunct);
3656   else if (strcmp (name, "xdigit") == 0)
3657     BUILD_CHARCLASS_LOOP (isxdigit);
3658   else
3659     return REG_ECTYPE;
3660
3661   return REG_NOERROR;
3662 }
3663
3664 static bin_tree_t *
3665 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3666                     const char *class_name,
3667                     const char *extra, bool non_match,
3668                     reg_errcode_t *err)
3669 {
3670   re_bitset_ptr_t sbcset;
3671 #ifdef RE_ENABLE_I18N
3672   re_charset_t *mbcset;
3673   Idx alloc = 0;
3674 #endif /* not RE_ENABLE_I18N */
3675   reg_errcode_t ret;
3676   bin_tree_t *tree;
3677
3678   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679   if (__glibc_unlikely (sbcset == NULL))
3680     {
3681       *err = REG_ESPACE;
3682       return NULL;
3683     }
3684 #ifdef RE_ENABLE_I18N
3685   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3686   if (__glibc_unlikely (mbcset == NULL))
3687     {
3688       re_free (sbcset);
3689       *err = REG_ESPACE;
3690       return NULL;
3691     }
3692   mbcset->non_match = non_match;
3693 #endif /* RE_ENABLE_I18N */
3694
3695   /* We don't care the syntax in this case.  */
3696   ret = build_charclass (trans, sbcset,
3697 #ifdef RE_ENABLE_I18N
3698                          mbcset, &alloc,
3699 #endif /* RE_ENABLE_I18N */
3700                          class_name, 0);
3701
3702   if (__glibc_unlikely (ret != REG_NOERROR))
3703     {
3704       re_free (sbcset);
3705 #ifdef RE_ENABLE_I18N
3706       free_charset (mbcset);
3707 #endif /* RE_ENABLE_I18N */
3708       *err = ret;
3709       return NULL;
3710     }
3711   /* \w match '_' also.  */
3712   for (; *extra; extra++)
3713     bitset_set (sbcset, *extra);
3714
3715   /* If it is non-matching list.  */
3716   if (non_match)
3717     bitset_not (sbcset);
3718
3719 #ifdef RE_ENABLE_I18N
3720   /* Ensure only single byte characters are set.  */
3721   if (dfa->mb_cur_max > 1)
3722     bitset_mask (sbcset, dfa->sb_char);
3723 #endif
3724
3725   /* Build a tree for simple bracket.  */
3726   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3727   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728   if (__glibc_unlikely (tree == NULL))
3729     goto build_word_op_espace;
3730
3731 #ifdef RE_ENABLE_I18N
3732   if (dfa->mb_cur_max > 1)
3733     {
3734       bin_tree_t *mbc_tree;
3735       /* Build a tree for complex bracket.  */
3736       br_token.type = COMPLEX_BRACKET;
3737       br_token.opr.mbcset = mbcset;
3738       dfa->has_mb_node = 1;
3739       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3740       if (__glibc_unlikely (mbc_tree == NULL))
3741         goto build_word_op_espace;
3742       /* Then join them by ALT node.  */
3743       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3744       if (__glibc_likely (mbc_tree != NULL))
3745         return tree;
3746     }
3747   else
3748     {
3749       free_charset (mbcset);
3750       return tree;
3751     }
3752 #else /* not RE_ENABLE_I18N */
3753   return tree;
3754 #endif /* not RE_ENABLE_I18N */
3755
3756  build_word_op_espace:
3757   re_free (sbcset);
3758 #ifdef RE_ENABLE_I18N
3759   free_charset (mbcset);
3760 #endif /* RE_ENABLE_I18N */
3761   *err = REG_ESPACE;
3762   return NULL;
3763 }
3764
3765 /* This is intended for the expressions like "a{1,3}".
3766    Fetch a number from 'input', and return the number.
3767    Return -1 if the number field is empty like "{,1}".
3768    Return RE_DUP_MAX + 1 if the number field is too large.
3769    Return -2 if an error occurred.  */
3770
3771 static Idx
3772 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3773 {
3774   Idx num = -1;
3775   unsigned char c;
3776   while (1)
3777     {
3778       fetch_token (token, input, syntax);
3779       c = token->opr.c;
3780       if (__glibc_unlikely (token->type == END_OF_RE))
3781         return -2;
3782       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3783         break;
3784       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3785              ? -2
3786              : num == -1
3787              ? c - '0'
3788              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3789     }
3790   return num;
3791 }
3792 \f
#ifdef RE_ENABLE_I18N
/* Release the multibyte character set CSET: first the arrays hanging
   off it, then the structure itself.  CSET is dereferenced
   unconditionally, so it must not be NULL.  */
static void
free_charset (re_charset_t *cset)
{
  re_free (cset->char_classes);
  re_free (cset->range_ends);
  re_free (cset->range_starts);
# ifdef _LIBC
  /* These two arrays are only released in the _LIBC build.  */
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3808 \f
3809 /* Functions for binary tree operation.  */
3810
3811 /* Create a tree node.  */
3812
3813 static bin_tree_t *
3814 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3815              re_token_type_t type)
3816 {
3817   re_token_t t = { .type = type };
3818   return create_token_tree (dfa, left, right, &t);
3819 }
3820
3821 static bin_tree_t *
3822 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823                    const re_token_t *token)
3824 {
3825   bin_tree_t *tree;
3826   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3827     {
3828       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829
3830       if (storage == NULL)
3831         return NULL;
3832       storage->next = dfa->str_tree_storage;
3833       dfa->str_tree_storage = storage;
3834       dfa->str_tree_storage_idx = 0;
3835     }
3836   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3838   tree->parent = NULL;
3839   tree->left = left;
3840   tree->right = right;
3841   tree->token = *token;
3842   tree->token.duplicated = 0;
3843   tree->token.opt_subexp = 0;
3844   tree->first = NULL;
3845   tree->next = NULL;
3846   tree->node_idx = -1;
3847
3848   if (left != NULL)
3849     left->parent = tree;
3850   if (right != NULL)
3851     right->parent = tree;
3852   return tree;
3853 }
3854
3855 /* Mark the tree SRC as an optional subexpression.
3856    To be called from preorder or postorder.  */
3857
3858 static reg_errcode_t
3859 mark_opt_subexp (void *extra, bin_tree_t *node)
3860 {
3861   Idx idx = (uintptr_t) extra;
3862   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863     node->token.opt_subexp = 1;
3864
3865   return REG_NOERROR;
3866 }
3867
3868 /* Free the allocated memory inside NODE. */
3869
3870 static void
3871 free_token (re_token_t *node)
3872 {
3873 #ifdef RE_ENABLE_I18N
3874   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875     free_charset (node->opr.mbcset);
3876   else
3877 #endif /* RE_ENABLE_I18N */
3878     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879       re_free (node->opr.sbcset);
3880 }
3881
3882 /* Worker function for tree walking.  Free the allocated memory inside NODE
3883    and its children. */
3884
3885 static reg_errcode_t
3886 free_tree (void *extra, bin_tree_t *node)
3887 {
3888   free_token (&node->token);
3889   return REG_NOERROR;
3890 }
3891
3892
3893 /* Duplicate the node SRC, and return new node.  This is a preorder
3894    visit similar to the one implemented by the generic visitor, but
3895    we need more infrastructure to maintain two parallel trees --- so,
3896    it's easier to duplicate.  */
3897
3898 static bin_tree_t *
3899 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900 {
3901   const bin_tree_t *node;
3902   bin_tree_t *dup_root;
3903   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904
3905   for (node = root; ; )
3906     {
3907       /* Create a new tree and link it back to the current parent.  */
3908       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909       if (*p_new == NULL)
3910         return NULL;
3911       (*p_new)->parent = dup_node;
3912       (*p_new)->token.duplicated = 1;
3913       dup_node = *p_new;
3914
3915       /* Go to the left node, or up and to the right.  */
3916       if (node->left)
3917         {
3918           node = node->left;
3919           p_new = &dup_node->left;
3920         }
3921       else
3922         {
3923           const bin_tree_t *prev = NULL;
3924           while (node->right == prev || node->right == NULL)
3925             {
3926               prev = node;
3927               node = node->parent;
3928               dup_node = dup_node->parent;
3929               if (!node)
3930                 return dup_root;
3931             }
3932           node = node->right;
3933           p_new = &dup_node->right;
3934         }
3935     }
3936 }