posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax);
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
 132 static const char __re_error_msgid[] =
 133   {
 134 #define REG_NOERROR_IDX 0
 135     gettext_noop ("Success")    /* REG_NOERROR */
 136     "\0"
 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 138     gettext_noop ("No match")   /* REG_NOMATCH */
 139     "\0"
 140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 142     "\0"
 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 145     "\0"
 146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 148     "\0"
 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 151     "\0"
 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 154     "\0"
 155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 157     "\0"
 158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 160     "\0"
 161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 163     "\0"
 164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 166     "\0"
 167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 169     "\0"
 170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 172     "\0"
 173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 175     "\0"
 176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 178     "\0"
 179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 181     "\0"
 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 184   };
 185
 186 static const size_t __re_error_msgid_idx[] =
 187   {
 188     REG_NOERROR_IDX,
 189     REG_NOMATCH_IDX,
 190     REG_BADPAT_IDX,
 191     REG_ECOLLATE_IDX,
 192     REG_ECTYPE_IDX,
 193     REG_EESCAPE_IDX,
 194     REG_ESUBREG_IDX,
 195     REG_EBRACK_IDX,
 196     REG_EPAREN_IDX,
 197     REG_EBRACE_IDX,
 198     REG_BADBR_IDX,
 199     REG_ERANGE_IDX,
 200     REG_ESPACE_IDX,
 201     REG_BADRPT_IDX,
 202     REG_EEND_IDX,
 203     REG_ESIZE_IDX,
 204     REG_ERPAREN_IDX
 205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
 240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 241    also be assigned to arbitrarily: each pattern buffer stores its own
 242    syntax, so it can be changed between regex compilations.  */
 243 /* This has no initializer because initialized variables in Emacs
 244    become read-only after dumping.  */
 245 reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (reg_syntax_t syntax)
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (struct re_pattern_buffer *bufp)
 269 {
 270   re_dfa_t *dfa = bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute__ ((always_inline))
 290 re_set_fastmap (char *fastmap, bool icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = bufp->buffer;
 305   Idx node_cnt;
 306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       Idx node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char buf[MB_LEN_MAX];
 319               unsigned char *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, false, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           Idx i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      'buffer' to the compiled pattern;
 437      'used' to the length of the compiled pattern;
 438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      'fastmap' to an allocated space for the fastmap;
 443      'fastmap_accurate' to zero;
 444      're_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
 468 {
 469   reg_errcode_t ret;
 470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 471                          : RE_SYNTAX_POSIX_BASIC);
 472
 473   preg->buffer = NULL;
 474   preg->allocated = 0;
 475   preg->used = 0;
 476
 477   /* Try to allocate space for the fastmap.  */
 478   preg->fastmap = re_malloc (char, SBC_MAX);
 479   if (BE (preg->fastmap == NULL, 0))
 480     return REG_ESPACE;
 481
 482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 483
 484   /* If REG_NEWLINE is set, newlines are treated differently.  */
 485   if (cflags & REG_NEWLINE)
 486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 487       syntax &= ~RE_DOT_NEWLINE;
 488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 489       /* It also changes the matching behavior.  */
 490       preg->newline_anchor = 1;
 491     }
 492   else
 493     preg->newline_anchor = 0;
 494   preg->no_sub = !!(cflags & REG_NOSUB);
 495   preg->translate = NULL;
 496
 497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 498
 499   /* POSIX doesn't distinguish between an unmatched open-group and an
 500      unmatched close-group: both are REG_EPAREN.  */
 501   if (ret == REG_ERPAREN)
 502     ret = REG_EPAREN;
 503
 504   /* We have already checked preg->fastmap != NULL.  */
 505   if (BE (ret == REG_NOERROR, 1))
 506     /* Compute the fastmap now, since regexec cannot modify the pattern
 507        buffer.  This function never fails in this implementation.  */
 508     (void) re_compile_fastmap (preg);
 509   else
 510     {
 511       /* Some error occurred while compiling the expression.  */
 512       re_free (preg->fastmap);
 513       preg->fastmap = NULL;
 514     }
 515
 516   return (int) ret;
 517 }
 518 #ifdef _LIBC
 519 libc_hidden_def (__regcomp)
 520 weak_alias (__regcomp, regcomp)
 521 #endif
 522
 523 /* Returns a message corresponding to an error code, ERRCODE, returned
 524    from either regcomp or regexec.   We don't use PREG here.  */
 525
 526 size_t
 527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
 528           size_t errbuf_size)
 529 {
 530   const char *msg;
 531   size_t msg_size;
 532
 533   if (BE (errcode < 0
 534           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 535                                / sizeof (__re_error_msgid_idx[0])), 0))
 536     /* Only error codes returned by the rest of the code should be passed
 537        to this routine.  If we are given anything else, or if other regex
 538        code generates an invalid error code, then the program has a bug.
 539        Dump core so we can fix it.  */
 540     abort ();
 541
 542   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 543
 544   msg_size = strlen (msg) + 1; /* Includes the null.  */
 545
 546   if (BE (errbuf_size != 0, 1))
 547     {
 548       size_t cpy_size = msg_size;
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551           cpy_size = errbuf_size - 1;
 552           errbuf[cpy_size] = '\0';
 553         }
 554       memcpy (errbuf, msg, cpy_size);
 555     }
 556
 557   return msg_size;
 558 }
 559 #ifdef _LIBC
 560 weak_alias (__regerror, regerror)
 561 #endif
 562
 563
 564 #ifdef RE_ENABLE_I18N
 565 /* This static array is used for the map to single-byte characters when
 566    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 567    it the same all the time.  UTF-8 is the preferred encoding so this is
 568    a worthwhile optimization.  */
 569 static const bitset_t utf8_sb_map =
 570 {
 571   /* Set the first 128 bits.  */
 572 # if defined __GNUC__ && !defined __STRICT_ANSI__
 573   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 574 # else
 575 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 576 #   error "bitset_word_t is narrower than 32 bits"
 577 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 578   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 579 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 580   BITSET_WORD_MAX, BITSET_WORD_MAX,
 581 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 582   BITSET_WORD_MAX,
 583 #  endif
 584   (BITSET_WORD_MAX
 585    >> (SBC_MAX % BITSET_WORD_BITS == 0
 586        ? 0
 587        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 588 # endif
 589 };
 590 #endif
 591
 592
 593 static void
 594 free_dfa_content (re_dfa_t *dfa)
 595 {
 596   Idx i, j;
 597
 598   if (dfa->nodes)
 599     for (i = 0; i < dfa->nodes_len; ++i)
 600       free_token (dfa->nodes + i);
 601   re_free (dfa->nexts);
 602   for (i = 0; i < dfa->nodes_len; ++i)
 603     {
 604       if (dfa->eclosures != NULL)
 605         re_node_set_free (dfa->eclosures + i);
 606       if (dfa->inveclosures != NULL)
 607         re_node_set_free (dfa->inveclosures + i);
 608       if (dfa->edests != NULL)
 609         re_node_set_free (dfa->edests + i);
 610     }
 611   re_free (dfa->edests);
 612   re_free (dfa->eclosures);
 613   re_free (dfa->inveclosures);
 614   re_free (dfa->nodes);
 615
 616   if (dfa->state_table)
 617     for (i = 0; i <= dfa->state_hash_mask; ++i)
 618       {
 619         struct re_state_table_entry *entry = dfa->state_table + i;
 620         for (j = 0; j < entry->num; ++j)
 621           {
 622             re_dfastate_t *state = entry->array[j];
 623             free_state (state);
 624           }
 625         re_free (entry->array);
 626       }
 627   re_free (dfa->state_table);
 628 #ifdef RE_ENABLE_I18N
 629   if (dfa->sb_char != utf8_sb_map)
 630     re_free (dfa->sb_char);
 631 #endif
 632   re_free (dfa->subexp_map);
 633 #ifdef DEBUG
 634   re_free (dfa->re_str);
 635 #endif
 636
 637   re_free (dfa);
 638 }
 639
 640
 641 /* Free dynamically allocated space used by PREG.  */
 642
 643 void
 644 regfree (regex_t *preg)
 645 {
 646   re_dfa_t *dfa = preg->buffer;
 647   if (BE (dfa != NULL, 1))
 648     {
 649       lock_fini (dfa->lock);
 650       free_dfa_content (dfa);
 651     }
 652   preg->buffer = NULL;
 653   preg->allocated = 0;
 654
 655   re_free (preg->fastmap);
 656   preg->fastmap = NULL;
 657
 658   re_free (preg->translate);
 659   preg->translate = NULL;
 660 }
 661 #ifdef _LIBC
 662 libc_hidden_def (__regfree)
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (const char *s)
 682 {
 683   reg_errcode_t ret;
 684   char *fastmap;
 685
 686   if (!s)
 687     {
 688       if (!re_comp_buf.buffer)
 689         return gettext ("No previous regular expression");
 690       return 0;
 691     }
 692
 693   if (re_comp_buf.buffer)
 694     {
 695       fastmap = re_comp_buf.fastmap;
 696       re_comp_buf.fastmap = NULL;
 697       __regfree (&re_comp_buf);
 698       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 699       re_comp_buf.fastmap = fastmap;
 700     }
 701
 702   if (re_comp_buf.fastmap == NULL)
 703     {
 704       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
 705       if (re_comp_buf.fastmap == NULL)
 706         return (char *) gettext (__re_error_msgid
 707                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 708     }
 709
 710   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 711      don't need to initialize the pattern buffer fields which affect it.  */
 712
 713   /* Match anchors at newlines.  */
 714   re_comp_buf.newline_anchor = 1;
 715
 716   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 717
 718   if (!ret)
 719     return NULL;
 720
 721   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 722   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 723 }
 724
 725 #ifdef _LIBC
 726 libc_freeres_fn (free_mem)
 727 {
 728   __regfree (&re_comp_buf);
 729 }
 730 #endif
 731
 732 #endif /* _REGEX_RE_COMP */
 733 \f
 734 /* Internal entry point.
 735    Compile the regular expression PATTERN, whose length is LENGTH.
 736    SYNTAX indicate regular expression's syntax.  */
 737
 738 static reg_errcode_t
 739 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 740                      reg_syntax_t syntax)
 741 {
 742   reg_errcode_t err = REG_NOERROR;
 743   re_dfa_t *dfa;
 744   re_string_t regexp;
 745
 746   /* Initialize the pattern buffer.  */
 747   preg->fastmap_accurate = 0;
 748   preg->syntax = syntax;
 749   preg->not_bol = preg->not_eol = 0;
 750   preg->used = 0;
 751   preg->re_nsub = 0;
 752   preg->can_be_null = 0;
 753   preg->regs_allocated = REGS_UNALLOCATED;
 754
 755   /* Initialize the dfa.  */
 756   dfa = preg->buffer;
 757   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 758     {
 759       /* If zero allocated, but buffer is non-null, try to realloc
 760          enough space.  This loses if buffer's address is bogus, but
 761          that is the user's responsibility.  If ->buffer is NULL this
 762          is a simple allocation.  */
 763       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 764       if (dfa == NULL)
 765         return REG_ESPACE;
 766       preg->allocated = sizeof (re_dfa_t);
 767       preg->buffer = dfa;
 768     }
 769   preg->used = sizeof (re_dfa_t);
 770
 771   err = init_dfa (dfa, length);
 772   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 773     err = REG_ESPACE;
 774   if (BE (err != REG_NOERROR, 0))
 775     {
 776       free_dfa_content (dfa);
 777       preg->buffer = NULL;
 778       preg->allocated = 0;
 779       return err;
 780     }
 781 #ifdef DEBUG
 782   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 783   dfa->re_str = re_malloc (char, length + 1);
 784   strncpy (dfa->re_str, pattern, length + 1);
 785 #endif
 786
 787   err = re_string_construct (&regexp, pattern, length, preg->translate,
 788                              (syntax & RE_ICASE) != 0, dfa);
 789   if (BE (err != REG_NOERROR, 0))
 790     {
 791     re_compile_internal_free_return:
 792       free_workarea_compile (preg);
 793       re_string_destruct (&regexp);
 794       lock_fini (dfa->lock);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807   /* Analyze the tree and create the nfa.  */
 808   err = analyze (preg);
 809   if (BE (err != REG_NOERROR, 0))
 810     goto re_compile_internal_free_return;
 811
 812 #ifdef RE_ENABLE_I18N
 813   /* If possible, do searching in single byte encoding to speed things up.  */
 814   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 815     optimize_utf8 (dfa);
 816 #endif
 817
 818   /* Then create the initial state of the dfa.  */
 819   err = create_initial_state (dfa);
 820
 821   /* Release work areas.  */
 822   free_workarea_compile (preg);
 823   re_string_destruct (&regexp);
 824
 825   if (BE (err != REG_NOERROR, 0))
 826     {
 827       lock_fini (dfa->lock);
 828       free_dfa_content (dfa);
 829       preg->buffer = NULL;
 830       preg->allocated = 0;
 831     }
 832
 833   return err;
 834 }
 835
 836 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 837    as the initial length of some arrays.  */
 838
 839 static reg_errcode_t
 840 init_dfa (re_dfa_t *dfa, size_t pat_len)
 841 {
 842   __re_size_t table_size;
 843 #ifndef _LIBC
 844   const char *codeset_name;
 845 #endif
 846 #ifdef RE_ENABLE_I18N
 847   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 848 #else
 849   size_t max_i18n_object_size = 0;
 850 #endif
 851   size_t max_object_size =
 852     MAX (sizeof (struct re_state_table_entry),
 853          MAX (sizeof (re_token_t),
 854               MAX (sizeof (re_node_set),
 855                    MAX (sizeof (regmatch_t),
 856                         max_i18n_object_size))));
 857
 858   memset (dfa, '\0', sizeof (re_dfa_t));
 859
 860   /* Force allocation of str_tree_storage the first time.  */
 861   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 862
 863   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 864      calculation below, and for similar doubling calculations
 865      elsewhere.  And it's <= rather than <, because some of the
 866      doubling calculations add 1 afterwards.  */
 867   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 868     return REG_ESPACE;
 869
 870   dfa->nodes_alloc = pat_len + 1;
 871   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 872
 873   /*  table_size = 2 ^ ceil(log pat_len) */
 874   for (table_size = 1; ; table_size <<= 1)
 875     if (table_size > pat_len)
 876       break;
 877
 878   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 879   dfa->state_hash_mask = table_size - 1;
 880
 881   dfa->mb_cur_max = MB_CUR_MAX;
 882 #ifdef _LIBC
 883   if (dfa->mb_cur_max == 6
 884       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 885     dfa->is_utf8 = 1;
 886   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 887                        != 0);
 888 #else
 889   codeset_name = nl_langinfo (CODESET);
 890   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 891       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 892       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 893       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 894     dfa->is_utf8 = 1;
 895
 896   /* We check exhaustively in the loop below if this charset is a
 897      superset of ASCII.  */
 898   dfa->map_notascii = 0;
 899 #endif
 900
 901 #ifdef RE_ENABLE_I18N
 902   if (dfa->mb_cur_max > 1)
 903     {
 904       if (dfa->is_utf8)
 905         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 906       else
 907         {
 908           int i, j, ch;
 909
 910           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 911           if (BE (dfa->sb_char == NULL, 0))
 912             return REG_ESPACE;
 913
 914           /* Set the bits corresponding to single byte chars.  */
 915           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 916             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 917               {
 918                 wint_t wch = __btowc (ch);
 919                 if (wch != WEOF)
 920                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 921 # ifndef _LIBC
 922                 if (isascii (ch) && wch != ch)
 923                   dfa->map_notascii = 1;
 924 # endif
 925               }
 926         }
 927     }
 928 #endif
 929
 930   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 931     return REG_ESPACE;
 932   return REG_NOERROR;
 933 }
 934
 935 /* Initialize WORD_CHAR table, which indicate which character is
 936    "word".  In this case "word" means that it is the word construction
 937    character used by some operators like "\<", "\>", etc.  */
 938
 939 static void
 940 init_word_char (re_dfa_t *dfa)
 941 {
 942   int i = 0;
 943   int j;
 944   int ch = 0;
 945   dfa->word_ops_used = 1;
 946   if (BE (dfa->map_notascii == 0, 1))
 947     {
 948       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 949          them, an issue when this code is used in Gnulib.  */
 950       bitset_word_t bits0 = 0x00000000;
 951       bitset_word_t bits1 = 0x03ff0000;
 952       bitset_word_t bits2 = 0x87fffffe;
 953       bitset_word_t bits3 = 0x07fffffe;
 954       if (BITSET_WORD_BITS == 64)
 955         {
 956           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 957           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 958           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 959           i = 2;
 960         }
 961       else if (BITSET_WORD_BITS == 32)
 962         {
 963           dfa->word_char[0] = bits0;
 964           dfa->word_char[1] = bits1;
 965           dfa->word_char[2] = bits2;
 966           dfa->word_char[3] = bits3;
 967           i = 4;
 968         }
 969       else
 970         goto general_case;
 971       ch = 128;
 972
 973       if (BE (dfa->is_utf8, 1))
 974         {
 975           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 976           return;
 977         }
 978     }
 979
 980  general_case:
 981   for (; i < BITSET_WORDS; ++i)
 982     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 983       if (isalnum (ch) || ch == '_')
 984         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 985 }
 986
 987 /* Free the work area which are only used while compiling.  */
 988
 989 static void
 990 free_workarea_compile (regex_t *preg)
 991 {
 992   re_dfa_t *dfa = preg->buffer;
 993   bin_tree_storage_t *storage, *next;
 994   for (storage = dfa->str_tree_storage; storage; storage = next)
 995     {
 996       next = storage->next;
 997       re_free (storage);
 998     }
 999   dfa->str_tree_storage = NULL;
1000   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001   dfa->str_tree = NULL;
1002   re_free (dfa->org_indices);
1003   dfa->org_indices = NULL;
1004 }
1005
1006 /* Create initial states for all contexts.  */
1007
1008 static reg_errcode_t
1009 create_initial_state (re_dfa_t *dfa)
1010 {
1011   Idx first, i;
1012   reg_errcode_t err;
1013   re_node_set init_nodes;
1014
1015   /* Initial states have the epsilon closure of the node which is
1016      the first node of the regular expression.  */
1017   first = dfa->str_tree->first->node_idx;
1018   dfa->init_node = first;
1019   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020   if (BE (err != REG_NOERROR, 0))
1021     return err;
1022
1023   /* The back-references which are in initial states can epsilon transit,
1024      since in this case all of the subexpressions can be null.
1025      Then we add epsilon closures of the nodes which are the next nodes of
1026      the back-references.  */
1027   if (dfa->nbackref > 0)
1028     for (i = 0; i < init_nodes.nelem; ++i)
1029       {
1030         Idx node_idx = init_nodes.elems[i];
1031         re_token_type_t type = dfa->nodes[node_idx].type;
1032
1033         Idx clexp_idx;
1034         if (type != OP_BACK_REF)
1035           continue;
1036         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1037           {
1038             re_token_t *clexp_node;
1039             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040             if (clexp_node->type == OP_CLOSE_SUBEXP
1041                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042               break;
1043           }
1044         if (clexp_idx == init_nodes.nelem)
1045           continue;
1046
1047         if (type == OP_BACK_REF)
1048           {
1049             Idx dest_idx = dfa->edests[node_idx].elems[0];
1050             if (!re_node_set_contains (&init_nodes, dest_idx))
1051               {
1052                 reg_errcode_t merge_err
1053                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054                 if (merge_err != REG_NOERROR)
1055                   return merge_err;
1056                 i = 0;
1057               }
1058           }
1059       }
1060
1061   /* It must be the first time to invoke acquire_state.  */
1062   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063   /* We don't check ERR here, since the initial state must not be NULL.  */
1064   if (BE (dfa->init_state == NULL, 0))
1065     return err;
1066   if (dfa->init_state->has_constraint)
1067     {
1068       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069                                                        CONTEXT_WORD);
1070       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071                                                      CONTEXT_NEWLINE);
1072       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073                                                          &init_nodes,
1074                                                          CONTEXT_NEWLINE
1075                                                          | CONTEXT_BEGBUF);
1076       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1077               || dfa->init_state_begbuf == NULL, 0))
1078         return err;
1079     }
1080   else
1081     dfa->init_state_word = dfa->init_state_nl
1082       = dfa->init_state_begbuf = dfa->init_state;
1083
1084   re_node_set_free (&init_nodes);
1085   return REG_NOERROR;
1086 }
1087 \f
1088 #ifdef RE_ENABLE_I18N
1089 /* If it is possible to do searching in single byte encoding instead of UTF-8
1090    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091    DFA nodes where needed.  */
1092
1093 static void
1094 optimize_utf8 (re_dfa_t *dfa)
1095 {
1096   Idx node;
1097   int i;
1098   bool mb_chars = false;
1099   bool has_period = false;
1100
1101   for (node = 0; node < dfa->nodes_len; ++node)
1102     switch (dfa->nodes[node].type)
1103       {
1104       case CHARACTER:
1105         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106           mb_chars = true;
1107         break;
1108       case ANCHOR:
1109         switch (dfa->nodes[node].opr.ctx_type)
1110           {
1111           case LINE_FIRST:
1112           case LINE_LAST:
1113           case BUF_FIRST:
1114           case BUF_LAST:
1115             break;
1116           default:
1117             /* Word anchors etc. cannot be handled.  It's okay to test
1118                opr.ctx_type since constraints (for all DFA nodes) are
1119                created by ORing one or more opr.ctx_type values.  */
1120             return;
1121           }
1122         break;
1123       case OP_PERIOD:
1124         has_period = true;
1125         break;
1126       case OP_BACK_REF:
1127       case OP_ALT:
1128       case END_OF_RE:
1129       case OP_DUP_ASTERISK:
1130       case OP_OPEN_SUBEXP:
1131       case OP_CLOSE_SUBEXP:
1132         break;
1133       case COMPLEX_BRACKET:
1134         return;
1135       case SIMPLE_BRACKET:
1136         /* Just double check.  */
1137         {
1138           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139                         ? 0
1140                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142             {
1143               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144                 return;
1145               rshift = 0;
1146             }
1147         }
1148         break;
1149       default:
1150         abort ();
1151       }
1152
1153   if (mb_chars || has_period)
1154     for (node = 0; node < dfa->nodes_len; ++node)
1155       {
1156         if (dfa->nodes[node].type == CHARACTER
1157             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1158           dfa->nodes[node].mb_partial = 0;
1159         else if (dfa->nodes[node].type == OP_PERIOD)
1160           dfa->nodes[node].type = OP_UTF8_PERIOD;
1161       }
1162
1163   /* The search can be in single byte locale.  */
1164   dfa->mb_cur_max = 1;
1165   dfa->is_utf8 = 0;
1166   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1167 }
1168 #endif
1169 \f
1170 /* Analyze the structure tree, and calculate "first", "next", "edest",
1171    "eclosure", and "inveclosure".  */
1172
1173 static reg_errcode_t
1174 analyze (regex_t *preg)
1175 {
1176   re_dfa_t *dfa = preg->buffer;
1177   reg_errcode_t ret;
1178
1179   /* Allocate arrays.  */
1180   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1181   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1182   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1183   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1184   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1185           || dfa->eclosures == NULL, 0))
1186     return REG_ESPACE;
1187
1188   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1189   if (dfa->subexp_map != NULL)
1190     {
1191       Idx i;
1192       for (i = 0; i < preg->re_nsub; i++)
1193         dfa->subexp_map[i] = i;
1194       preorder (dfa->str_tree, optimize_subexps, dfa);
1195       for (i = 0; i < preg->re_nsub; i++)
1196         if (dfa->subexp_map[i] != i)
1197           break;
1198       if (i == preg->re_nsub)
1199         {
1200           re_free (dfa->subexp_map);
1201           dfa->subexp_map = NULL;
1202         }
1203     }
1204
1205   ret = postorder (dfa->str_tree, lower_subexps, preg);
1206   if (BE (ret != REG_NOERROR, 0))
1207     return ret;
1208   ret = postorder (dfa->str_tree, calc_first, dfa);
1209   if (BE (ret != REG_NOERROR, 0))
1210     return ret;
1211   preorder (dfa->str_tree, calc_next, dfa);
1212   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1213   if (BE (ret != REG_NOERROR, 0))
1214     return ret;
1215   ret = calc_eclosure (dfa);
1216   if (BE (ret != REG_NOERROR, 0))
1217     return ret;
1218
1219   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1220      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1221   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1222       || dfa->nbackref)
1223     {
1224       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1225       if (BE (dfa->inveclosures == NULL, 0))
1226         return REG_ESPACE;
1227       ret = calc_inveclosure (dfa);
1228     }
1229
1230   return ret;
1231 }
1232
1233 /* Our parse trees are very unbalanced, so we cannot use a stack to
1234    implement parse tree visits.  Instead, we use parent pointers and
1235    some hairy code in these two functions.  */
1236 static reg_errcode_t
1237 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238            void *extra)
1239 {
1240   bin_tree_t *node, *prev;
1241
1242   for (node = root; ; )
1243     {
1244       /* Descend down the tree, preferably to the left (or to the right
1245          if that's the only child).  */
1246       while (node->left || node->right)
1247         if (node->left)
1248           node = node->left;
1249         else
1250           node = node->right;
1251
1252       do
1253         {
1254           reg_errcode_t err = fn (extra, node);
1255           if (BE (err != REG_NOERROR, 0))
1256             return err;
1257           if (node->parent == NULL)
1258             return REG_NOERROR;
1259           prev = node;
1260           node = node->parent;
1261         }
1262       /* Go up while we have a node that is reached from the right.  */
1263       while (node->right == prev || node->right == NULL);
1264       node = node->right;
1265     }
1266 }
1267
1268 static reg_errcode_t
1269 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1270           void *extra)
1271 {
1272   bin_tree_t *node;
1273
1274   for (node = root; ; )
1275     {
1276       reg_errcode_t err = fn (extra, node);
1277       if (BE (err != REG_NOERROR, 0))
1278         return err;
1279
1280       /* Go to the left node, or up and to the right.  */
1281       if (node->left)
1282         node = node->left;
1283       else
1284         {
1285           bin_tree_t *prev = NULL;
1286           while (node->right == prev || node->right == NULL)
1287             {
1288               prev = node;
1289               node = node->parent;
1290               if (!node)
1291                 return REG_NOERROR;
1292             }
1293           node = node->right;
1294         }
1295     }
1296 }
1297
1298 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1299    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1300    backreferences as well.  Requires a preorder visit.  */
1301 static reg_errcode_t
1302 optimize_subexps (void *extra, bin_tree_t *node)
1303 {
1304   re_dfa_t *dfa = (re_dfa_t *) extra;
1305
1306   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1307     {
1308       int idx = node->token.opr.idx;
1309       node->token.opr.idx = dfa->subexp_map[idx];
1310       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1311     }
1312
1313   else if (node->token.type == SUBEXP
1314            && node->left && node->left->token.type == SUBEXP)
1315     {
1316       Idx other_idx = node->left->token.opr.idx;
1317
1318       node->left = node->left->left;
1319       if (node->left)
1320         node->left->parent = node;
1321
1322       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1323       if (other_idx < BITSET_WORD_BITS)
1324         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1325     }
1326
1327   return REG_NOERROR;
1328 }
1329
1330 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1331    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1332 static reg_errcode_t
1333 lower_subexps (void *extra, bin_tree_t *node)
1334 {
1335   regex_t *preg = (regex_t *) extra;
1336   reg_errcode_t err = REG_NOERROR;
1337
1338   if (node->left && node->left->token.type == SUBEXP)
1339     {
1340       node->left = lower_subexp (&err, preg, node->left);
1341       if (node->left)
1342         node->left->parent = node;
1343     }
1344   if (node->right && node->right->token.type == SUBEXP)
1345     {
1346       node->right = lower_subexp (&err, preg, node->right);
1347       if (node->right)
1348         node->right->parent = node;
1349     }
1350
1351   return err;
1352 }
1353
1354 static bin_tree_t *
1355 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1356 {
1357   re_dfa_t *dfa = preg->buffer;
1358   bin_tree_t *body = node->left;
1359   bin_tree_t *op, *cls, *tree1, *tree;
1360
1361   if (preg->no_sub
1362       /* We do not optimize empty subexpressions, because otherwise we may
1363          have bad CONCAT nodes with NULL children.  This is obviously not
1364          very common, so we do not lose much.  An example that triggers
1365          this case is the sed "script" /\(\)/x.  */
1366       && node->left != NULL
1367       && (node->token.opr.idx >= BITSET_WORD_BITS
1368           || !(dfa->used_bkref_map
1369                & ((bitset_word_t) 1 << node->token.opr.idx))))
1370     return node->left;
1371
1372   /* Convert the SUBEXP node to the concatenation of an
1373      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1374   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1375   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1376   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1377   tree = create_tree (dfa, op, tree1, CONCAT);
1378   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1379     {
1380       *err = REG_ESPACE;
1381       return NULL;
1382     }
1383
1384   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1385   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1386   return tree;
1387 }
1388
1389 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1390    nodes.  Requires a postorder visit.  */
1391 static reg_errcode_t
1392 calc_first (void *extra, bin_tree_t *node)
1393 {
1394   re_dfa_t *dfa = (re_dfa_t *) extra;
1395   if (node->token.type == CONCAT)
1396     {
1397       node->first = node->left->first;
1398       node->node_idx = node->left->node_idx;
1399     }
1400   else
1401     {
1402       node->first = node;
1403       node->node_idx = re_dfa_add_node (dfa, node->token);
1404       if (BE (node->node_idx == -1, 0))
1405         return REG_ESPACE;
1406       if (node->token.type == ANCHOR)
1407         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1408     }
1409   return REG_NOERROR;
1410 }
1411
1412 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1413 static reg_errcode_t
1414 calc_next (void *extra, bin_tree_t *node)
1415 {
1416   switch (node->token.type)
1417     {
1418     case OP_DUP_ASTERISK:
1419       node->left->next = node;
1420       break;
1421     case CONCAT:
1422       node->left->next = node->right->first;
1423       node->right->next = node->next;
1424       break;
1425     default:
1426       if (node->left)
1427         node->left->next = node->next;
1428       if (node->right)
1429         node->right->next = node->next;
1430       break;
1431     }
1432   return REG_NOERROR;
1433 }
1434
1435 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1436 static reg_errcode_t
1437 link_nfa_nodes (void *extra, bin_tree_t *node)
1438 {
1439   re_dfa_t *dfa = (re_dfa_t *) extra;
1440   Idx idx = node->node_idx;
1441   reg_errcode_t err = REG_NOERROR;
1442
1443   switch (node->token.type)
1444     {
1445     case CONCAT:
1446       break;
1447
1448     case END_OF_RE:
1449       assert (node->next == NULL);
1450       break;
1451
1452     case OP_DUP_ASTERISK:
1453     case OP_ALT:
1454       {
1455         Idx left, right;
1456         dfa->has_plural_match = 1;
1457         if (node->left != NULL)
1458           left = node->left->first->node_idx;
1459         else
1460           left = node->next->node_idx;
1461         if (node->right != NULL)
1462           right = node->right->first->node_idx;
1463         else
1464           right = node->next->node_idx;
1465         assert (left > -1);
1466         assert (right > -1);
1467         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1468       }
1469       break;
1470
1471     case ANCHOR:
1472     case OP_OPEN_SUBEXP:
1473     case OP_CLOSE_SUBEXP:
1474       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1475       break;
1476
1477     case OP_BACK_REF:
1478       dfa->nexts[idx] = node->next->node_idx;
1479       if (node->token.type == OP_BACK_REF)
1480         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1481       break;
1482
1483     default:
1484       assert (!IS_EPSILON_NODE (node->token.type));
1485       dfa->nexts[idx] = node->next->node_idx;
1486       break;
1487     }
1488
1489   return err;
1490 }
1491
1492 /* Duplicate the epsilon closure of the node ROOT_NODE.
1493    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1494    to their own constraint.  */
1495
1496 static reg_errcode_t
1497 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1498                         Idx root_node, unsigned int init_constraint)
1499 {
1500   Idx org_node, clone_node;
1501   bool ok;
1502   unsigned int constraint = init_constraint;
1503   for (org_node = top_org_node, clone_node = top_clone_node;;)
1504     {
1505       Idx org_dest, clone_dest;
1506       if (dfa->nodes[org_node].type == OP_BACK_REF)
1507         {
1508           /* If the back reference epsilon-transit, its destination must
1509              also have the constraint.  Then duplicate the epsilon closure
1510              of the destination of the back reference, and store it in
1511              edests of the back reference.  */
1512           org_dest = dfa->nexts[org_node];
1513           re_node_set_empty (dfa->edests + clone_node);
1514           clone_dest = duplicate_node (dfa, org_dest, constraint);
1515           if (BE (clone_dest == -1, 0))
1516             return REG_ESPACE;
1517           dfa->nexts[clone_node] = dfa->nexts[org_node];
1518           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519           if (BE (! ok, 0))
1520             return REG_ESPACE;
1521         }
1522       else if (dfa->edests[org_node].nelem == 0)
1523         {
1524           /* In case of the node can't epsilon-transit, don't duplicate the
1525              destination and store the original destination as the
1526              destination of the node.  */
1527           dfa->nexts[clone_node] = dfa->nexts[org_node];
1528           break;
1529         }
1530       else if (dfa->edests[org_node].nelem == 1)
1531         {
1532           /* In case of the node can epsilon-transit, and it has only one
1533              destination.  */
1534           org_dest = dfa->edests[org_node].elems[0];
1535           re_node_set_empty (dfa->edests + clone_node);
1536           /* If the node is root_node itself, it means the epsilon closure
1537              has a loop.  Then tie it to the destination of the root_node.  */
1538           if (org_node == root_node && clone_node != org_node)
1539             {
1540               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1541               if (BE (! ok, 0))
1542                 return REG_ESPACE;
1543               break;
1544             }
1545           /* In case the node has another constraint, append it.  */
1546           constraint |= dfa->nodes[org_node].constraint;
1547           clone_dest = duplicate_node (dfa, org_dest, constraint);
1548           if (BE (clone_dest == -1, 0))
1549             return REG_ESPACE;
1550           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551           if (BE (! ok, 0))
1552             return REG_ESPACE;
1553         }
1554       else /* dfa->edests[org_node].nelem == 2 */
1555         {
1556           /* In case of the node can epsilon-transit, and it has two
1557              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1558           org_dest = dfa->edests[org_node].elems[0];
1559           re_node_set_empty (dfa->edests + clone_node);
1560           /* Search for a duplicated node which satisfies the constraint.  */
1561           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1562           if (clone_dest == -1)
1563             {
1564               /* There is no such duplicated node, create a new one.  */
1565               reg_errcode_t err;
1566               clone_dest = duplicate_node (dfa, org_dest, constraint);
1567               if (BE (clone_dest == -1, 0))
1568                 return REG_ESPACE;
1569               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570               if (BE (! ok, 0))
1571                 return REG_ESPACE;
1572               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1573                                             root_node, constraint);
1574               if (BE (err != REG_NOERROR, 0))
1575                 return err;
1576             }
1577           else
1578             {
1579               /* There is a duplicated node which satisfies the constraint,
1580                  use it to avoid infinite loop.  */
1581               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582               if (BE (! ok, 0))
1583                 return REG_ESPACE;
1584             }
1585
1586           org_dest = dfa->edests[org_node].elems[1];
1587           clone_dest = duplicate_node (dfa, org_dest, constraint);
1588           if (BE (clone_dest == -1, 0))
1589             return REG_ESPACE;
1590           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591           if (BE (! ok, 0))
1592             return REG_ESPACE;
1593         }
1594       org_node = org_dest;
1595       clone_node = clone_dest;
1596     }
1597   return REG_NOERROR;
1598 }
1599
1600 /* Search for a node which is duplicated from the node ORG_NODE, and
1601    satisfies the constraint CONSTRAINT.  */
1602
1603 static Idx
1604 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1605                         unsigned int constraint)
1606 {
1607   Idx idx;
1608   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1609     {
1610       if (org_node == dfa->org_indices[idx]
1611           && constraint == dfa->nodes[idx].constraint)
1612         return idx; /* Found.  */
1613     }
1614   return -1; /* Not found.  */
1615 }
1616
1617 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1618    Return the index of the new node, or -1 if insufficient storage is
1619    available.  */
1620
1621 static Idx
1622 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1623 {
1624   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1625   if (BE (dup_idx != -1, 1))
1626     {
1627       dfa->nodes[dup_idx].constraint = constraint;
1628       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1629       dfa->nodes[dup_idx].duplicated = 1;
1630
1631       /* Store the index of the original node.  */
1632       dfa->org_indices[dup_idx] = org_idx;
1633     }
1634   return dup_idx;
1635 }
1636
1637 static reg_errcode_t
1638 calc_inveclosure (re_dfa_t *dfa)
1639 {
1640   Idx src, idx;
1641   bool ok;
1642   for (idx = 0; idx < dfa->nodes_len; ++idx)
1643     re_node_set_init_empty (dfa->inveclosures + idx);
1644
1645   for (src = 0; src < dfa->nodes_len; ++src)
1646     {
1647       Idx *elems = dfa->eclosures[src].elems;
1648       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1649         {
1650           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1651           if (BE (! ok, 0))
1652             return REG_ESPACE;
1653         }
1654     }
1655
1656   return REG_NOERROR;
1657 }
1658
1659 /* Calculate "eclosure" for all the node in DFA.  */
1660
1661 static reg_errcode_t
1662 calc_eclosure (re_dfa_t *dfa)
1663 {
1664   Idx node_idx;
1665   bool incomplete;
1666 #ifdef DEBUG
1667   assert (dfa->nodes_len > 0);
1668 #endif
1669   incomplete = false;
1670   /* For each nodes, calculate epsilon closure.  */
1671   for (node_idx = 0; ; ++node_idx)
1672     {
1673       reg_errcode_t err;
1674       re_node_set eclosure_elem;
1675       if (node_idx == dfa->nodes_len)
1676         {
1677           if (!incomplete)
1678             break;
1679           incomplete = false;
1680           node_idx = 0;
1681         }
1682
1683 #ifdef DEBUG
1684       assert (dfa->eclosures[node_idx].nelem != -1);
1685 #endif
1686
1687       /* If we have already calculated, skip it.  */
1688       if (dfa->eclosures[node_idx].nelem != 0)
1689         continue;
1690       /* Calculate epsilon closure of 'node_idx'.  */
1691       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1692       if (BE (err != REG_NOERROR, 0))
1693         return err;
1694
1695       if (dfa->eclosures[node_idx].nelem == 0)
1696         {
1697           incomplete = true;
1698           re_node_set_free (&eclosure_elem);
1699         }
1700     }
1701   return REG_NOERROR;
1702 }
1703
1704 /* Calculate epsilon closure of NODE.  */
1705
1706 static reg_errcode_t
1707 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1708 {
1709   reg_errcode_t err;
1710   Idx i;
1711   re_node_set eclosure;
1712   bool ok;
1713   bool incomplete = false;
1714   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1715   if (BE (err != REG_NOERROR, 0))
1716     return err;
1717
1718   /* This indicates that we are calculating this node now.
1719      We reference this value to avoid infinite loop.  */
1720   dfa->eclosures[node].nelem = -1;
1721
1722   /* If the current node has constraints, duplicate all nodes
1723      since they must inherit the constraints.  */
1724   if (dfa->nodes[node].constraint
1725       && dfa->edests[node].nelem
1726       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1727     {
1728       err = duplicate_node_closure (dfa, node, node, node,
1729                                     dfa->nodes[node].constraint);
1730       if (BE (err != REG_NOERROR, 0))
1731         return err;
1732     }
1733
1734   /* Expand each epsilon destination nodes.  */
1735   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736     for (i = 0; i < dfa->edests[node].nelem; ++i)
1737       {
1738         re_node_set eclosure_elem;
1739         Idx edest = dfa->edests[node].elems[i];
1740         /* If calculating the epsilon closure of 'edest' is in progress,
1741            return intermediate result.  */
1742         if (dfa->eclosures[edest].nelem == -1)
1743           {
1744             incomplete = true;
1745             continue;
1746           }
1747         /* If we haven't calculated the epsilon closure of 'edest' yet,
1748            calculate now. Otherwise use calculated epsilon closure.  */
1749         if (dfa->eclosures[edest].nelem == 0)
1750           {
1751             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752             if (BE (err != REG_NOERROR, 0))
1753               return err;
1754           }
1755         else
1756           eclosure_elem = dfa->eclosures[edest];
1757         /* Merge the epsilon closure of 'edest'.  */
1758         err = re_node_set_merge (&eclosure, &eclosure_elem);
1759         if (BE (err != REG_NOERROR, 0))
1760           return err;
1761         /* If the epsilon closure of 'edest' is incomplete,
1762            the epsilon closure of this node is also incomplete.  */
1763         if (dfa->eclosures[edest].nelem == 0)
1764           {
1765             incomplete = true;
1766             re_node_set_free (&eclosure_elem);
1767           }
1768       }
1769
1770   /* An epsilon closure includes itself.  */
1771   ok = re_node_set_insert (&eclosure, node);
1772   if (BE (! ok, 0))
1773     return REG_ESPACE;
1774   if (incomplete && !root)
1775     dfa->eclosures[node].nelem = 0;
1776   else
1777     dfa->eclosures[node] = eclosure;
1778   *new_set = eclosure;
1779   return REG_NOERROR;
1780 }
1781 \f
1782 /* Functions for token which are used in the parser.  */
1783
1784 /* Fetch a token from INPUT.
1785    We must not use this function inside bracket expressions.  */
1786
1787 static void
1788 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1789 {
1790   re_string_skip_bytes (input, peek_token (result, input, syntax));
1791 }
1792
1793 /* Peek a token from INPUT, and return the length of the token.
1794    We must not use this function inside bracket expressions.  */
1795
1796 static int
1797 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1798 {
1799   unsigned char c;
1800
1801   if (re_string_eoi (input))
1802     {
1803       token->type = END_OF_RE;
1804       return 0;
1805     }
1806
1807   c = re_string_peek_byte (input, 0);
1808   token->opr.c = c;
1809
1810   token->word_char = 0;
1811 #ifdef RE_ENABLE_I18N
1812   token->mb_partial = 0;
1813   if (input->mb_cur_max > 1 &&
1814       !re_string_first_byte (input, re_string_cur_idx (input)))
1815     {
1816       token->type = CHARACTER;
1817       token->mb_partial = 1;
1818       return 1;
1819     }
1820 #endif
1821   if (c == '\\')
1822     {
1823       unsigned char c2;
1824       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1825         {
1826           token->type = BACK_SLASH;
1827           return 1;
1828         }
1829
1830       c2 = re_string_peek_byte_case (input, 1);
1831       token->opr.c = c2;
1832       token->type = CHARACTER;
1833 #ifdef RE_ENABLE_I18N
1834       if (input->mb_cur_max > 1)
1835         {
1836           wint_t wc = re_string_wchar_at (input,
1837                                           re_string_cur_idx (input) + 1);
1838           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1839         }
1840       else
1841 #endif
1842         token->word_char = IS_WORD_CHAR (c2) != 0;
1843
1844       switch (c2)
1845         {
1846         case '|':
1847           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1848             token->type = OP_ALT;
1849           break;
1850         case '1': case '2': case '3': case '4': case '5':
1851         case '6': case '7': case '8': case '9':
1852           if (!(syntax & RE_NO_BK_REFS))
1853             {
1854               token->type = OP_BACK_REF;
1855               token->opr.idx = c2 - '1';
1856             }
1857           break;
1858         case '<':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             {
1861               token->type = ANCHOR;
1862               token->opr.ctx_type = WORD_FIRST;
1863             }
1864           break;
1865         case '>':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             {
1868               token->type = ANCHOR;
1869               token->opr.ctx_type = WORD_LAST;
1870             }
1871           break;
1872         case 'b':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             {
1875               token->type = ANCHOR;
1876               token->opr.ctx_type = WORD_DELIM;
1877             }
1878           break;
1879         case 'B':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = NOT_WORD_DELIM;
1884             }
1885           break;
1886         case 'w':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             token->type = OP_WORD;
1889           break;
1890         case 'W':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             token->type = OP_NOTWORD;
1893           break;
1894         case 's':
1895           if (!(syntax & RE_NO_GNU_OPS))
1896             token->type = OP_SPACE;
1897           break;
1898         case 'S':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             token->type = OP_NOTSPACE;
1901           break;
1902         case '`':
1903           if (!(syntax & RE_NO_GNU_OPS))
1904             {
1905               token->type = ANCHOR;
1906               token->opr.ctx_type = BUF_FIRST;
1907             }
1908           break;
1909         case '\'':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             {
1912               token->type = ANCHOR;
1913               token->opr.ctx_type = BUF_LAST;
1914             }
1915           break;
1916         case '(':
1917           if (!(syntax & RE_NO_BK_PARENS))
1918             token->type = OP_OPEN_SUBEXP;
1919           break;
1920         case ')':
1921           if (!(syntax & RE_NO_BK_PARENS))
1922             token->type = OP_CLOSE_SUBEXP;
1923           break;
1924         case '+':
1925           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1926             token->type = OP_DUP_PLUS;
1927           break;
1928         case '?':
1929           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1930             token->type = OP_DUP_QUESTION;
1931           break;
1932         case '{':
1933           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1934             token->type = OP_OPEN_DUP_NUM;
1935           break;
1936         case '}':
1937           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1938             token->type = OP_CLOSE_DUP_NUM;
1939           break;
1940         default:
1941           break;
1942         }
1943       return 2;
1944     }
1945
1946   token->type = CHARACTER;
1947 #ifdef RE_ENABLE_I18N
1948   if (input->mb_cur_max > 1)
1949     {
1950       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1951       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1952     }
1953   else
1954 #endif
1955     token->word_char = IS_WORD_CHAR (token->opr.c);
1956
1957   switch (c)
1958     {
1959     case '\n':
1960       if (syntax & RE_NEWLINE_ALT)
1961         token->type = OP_ALT;
1962       break;
1963     case '|':
1964       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1965         token->type = OP_ALT;
1966       break;
1967     case '*':
1968       token->type = OP_DUP_ASTERISK;
1969       break;
1970     case '+':
1971       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1972         token->type = OP_DUP_PLUS;
1973       break;
1974     case '?':
1975       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1976         token->type = OP_DUP_QUESTION;
1977       break;
1978     case '{':
1979       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1980         token->type = OP_OPEN_DUP_NUM;
1981       break;
1982     case '}':
1983       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1984         token->type = OP_CLOSE_DUP_NUM;
1985       break;
1986     case '(':
1987       if (syntax & RE_NO_BK_PARENS)
1988         token->type = OP_OPEN_SUBEXP;
1989       break;
1990     case ')':
1991       if (syntax & RE_NO_BK_PARENS)
1992         token->type = OP_CLOSE_SUBEXP;
1993       break;
1994     case '[':
1995       token->type = OP_OPEN_BRACKET;
1996       break;
1997     case '.':
1998       token->type = OP_PERIOD;
1999       break;
2000     case '^':
2001       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2002           re_string_cur_idx (input) != 0)
2003         {
2004           char prev = re_string_peek_byte (input, -1);
2005           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2006             break;
2007         }
2008       token->type = ANCHOR;
2009       token->opr.ctx_type = LINE_FIRST;
2010       break;
2011     case '$':
2012       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2013           re_string_cur_idx (input) + 1 != re_string_length (input))
2014         {
2015           re_token_t next;
2016           re_string_skip_bytes (input, 1);
2017           peek_token (&next, input, syntax);
2018           re_string_skip_bytes (input, -1);
2019           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2020             break;
2021         }
2022       token->type = ANCHOR;
2023       token->opr.ctx_type = LINE_LAST;
2024       break;
2025     default:
2026       break;
2027     }
2028   return 1;
2029 }
2030
2031 /* Peek a token from INPUT, and return the length of the token.
2032    We must not use this function out of bracket expressions.  */
2033
2034 static int
2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036 {
2037   unsigned char c;
2038   if (re_string_eoi (input))
2039     {
2040       token->type = END_OF_RE;
2041       return 0;
2042     }
2043   c = re_string_peek_byte (input, 0);
2044   token->opr.c = c;
2045
2046 #ifdef RE_ENABLE_I18N
2047   if (input->mb_cur_max > 1 &&
2048       !re_string_first_byte (input, re_string_cur_idx (input)))
2049     {
2050       token->type = CHARACTER;
2051       return 1;
2052     }
2053 #endif /* RE_ENABLE_I18N */
2054
2055   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056       && re_string_cur_idx (input) + 1 < re_string_length (input))
2057     {
2058       /* In this case, '\' escape a character.  */
2059       unsigned char c2;
2060       re_string_skip_bytes (input, 1);
2061       c2 = re_string_peek_byte (input, 0);
2062       token->opr.c = c2;
2063       token->type = CHARACTER;
2064       return 1;
2065     }
2066   if (c == '[') /* '[' is a special char in a bracket exps.  */
2067     {
2068       unsigned char c2;
2069       int token_len;
2070       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071         c2 = re_string_peek_byte (input, 1);
2072       else
2073         c2 = 0;
2074       token->opr.c = c2;
2075       token_len = 2;
2076       switch (c2)
2077         {
2078         case '.':
2079           token->type = OP_OPEN_COLL_ELEM;
2080           break;
2081
2082         case '=':
2083           token->type = OP_OPEN_EQUIV_CLASS;
2084           break;
2085
2086         case ':':
2087           if (syntax & RE_CHAR_CLASSES)
2088             {
2089               token->type = OP_OPEN_CHAR_CLASS;
2090               break;
2091             }
2092           FALLTHROUGH;
2093         default:
2094           token->type = CHARACTER;
2095           token->opr.c = c;
2096           token_len = 1;
2097           break;
2098         }
2099       return token_len;
2100     }
2101   switch (c)
2102     {
2103     case '-':
2104       token->type = OP_CHARSET_RANGE;
2105       break;
2106     case ']':
2107       token->type = OP_CLOSE_BRACKET;
2108       break;
2109     case '^':
2110       token->type = OP_NON_MATCH_LIST;
2111       break;
2112     default:
2113       token->type = CHARACTER;
2114     }
2115   return 1;
2116 }
2117 \f
2118 /* Functions for parser.  */
2119
2120 /* Entry point of the parser.
2121    Parse the regular expression REGEXP and return the structure tree.
2122    If an error occurs, ERR is set by error code, and return NULL.
2123    This function build the following tree, from regular expression <reg_exp>:
2124            CAT
2125            / \
2126           /   \
2127    <reg_exp>  EOR
2128
2129    CAT means concatenation.
2130    EOR means end of regular expression.  */
2131
2132 static bin_tree_t *
2133 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134        reg_errcode_t *err)
2135 {
2136   re_dfa_t *dfa = preg->buffer;
2137   bin_tree_t *tree, *eor, *root;
2138   re_token_t current_token;
2139   dfa->syntax = syntax;
2140   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2142   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143     return NULL;
2144   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2145   if (tree != NULL)
2146     root = create_tree (dfa, tree, eor, CONCAT);
2147   else
2148     root = eor;
2149   if (BE (eor == NULL || root == NULL, 0))
2150     {
2151       *err = REG_ESPACE;
2152       return NULL;
2153     }
2154   return root;
2155 }
2156
2157 /* This function build the following tree, from regular expression
2158    <branch1>|<branch2>:
2159            ALT
2160            / \
2161           /   \
2162    <branch1> <branch2>
2163
2164    ALT means alternative, which represents the operator '|'.  */
2165
2166 static bin_tree_t *
2167 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2168                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2169 {
2170   re_dfa_t *dfa = preg->buffer;
2171   bin_tree_t *tree, *branch = NULL;
2172   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2173   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2174   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175     return NULL;
2176
2177   while (token->type == OP_ALT)
2178     {
2179       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2180       if (token->type != OP_ALT && token->type != END_OF_RE
2181           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2182         {
2183           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184           dfa->completed_bkref_map = initial_bkref_map;
2185           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2187             {
2188               if (tree != NULL)
2189                 postorder (tree, free_tree, NULL);
2190               return NULL;
2191             }
2192           dfa->completed_bkref_map |= accumulated_bkref_map;
2193         }
2194       else
2195         branch = NULL;
2196       tree = create_tree (dfa, tree, branch, OP_ALT);
2197       if (BE (tree == NULL, 0))
2198         {
2199           *err = REG_ESPACE;
2200           return NULL;
2201         }
2202     }
2203   return tree;
2204 }
2205
2206 /* This function build the following tree, from regular expression
2207    <exp1><exp2>:
2208         CAT
2209         / \
2210        /   \
2211    <exp1> <exp2>
2212
2213    CAT means concatenation.  */
2214
2215 static bin_tree_t *
2216 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 {
2219   bin_tree_t *tree, *expr;
2220   re_dfa_t *dfa = preg->buffer;
2221   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2222   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223     return NULL;
2224
2225   while (token->type != OP_ALT && token->type != END_OF_RE
2226          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2227     {
2228       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230         {
2231           if (tree != NULL)
2232             postorder (tree, free_tree, NULL);
2233           return NULL;
2234         }
2235       if (tree != NULL && expr != NULL)
2236         {
2237           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2238           if (newtree == NULL)
2239             {
2240               postorder (expr, free_tree, NULL);
2241               postorder (tree, free_tree, NULL);
2242               *err = REG_ESPACE;
2243               return NULL;
2244             }
2245           tree = newtree;
2246         }
2247       else if (tree == NULL)
2248         tree = expr;
2249       /* Otherwise expr == NULL, we don't need to create new tree.  */
2250     }
2251   return tree;
2252 }
2253
2254 /* This function build the following tree, from regular expression a*:
2255          *
2256          |
2257          a
2258 */
2259
2260 static bin_tree_t *
2261 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2262                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2263 {
2264   re_dfa_t *dfa = preg->buffer;
2265   bin_tree_t *tree;
2266   switch (token->type)
2267     {
2268     case CHARACTER:
2269       tree = create_token_tree (dfa, NULL, NULL, token);
2270       if (BE (tree == NULL, 0))
2271         {
2272           *err = REG_ESPACE;
2273           return NULL;
2274         }
2275 #ifdef RE_ENABLE_I18N
2276       if (dfa->mb_cur_max > 1)
2277         {
2278           while (!re_string_eoi (regexp)
2279                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280             {
2281               bin_tree_t *mbc_remain;
2282               fetch_token (token, regexp, syntax);
2283               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2285               if (BE (mbc_remain == NULL || tree == NULL, 0))
2286                 {
2287                   *err = REG_ESPACE;
2288                   return NULL;
2289                 }
2290             }
2291         }
2292 #endif
2293       break;
2294
2295     case OP_OPEN_SUBEXP:
2296       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2297       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298         return NULL;
2299       break;
2300
2301     case OP_OPEN_BRACKET:
2302       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304         return NULL;
2305       break;
2306
2307     case OP_BACK_REF:
2308       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2309         {
2310           *err = REG_ESUBREG;
2311           return NULL;
2312         }
2313       dfa->used_bkref_map |= 1 << token->opr.idx;
2314       tree = create_token_tree (dfa, NULL, NULL, token);
2315       if (BE (tree == NULL, 0))
2316         {
2317           *err = REG_ESPACE;
2318           return NULL;
2319         }
2320       ++dfa->nbackref;
2321       dfa->has_mb_node = 1;
2322       break;
2323
2324     case OP_OPEN_DUP_NUM:
2325       if (syntax & RE_CONTEXT_INVALID_DUP)
2326         {
2327           *err = REG_BADRPT;
2328           return NULL;
2329         }
2330       FALLTHROUGH;
2331     case OP_DUP_ASTERISK:
2332     case OP_DUP_PLUS:
2333     case OP_DUP_QUESTION:
2334       if (syntax & RE_CONTEXT_INVALID_OPS)
2335         {
2336           *err = REG_BADRPT;
2337           return NULL;
2338         }
2339       else if (syntax & RE_CONTEXT_INDEP_OPS)
2340         {
2341           fetch_token (token, regexp, syntax);
2342           return parse_expression (regexp, preg, token, syntax, nest, err);
2343         }
2344       FALLTHROUGH;
2345     case OP_CLOSE_SUBEXP:
2346       if ((token->type == OP_CLOSE_SUBEXP) &&
2347           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348         {
2349           *err = REG_ERPAREN;
2350           return NULL;
2351         }
2352       FALLTHROUGH;
2353     case OP_CLOSE_DUP_NUM:
2354       /* We treat it as a normal character.  */
2355
2356       /* Then we can these characters as normal characters.  */
2357       token->type = CHARACTER;
2358       /* mb_partial and word_char bits should be initialized already
2359          by peek_token.  */
2360       tree = create_token_tree (dfa, NULL, NULL, token);
2361       if (BE (tree == NULL, 0))
2362         {
2363           *err = REG_ESPACE;
2364           return NULL;
2365         }
2366       break;
2367
2368     case ANCHOR:
2369       if ((token->opr.ctx_type
2370            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371           && dfa->word_ops_used == 0)
2372         init_word_char (dfa);
2373       if (token->opr.ctx_type == WORD_DELIM
2374           || token->opr.ctx_type == NOT_WORD_DELIM)
2375         {
2376           bin_tree_t *tree_first, *tree_last;
2377           if (token->opr.ctx_type == WORD_DELIM)
2378             {
2379               token->opr.ctx_type = WORD_FIRST;
2380               tree_first = create_token_tree (dfa, NULL, NULL, token);
2381               token->opr.ctx_type = WORD_LAST;
2382             }
2383           else
2384             {
2385               token->opr.ctx_type = INSIDE_WORD;
2386               tree_first = create_token_tree (dfa, NULL, NULL, token);
2387               token->opr.ctx_type = INSIDE_NOTWORD;
2388             }
2389           tree_last = create_token_tree (dfa, NULL, NULL, token);
2390           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392             {
2393               *err = REG_ESPACE;
2394               return NULL;
2395             }
2396         }
2397       else
2398         {
2399           tree = create_token_tree (dfa, NULL, NULL, token);
2400           if (BE (tree == NULL, 0))
2401             {
2402               *err = REG_ESPACE;
2403               return NULL;
2404             }
2405         }
2406       /* We must return here, since ANCHORs can't be followed
2407          by repetition operators.
2408          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2410       fetch_token (token, regexp, syntax);
2411       return tree;
2412
2413     case OP_PERIOD:
2414       tree = create_token_tree (dfa, NULL, NULL, token);
2415       if (BE (tree == NULL, 0))
2416         {
2417           *err = REG_ESPACE;
2418           return NULL;
2419         }
2420       if (dfa->mb_cur_max > 1)
2421         dfa->has_mb_node = 1;
2422       break;
2423
2424     case OP_WORD:
2425     case OP_NOTWORD:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427                                  "alnum",
2428                                  "_",
2429                                  token->type == OP_NOTWORD, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431         return NULL;
2432       break;
2433
2434     case OP_SPACE:
2435     case OP_NOTSPACE:
2436       tree = build_charclass_op (dfa, regexp->trans,
2437                                  "space",
2438                                  "",
2439                                  token->type == OP_NOTSPACE, err);
2440       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441         return NULL;
2442       break;
2443
2444     case OP_ALT:
2445     case END_OF_RE:
2446       return NULL;
2447
2448     case BACK_SLASH:
2449       *err = REG_EESCAPE;
2450       return NULL;
2451
2452     default:
2453       /* Must not happen?  */
2454 #ifdef DEBUG
2455       assert (0);
2456 #endif
2457       return NULL;
2458     }
2459   fetch_token (token, regexp, syntax);
2460
2461   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2462          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2463     {
2464       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465                                            syntax, err);
2466       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2467         {
2468           if (tree != NULL)
2469             postorder (tree, free_tree, NULL);
2470           return NULL;
2471         }
2472       tree = dup_tree;
2473       /* In BRE consecutive duplications are not allowed.  */
2474       if ((syntax & RE_CONTEXT_INVALID_DUP)
2475           && (token->type == OP_DUP_ASTERISK
2476               || token->type == OP_OPEN_DUP_NUM))
2477         {
2478           if (tree != NULL)
2479             postorder (tree, free_tree, NULL);
2480           *err = REG_BADRPT;
2481           return NULL;
2482         }
2483     }
2484
2485   return tree;
2486 }
2487
2488 /* This function build the following tree, from regular expression
2489    (<reg_exp>):
2490          SUBEXP
2491             |
2492         <reg_exp>
2493 */
2494
2495 static bin_tree_t *
2496 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2497                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2498 {
2499   re_dfa_t *dfa = preg->buffer;
2500   bin_tree_t *tree;
2501   size_t cur_nsub;
2502   cur_nsub = preg->re_nsub++;
2503
2504   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2505
2506   /* The subexpression may be a null string.  */
2507   if (token->type == OP_CLOSE_SUBEXP)
2508     tree = NULL;
2509   else
2510     {
2511       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2512       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2513         {
2514           if (tree != NULL)
2515             postorder (tree, free_tree, NULL);
2516           *err = REG_EPAREN;
2517         }
2518       if (BE (*err != REG_NOERROR, 0))
2519         return NULL;
2520     }
2521
2522   if (cur_nsub <= '9' - '1')
2523     dfa->completed_bkref_map |= 1 << cur_nsub;
2524
2525   tree = create_tree (dfa, tree, NULL, SUBEXP);
2526   if (BE (tree == NULL, 0))
2527     {
2528       *err = REG_ESPACE;
2529       return NULL;
2530     }
2531   tree->token.opr.idx = cur_nsub;
2532   return tree;
2533 }
2534
2535 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2536
2537 static bin_tree_t *
2538 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2540 {
2541   bin_tree_t *tree = NULL, *old_tree = NULL;
2542   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2543   re_token_t start_token = *token;
2544
2545   if (token->type == OP_OPEN_DUP_NUM)
2546     {
2547       end = 0;
2548       start = fetch_number (regexp, token, syntax);
2549       if (start == -1)
2550         {
2551           if (token->type == CHARACTER && token->opr.c == ',')
2552             start = 0; /* We treat "{,m}" as "{0,m}".  */
2553           else
2554             {
2555               *err = REG_BADBR; /* <re>{} is invalid.  */
2556               return NULL;
2557             }
2558         }
2559       if (BE (start != -2, 1))
2560         {
2561           /* We treat "{n}" as "{n,n}".  */
2562           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563                  : ((token->type == CHARACTER && token->opr.c == ',')
2564                     ? fetch_number (regexp, token, syntax) : -2));
2565         }
2566       if (BE (start == -2 || end == -2, 0))
2567         {
2568           /* Invalid sequence.  */
2569           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2570             {
2571               if (token->type == END_OF_RE)
2572                 *err = REG_EBRACE;
2573               else
2574                 *err = REG_BADBR;
2575
2576               return NULL;
2577             }
2578
2579           /* If the syntax bit is set, rollback.  */
2580           re_string_set_index (regexp, start_idx);
2581           *token = start_token;
2582           token->type = CHARACTER;
2583           /* mb_partial and word_char bits should be already initialized by
2584              peek_token.  */
2585           return elem;
2586         }
2587
2588       if (BE ((end != -1 && start > end)
2589               || token->type != OP_CLOSE_DUP_NUM, 0))
2590         {
2591           /* First number greater than second.  */
2592           *err = REG_BADBR;
2593           return NULL;
2594         }
2595
2596       if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2597         {
2598           *err = REG_ESIZE;
2599           return NULL;
2600         }
2601     }
2602   else
2603     {
2604       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606     }
2607
2608   fetch_token (token, regexp, syntax);
2609
2610   if (BE (elem == NULL, 0))
2611     return NULL;
2612   if (BE (start == 0 && end == 0, 0))
2613     {
2614       postorder (elem, free_tree, NULL);
2615       return NULL;
2616     }
2617
2618   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2619   if (BE (start > 0, 0))
2620     {
2621       tree = elem;
2622       for (i = 2; i <= start; ++i)
2623         {
2624           elem = duplicate_tree (elem, dfa);
2625           tree = create_tree (dfa, tree, elem, CONCAT);
2626           if (BE (elem == NULL || tree == NULL, 0))
2627             goto parse_dup_op_espace;
2628         }
2629
2630       if (start == end)
2631         return tree;
2632
2633       /* Duplicate ELEM before it is marked optional.  */
2634       elem = duplicate_tree (elem, dfa);
2635       if (BE (elem == NULL, 0))
2636         goto parse_dup_op_espace;
2637       old_tree = tree;
2638     }
2639   else
2640     old_tree = NULL;
2641
2642   if (elem->token.type == SUBEXP)
2643     {
2644       uintptr_t subidx = elem->token.opr.idx;
2645       postorder (elem, mark_opt_subexp, (void *) subidx);
2646     }
2647
2648   tree = create_tree (dfa, elem, NULL,
2649                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2650   if (BE (tree == NULL, 0))
2651     goto parse_dup_op_espace;
2652
2653   /* This loop is actually executed only when end != -1,
2654      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2655      already created the start+1-th copy.  */
2656   if (TYPE_SIGNED (Idx) || end != -1)
2657     for (i = start + 2; i <= end; ++i)
2658       {
2659         elem = duplicate_tree (elem, dfa);
2660         tree = create_tree (dfa, tree, elem, CONCAT);
2661         if (BE (elem == NULL || tree == NULL, 0))
2662           goto parse_dup_op_espace;
2663
2664         tree = create_tree (dfa, tree, NULL, OP_ALT);
2665         if (BE (tree == NULL, 0))
2666           goto parse_dup_op_espace;
2667       }
2668
2669   if (old_tree)
2670     tree = create_tree (dfa, old_tree, tree, CONCAT);
2671
2672   return tree;
2673
2674  parse_dup_op_espace:
2675   *err = REG_ESPACE;
2676   return NULL;
2677 }
2678
2679 /* Size of the names for collating symbol/equivalence_class/character_class.
2680    I'm not sure, but maybe enough.  */
2681 #define BRACKET_NAME_BUF_SIZE 32
2682
2683 #ifndef _LIBC
2684
2685 # ifdef RE_ENABLE_I18N
2686 /* Convert the byte B to the corresponding wide character.  In a
2687    unibyte locale, treat B as itself.  In a multibyte locale, return
2688    WEOF if B is an encoding error.  */
2689 static wint_t
2690 parse_byte (unsigned char b, re_charset_t *mbcset)
2691 {
2692   return mbcset == NULL ? b : __btowc (b);
2693 }
2694 # endif
2695
2696   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2697      Build the range expression which starts from START_ELEM, and ends
2698      at END_ELEM.  The result are written to MBCSET and SBCSET.
2699      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2700      mbcset->range_ends, is a pointer argument since we may
2701      update it.  */
2702
2703 static reg_errcode_t
2704 # ifdef RE_ENABLE_I18N
2705 build_range_exp (const reg_syntax_t syntax,
2706                  bitset_t sbcset,
2707                  re_charset_t *mbcset,
2708                  Idx *range_alloc,
2709                  const bracket_elem_t *start_elem,
2710                  const bracket_elem_t *end_elem)
2711 # else /* not RE_ENABLE_I18N */
2712 build_range_exp (const reg_syntax_t syntax,
2713                  bitset_t sbcset,
2714                  const bracket_elem_t *start_elem,
2715                  const bracket_elem_t *end_elem)
2716 # endif /* not RE_ENABLE_I18N */
2717 {
2718   unsigned int start_ch, end_ch;
2719   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2720   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2721           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2722           0))
2723     return REG_ERANGE;
2724
2725   /* We can handle no multi character collating elements without libc
2726      support.  */
2727   if (BE ((start_elem->type == COLL_SYM
2728            && strlen ((char *) start_elem->opr.name) > 1)
2729           || (end_elem->type == COLL_SYM
2730               && strlen ((char *) end_elem->opr.name) > 1), 0))
2731     return REG_ECOLLATE;
2732
2733 # ifdef RE_ENABLE_I18N
2734   {
2735     wchar_t wc;
2736     wint_t start_wc;
2737     wint_t end_wc;
2738
2739     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2740                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741                    : 0));
2742     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2743               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744                  : 0));
2745     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2746                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2747     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2748               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2749     if (start_wc == WEOF || end_wc == WEOF)
2750       return REG_ECOLLATE;
2751     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2752       return REG_ERANGE;
2753
2754     /* Got valid collation sequence values, add them as a new entry.
2755        However, for !_LIBC we have no collation elements: if the
2756        character set is single byte, the single byte character set
2757        that we build below suffices.  parse_bracket_exp passes
2758        no MBCSET if dfa->mb_cur_max == 1.  */
2759     if (mbcset)
2760       {
2761         /* Check the space of the arrays.  */
2762         if (BE (*range_alloc == mbcset->nranges, 0))
2763           {
2764             /* There is not enough space, need realloc.  */
2765             wchar_t *new_array_start, *new_array_end;
2766             Idx new_nranges;
2767
2768             /* +1 in case of mbcset->nranges is 0.  */
2769             new_nranges = 2 * mbcset->nranges + 1;
2770             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2771                are NULL if *range_alloc == 0.  */
2772             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2773                                           new_nranges);
2774             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2775                                         new_nranges);
2776
2777             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2778               {
2779                 re_free (new_array_start);
2780                 re_free (new_array_end);
2781                 return REG_ESPACE;
2782               }
2783
2784             mbcset->range_starts = new_array_start;
2785             mbcset->range_ends = new_array_end;
2786             *range_alloc = new_nranges;
2787           }
2788
2789         mbcset->range_starts[mbcset->nranges] = start_wc;
2790         mbcset->range_ends[mbcset->nranges++] = end_wc;
2791       }
2792
2793     /* Build the table for single byte characters.  */
2794     for (wc = 0; wc < SBC_MAX; ++wc)
2795       {
2796         if (start_wc <= wc && wc <= end_wc)
2797           bitset_set (sbcset, wc);
2798       }
2799   }
2800 # else /* not RE_ENABLE_I18N */
2801   {
2802     unsigned int ch;
2803     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2804                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2805                    : 0));
2806     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2807               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2808                  : 0));
2809     if (start_ch > end_ch)
2810       return REG_ERANGE;
2811     /* Build the table for single byte characters.  */
2812     for (ch = 0; ch < SBC_MAX; ++ch)
2813       if (start_ch <= ch  && ch <= end_ch)
2814         bitset_set (sbcset, ch);
2815   }
2816 # endif /* not RE_ENABLE_I18N */
2817   return REG_NOERROR;
2818 }
2819 #endif /* not _LIBC */
2820
2821 #ifndef _LIBC
2822 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2823    Build the collating element which is represented by NAME.
2824    The result are written to MBCSET and SBCSET.
2825    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2826    pointer argument since we may update it.  */
2827
2828 static reg_errcode_t
2829 # ifdef RE_ENABLE_I18N
2830 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2831                         Idx *coll_sym_alloc, const unsigned char *name)
2832 # else /* not RE_ENABLE_I18N */
2833 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2834 # endif /* not RE_ENABLE_I18N */
2835 {
2836   size_t name_len = strlen ((const char *) name);
2837   if (BE (name_len != 1, 0))
2838     return REG_ECOLLATE;
2839   else
2840     {
2841       bitset_set (sbcset, name[0]);
2842       return REG_NOERROR;
2843     }
2844 }
2845 #endif /* not _LIBC */
2846
2847 /* This function parse bracket expression like "[abc]", "[a-c]",
2848    "[[.a-a.]]" etc.  */
2849
2850 static bin_tree_t *
2851 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2852                    reg_syntax_t syntax, reg_errcode_t *err)
2853 {
2854 #ifdef _LIBC
2855   const unsigned char *collseqmb;
2856   const char *collseqwc;
2857   uint32_t nrules;
2858   int32_t table_size;
2859   const int32_t *symb_table;
2860   const unsigned char *extra;
2861
2862   /* Local function for parse_bracket_exp used in _LIBC environment.
2863      Seek the collating symbol entry corresponding to NAME.
2864      Return the index of the symbol in the SYMB_TABLE,
2865      or -1 if not found.  */
2866
2867   auto inline int32_t
2868   __attribute__ ((always_inline))
2869   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2870     {
2871       int32_t elem;
2872
2873       for (elem = 0; elem < table_size; elem++)
2874         if (symb_table[2 * elem] != 0)
2875           {
2876             int32_t idx = symb_table[2 * elem + 1];
2877             /* Skip the name of collating element name.  */
2878             idx += 1 + extra[idx];
2879             if (/* Compare the length of the name.  */
2880                 name_len == extra[idx]
2881                 /* Compare the name.  */
2882                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2883               /* Yep, this is the entry.  */
2884               return elem;
2885           }
2886       return -1;
2887     }
2888
2889   /* Local function for parse_bracket_exp used in _LIBC environment.
2890      Look up the collation sequence value of BR_ELEM.
2891      Return the value if succeeded, UINT_MAX otherwise.  */
2892
2893   auto inline unsigned int
2894   __attribute__ ((always_inline))
2895   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2896     {
2897       if (br_elem->type == SB_CHAR)
2898         {
2899           /*
2900           if (MB_CUR_MAX == 1)
2901           */
2902           if (nrules == 0)
2903             return collseqmb[br_elem->opr.ch];
2904           else
2905             {
2906               wint_t wc = __btowc (br_elem->opr.ch);
2907               return __collseq_table_lookup (collseqwc, wc);
2908             }
2909         }
2910       else if (br_elem->type == MB_CHAR)
2911         {
2912           if (nrules != 0)
2913             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2914         }
2915       else if (br_elem->type == COLL_SYM)
2916         {
2917           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2918           if (nrules != 0)
2919             {
2920               int32_t elem, idx;
2921               elem = seek_collating_symbol_entry (br_elem->opr.name,
2922                                                   sym_name_len);
2923               if (elem != -1)
2924                 {
2925                   /* We found the entry.  */
2926                   idx = symb_table[2 * elem + 1];
2927                   /* Skip the name of collating element name.  */
2928                   idx += 1 + extra[idx];
2929                   /* Skip the byte sequence of the collating element.  */
2930                   idx += 1 + extra[idx];
2931                   /* Adjust for the alignment.  */
2932                   idx = (idx + 3) & ~3;
2933                   /* Skip the multibyte collation sequence value.  */
2934                   idx += sizeof (unsigned int);
2935                   /* Skip the wide char sequence of the collating element.  */
2936                   idx += sizeof (unsigned int) *
2937                     (1 + *(unsigned int *) (extra + idx));
2938                   /* Return the collation sequence value.  */
2939                   return *(unsigned int *) (extra + idx);
2940                 }
2941               else if (sym_name_len == 1)
2942                 {
2943                   /* No valid character.  Match it as a single byte
2944                      character.  */
2945                   return collseqmb[br_elem->opr.name[0]];
2946                 }
2947             }
2948           else if (sym_name_len == 1)
2949             return collseqmb[br_elem->opr.name[0]];
2950         }
2951       return UINT_MAX;
2952     }
2953
2954   /* Local function for parse_bracket_exp used in _LIBC environment.
2955      Build the range expression which starts from START_ELEM, and ends
2956      at END_ELEM.  The result are written to MBCSET and SBCSET.
2957      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2958      mbcset->range_ends, is a pointer argument since we may
2959      update it.  */
2960
2961   auto inline reg_errcode_t
2962   __attribute__ ((always_inline))
2963   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2964                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2965     {
2966       unsigned int ch;
2967       uint32_t start_collseq;
2968       uint32_t end_collseq;
2969
2970       /* Equivalence Classes and Character Classes can't be a range
2971          start/end.  */
2972       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2973               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2974               0))
2975         return REG_ERANGE;
2976
2977       /* FIXME: Implement rational ranges here, too.  */
2978       start_collseq = lookup_collation_sequence_value (start_elem);
2979       end_collseq = lookup_collation_sequence_value (end_elem);
2980       /* Check start/end collation sequence values.  */
2981       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2982         return REG_ECOLLATE;
2983       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2984         return REG_ERANGE;
2985
2986       /* Got valid collation sequence values, add them as a new entry.
2987          However, if we have no collation elements, and the character set
2988          is single byte, the single byte character set that we
2989          build below suffices. */
2990       if (nrules > 0 || dfa->mb_cur_max > 1)
2991         {
2992           /* Check the space of the arrays.  */
2993           if (BE (*range_alloc == mbcset->nranges, 0))
2994             {
2995               /* There is not enough space, need realloc.  */
2996               uint32_t *new_array_start;
2997               uint32_t *new_array_end;
2998               Idx new_nranges;
2999
3000               /* +1 in case of mbcset->nranges is 0.  */
3001               new_nranges = 2 * mbcset->nranges + 1;
3002               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3003                                             new_nranges);
3004               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3005                                           new_nranges);
3006
3007               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3008                 return REG_ESPACE;
3009
3010               mbcset->range_starts = new_array_start;
3011               mbcset->range_ends = new_array_end;
3012               *range_alloc = new_nranges;
3013             }
3014
3015           mbcset->range_starts[mbcset->nranges] = start_collseq;
3016           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3017         }
3018
3019       /* Build the table for single byte characters.  */
3020       for (ch = 0; ch < SBC_MAX; ch++)
3021         {
3022           uint32_t ch_collseq;
3023           /*
3024           if (MB_CUR_MAX == 1)
3025           */
3026           if (nrules == 0)
3027             ch_collseq = collseqmb[ch];
3028           else
3029             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3030           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3031             bitset_set (sbcset, ch);
3032         }
3033       return REG_NOERROR;
3034     }
3035
3036   /* Local function for parse_bracket_exp used in _LIBC environment.
3037      Build the collating element which is represented by NAME.
3038      The result are written to MBCSET and SBCSET.
3039      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3040      pointer argument since we may update it.  */
3041
3042   auto inline reg_errcode_t
3043   __attribute__ ((always_inline))
3044   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3045                           Idx *coll_sym_alloc, const unsigned char *name)
3046     {
3047       int32_t elem, idx;
3048       size_t name_len = strlen ((const char *) name);
3049       if (nrules != 0)
3050         {
3051           elem = seek_collating_symbol_entry (name, name_len);
3052           if (elem != -1)
3053             {
3054               /* We found the entry.  */
3055               idx = symb_table[2 * elem + 1];
3056               /* Skip the name of collating element name.  */
3057               idx += 1 + extra[idx];
3058             }
3059           else if (name_len == 1)
3060             {
3061               /* No valid character, treat it as a normal
3062                  character.  */
3063               bitset_set (sbcset, name[0]);
3064               return REG_NOERROR;
3065             }
3066           else
3067             return REG_ECOLLATE;
3068
3069           /* Got valid collation sequence, add it as a new entry.  */
3070           /* Check the space of the arrays.  */
3071           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3072             {
3073               /* Not enough, realloc it.  */
3074               /* +1 in case of mbcset->ncoll_syms is 0.  */
3075               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3076               /* Use realloc since mbcset->coll_syms is NULL
3077                  if *alloc == 0.  */
3078               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3079                                                    new_coll_sym_alloc);
3080               if (BE (new_coll_syms == NULL, 0))
3081                 return REG_ESPACE;
3082               mbcset->coll_syms = new_coll_syms;
3083               *coll_sym_alloc = new_coll_sym_alloc;
3084             }
3085           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3086           return REG_NOERROR;
3087         }
3088       else
3089         {
3090           if (BE (name_len != 1, 0))
3091             return REG_ECOLLATE;
3092           else
3093             {
3094               bitset_set (sbcset, name[0]);
3095               return REG_NOERROR;
3096             }
3097         }
3098     }
3099 #endif
3100
3101   re_token_t br_token;
3102   re_bitset_ptr_t sbcset;
3103 #ifdef RE_ENABLE_I18N
3104   re_charset_t *mbcset;
3105   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3106   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3107 #endif /* not RE_ENABLE_I18N */
3108   bool non_match = false;
3109   bin_tree_t *work_tree;
3110   int token_len;
3111   bool first_round = true;
3112 #ifdef _LIBC
3113   collseqmb = (const unsigned char *)
3114     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3115   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3116   if (nrules)
3117     {
3118       /*
3119       if (MB_CUR_MAX > 1)
3120       */
3121       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3122       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3123       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3124                                                   _NL_COLLATE_SYMB_TABLEMB);
3125       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3126                                                    _NL_COLLATE_SYMB_EXTRAMB);
3127     }
3128 #endif
3129   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3130 #ifdef RE_ENABLE_I18N
3131   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3132 #endif /* RE_ENABLE_I18N */
3133 #ifdef RE_ENABLE_I18N
3134   if (BE (sbcset == NULL || mbcset == NULL, 0))
3135 #else
3136   if (BE (sbcset == NULL, 0))
3137 #endif /* RE_ENABLE_I18N */
3138     {
3139       re_free (sbcset);
3140 #ifdef RE_ENABLE_I18N
3141       re_free (mbcset);
3142 #endif
3143       *err = REG_ESPACE;
3144       return NULL;
3145     }
3146
3147   token_len = peek_token_bracket (token, regexp, syntax);
3148   if (BE (token->type == END_OF_RE, 0))
3149     {
3150       *err = REG_BADPAT;
3151       goto parse_bracket_exp_free_return;
3152     }
3153   if (token->type == OP_NON_MATCH_LIST)
3154     {
3155 #ifdef RE_ENABLE_I18N
3156       mbcset->non_match = 1;
3157 #endif /* not RE_ENABLE_I18N */
3158       non_match = true;
3159       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3160         bitset_set (sbcset, '\n');
3161       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3162       token_len = peek_token_bracket (token, regexp, syntax);
3163       if (BE (token->type == END_OF_RE, 0))
3164         {
3165           *err = REG_BADPAT;
3166           goto parse_bracket_exp_free_return;
3167         }
3168     }
3169
3170   /* We treat the first ']' as a normal character.  */
3171   if (token->type == OP_CLOSE_BRACKET)
3172     token->type = CHARACTER;
3173
3174   while (1)
3175     {
3176       bracket_elem_t start_elem, end_elem;
3177       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3178       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3179       reg_errcode_t ret;
3180       int token_len2 = 0;
3181       bool is_range_exp = false;
3182       re_token_t token2;
3183
3184       start_elem.opr.name = start_name_buf;
3185       start_elem.type = COLL_SYM;
3186       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3187                                    syntax, first_round);
3188       if (BE (ret != REG_NOERROR, 0))
3189         {
3190           *err = ret;
3191           goto parse_bracket_exp_free_return;
3192         }
3193       first_round = false;
3194
3195       /* Get information about the next token.  We need it in any case.  */
3196       token_len = peek_token_bracket (token, regexp, syntax);
3197
3198       /* Do not check for ranges if we know they are not allowed.  */
3199       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3200         {
3201           if (BE (token->type == END_OF_RE, 0))
3202             {
3203               *err = REG_EBRACK;
3204               goto parse_bracket_exp_free_return;
3205             }
3206           if (token->type == OP_CHARSET_RANGE)
3207             {
3208               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3209               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3210               if (BE (token2.type == END_OF_RE, 0))
3211                 {
3212                   *err = REG_EBRACK;
3213                   goto parse_bracket_exp_free_return;
3214                 }
3215               if (token2.type == OP_CLOSE_BRACKET)
3216                 {
3217                   /* We treat the last '-' as a normal character.  */
3218                   re_string_skip_bytes (regexp, -token_len);
3219                   token->type = CHARACTER;
3220                 }
3221               else
3222                 is_range_exp = true;
3223             }
3224         }
3225
3226       if (is_range_exp == true)
3227         {
3228           end_elem.opr.name = end_name_buf;
3229           end_elem.type = COLL_SYM;
3230           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3231                                        dfa, syntax, true);
3232           if (BE (ret != REG_NOERROR, 0))
3233             {
3234               *err = ret;
3235               goto parse_bracket_exp_free_return;
3236             }
3237
3238           token_len = peek_token_bracket (token, regexp, syntax);
3239
3240 #ifdef _LIBC
3241           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3242                                   &start_elem, &end_elem);
3243 #else
3244 # ifdef RE_ENABLE_I18N
3245           *err = build_range_exp (syntax, sbcset,
3246                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3247                                   &range_alloc, &start_elem, &end_elem);
3248 # else
3249           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3250 # endif
3251 #endif /* RE_ENABLE_I18N */
3252           if (BE (*err != REG_NOERROR, 0))
3253             goto parse_bracket_exp_free_return;
3254         }
3255       else
3256         {
3257           switch (start_elem.type)
3258             {
3259             case SB_CHAR:
3260               bitset_set (sbcset, start_elem.opr.ch);
3261               break;
3262 #ifdef RE_ENABLE_I18N
3263             case MB_CHAR:
3264               /* Check whether the array has enough space.  */
3265               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3266                 {
3267                   wchar_t *new_mbchars;
3268                   /* Not enough, realloc it.  */
3269                   /* +1 in case of mbcset->nmbchars is 0.  */
3270                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3271                   /* Use realloc since array is NULL if *alloc == 0.  */
3272                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3273                                             mbchar_alloc);
3274                   if (BE (new_mbchars == NULL, 0))
3275                     goto parse_bracket_exp_espace;
3276                   mbcset->mbchars = new_mbchars;
3277                 }
3278               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3279               break;
3280 #endif /* RE_ENABLE_I18N */
3281             case EQUIV_CLASS:
3282               *err = build_equiv_class (sbcset,
3283 #ifdef RE_ENABLE_I18N
3284                                         mbcset, &equiv_class_alloc,
3285 #endif /* RE_ENABLE_I18N */
3286                                         start_elem.opr.name);
3287               if (BE (*err != REG_NOERROR, 0))
3288                 goto parse_bracket_exp_free_return;
3289               break;
3290             case COLL_SYM:
3291               *err = build_collating_symbol (sbcset,
3292 #ifdef RE_ENABLE_I18N
3293                                              mbcset, &coll_sym_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295                                              start_elem.opr.name);
3296               if (BE (*err != REG_NOERROR, 0))
3297                 goto parse_bracket_exp_free_return;
3298               break;
3299             case CHAR_CLASS:
3300               *err = build_charclass (regexp->trans, sbcset,
3301 #ifdef RE_ENABLE_I18N
3302                                       mbcset, &char_class_alloc,
3303 #endif /* RE_ENABLE_I18N */
3304                                       (const char *) start_elem.opr.name,
3305                                       syntax);
3306               if (BE (*err != REG_NOERROR, 0))
3307                goto parse_bracket_exp_free_return;
3308               break;
3309             default:
3310               assert (0);
3311               break;
3312             }
3313         }
3314       if (BE (token->type == END_OF_RE, 0))
3315         {
3316           *err = REG_EBRACK;
3317           goto parse_bracket_exp_free_return;
3318         }
3319       if (token->type == OP_CLOSE_BRACKET)
3320         break;
3321     }
3322
3323   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3324
3325   /* If it is non-matching list.  */
3326   if (non_match)
3327     bitset_not (sbcset);
3328
3329 #ifdef RE_ENABLE_I18N
3330   /* Ensure only single byte characters are set.  */
3331   if (dfa->mb_cur_max > 1)
3332     bitset_mask (sbcset, dfa->sb_char);
3333
3334   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3335       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3336                                                      || mbcset->non_match)))
3337     {
3338       bin_tree_t *mbc_tree;
3339       int sbc_idx;
3340       /* Build a tree for complex bracket.  */
3341       dfa->has_mb_node = 1;
3342       br_token.type = COMPLEX_BRACKET;
3343       br_token.opr.mbcset = mbcset;
3344       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3345       if (BE (mbc_tree == NULL, 0))
3346         goto parse_bracket_exp_espace;
3347       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3348         if (sbcset[sbc_idx])
3349           break;
3350       /* If there are no bits set in sbcset, there is no point
3351          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3352       if (sbc_idx < BITSET_WORDS)
3353         {
3354           /* Build a tree for simple bracket.  */
3355           br_token.type = SIMPLE_BRACKET;
3356           br_token.opr.sbcset = sbcset;
3357           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3358           if (BE (work_tree == NULL, 0))
3359             goto parse_bracket_exp_espace;
3360
3361           /* Then join them by ALT node.  */
3362           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3363           if (BE (work_tree == NULL, 0))
3364             goto parse_bracket_exp_espace;
3365         }
3366       else
3367         {
3368           re_free (sbcset);
3369           work_tree = mbc_tree;
3370         }
3371     }
3372   else
3373 #endif /* not RE_ENABLE_I18N */
3374     {
3375 #ifdef RE_ENABLE_I18N
3376       free_charset (mbcset);
3377 #endif
3378       /* Build a tree for simple bracket.  */
3379       br_token.type = SIMPLE_BRACKET;
3380       br_token.opr.sbcset = sbcset;
3381       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3382       if (BE (work_tree == NULL, 0))
3383         goto parse_bracket_exp_espace;
3384     }
3385   return work_tree;
3386
3387  parse_bracket_exp_espace:
3388   *err = REG_ESPACE;
3389  parse_bracket_exp_free_return:
3390   re_free (sbcset);
3391 #ifdef RE_ENABLE_I18N
3392   free_charset (mbcset);
3393 #endif /* RE_ENABLE_I18N */
3394   return NULL;
3395 }
3396
3397 /* Parse an element in the bracket expression.  */
3398
3399 static reg_errcode_t
3400 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3401                        re_token_t *token, int token_len, re_dfa_t *dfa,
3402                        reg_syntax_t syntax, bool accept_hyphen)
3403 {
3404 #ifdef RE_ENABLE_I18N
3405   int cur_char_size;
3406   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3407   if (cur_char_size > 1)
3408     {
3409       elem->type = MB_CHAR;
3410       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3411       re_string_skip_bytes (regexp, cur_char_size);
3412       return REG_NOERROR;
3413     }
3414 #endif /* RE_ENABLE_I18N */
3415   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3416   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3417       || token->type == OP_OPEN_EQUIV_CLASS)
3418     return parse_bracket_symbol (elem, regexp, token);
3419   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3420     {
3421       /* A '-' must only appear as anything but a range indicator before
3422          the closing bracket.  Everything else is an error.  */
3423       re_token_t token2;
3424       (void) peek_token_bracket (&token2, regexp, syntax);
3425       if (token2.type != OP_CLOSE_BRACKET)
3426         /* The actual error value is not standardized since this whole
3427            case is undefined.  But ERANGE makes good sense.  */
3428         return REG_ERANGE;
3429     }
3430   elem->type = SB_CHAR;
3431   elem->opr.ch = token->opr.c;
3432   return REG_NOERROR;
3433 }
3434
3435 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3436    such as [:<character_class>:], [.<collating_element>.], and
3437    [=<equivalent_class>=].  */
3438
3439 static reg_errcode_t
3440 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3441                       re_token_t *token)
3442 {
3443   unsigned char ch, delim = token->opr.c;
3444   int i = 0;
3445   if (re_string_eoi(regexp))
3446     return REG_EBRACK;
3447   for (;; ++i)
3448     {
3449       if (i >= BRACKET_NAME_BUF_SIZE)
3450         return REG_EBRACK;
3451       if (token->type == OP_OPEN_CHAR_CLASS)
3452         ch = re_string_fetch_byte_case (regexp);
3453       else
3454         ch = re_string_fetch_byte (regexp);
3455       if (re_string_eoi(regexp))
3456         return REG_EBRACK;
3457       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3458         break;
3459       elem->opr.name[i] = ch;
3460     }
3461   re_string_skip_bytes (regexp, 1);
3462   elem->opr.name[i] = '\0';
3463   switch (token->type)
3464     {
3465     case OP_OPEN_COLL_ELEM:
3466       elem->type = COLL_SYM;
3467       break;
3468     case OP_OPEN_EQUIV_CLASS:
3469       elem->type = EQUIV_CLASS;
3470       break;
3471     case OP_OPEN_CHAR_CLASS:
3472       elem->type = CHAR_CLASS;
3473       break;
3474     default:
3475       break;
3476     }
3477   return REG_NOERROR;
3478 }
3479
3480   /* Helper function for parse_bracket_exp.
3481      Build the equivalence class which is represented by NAME.
3482      The result are written to MBCSET and SBCSET.
3483      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3484      is a pointer argument since we may update it.  */
3485
3486 static reg_errcode_t
3487 #ifdef RE_ENABLE_I18N
3488 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3489                    Idx *equiv_class_alloc, const unsigned char *name)
3490 #else /* not RE_ENABLE_I18N */
3491 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3492 #endif /* not RE_ENABLE_I18N */
3493 {
3494 #ifdef _LIBC
3495   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3496   if (nrules != 0)
3497     {
3498       const int32_t *table, *indirect;
3499       const unsigned char *weights, *extra, *cp;
3500       unsigned char char_buf[2];
3501       int32_t idx1, idx2;
3502       unsigned int ch;
3503       size_t len;
3504       /* Calculate the index for equivalence class.  */
3505       cp = name;
3506       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3507       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3508                                                _NL_COLLATE_WEIGHTMB);
3509       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3510                                                    _NL_COLLATE_EXTRAMB);
3511       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3512                                                 _NL_COLLATE_INDIRECTMB);
3513       idx1 = findidx (table, indirect, extra, &cp, -1);
3514       if (BE (idx1 == 0 || *cp != '\0', 0))
3515         /* This isn't a valid character.  */
3516         return REG_ECOLLATE;
3517
3518       /* Build single byte matching table for this equivalence class.  */
3519       len = weights[idx1 & 0xffffff];
3520       for (ch = 0; ch < SBC_MAX; ++ch)
3521         {
3522           char_buf[0] = ch;
3523           cp = char_buf;
3524           idx2 = findidx (table, indirect, extra, &cp, 1);
3525 /*
3526           idx2 = table[ch];
3527 */
3528           if (idx2 == 0)
3529             /* This isn't a valid character.  */
3530             continue;
3531           /* Compare only if the length matches and the collation rule
3532              index is the same.  */
3533           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3534               && memcmp (weights + (idx1 & 0xffffff) + 1,
3535                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3536             bitset_set (sbcset, ch);
3537         }
3538       /* Check whether the array has enough space.  */
3539       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3540         {
3541           /* Not enough, realloc it.  */
3542           /* +1 in case of mbcset->nequiv_classes is 0.  */
3543           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3544           /* Use realloc since the array is NULL if *alloc == 0.  */
3545           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3546                                                    int32_t,
3547                                                    new_equiv_class_alloc);
3548           if (BE (new_equiv_classes == NULL, 0))
3549             return REG_ESPACE;
3550           mbcset->equiv_classes = new_equiv_classes;
3551           *equiv_class_alloc = new_equiv_class_alloc;
3552         }
3553       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3554     }
3555   else
3556 #endif /* _LIBC */
3557     {
3558       if (BE (strlen ((const char *) name) != 1, 0))
3559         return REG_ECOLLATE;
3560       bitset_set (sbcset, *name);
3561     }
3562   return REG_NOERROR;
3563 }
3564
3565   /* Helper function for parse_bracket_exp.
3566      Build the character class which is represented by NAME.
3567      The result are written to MBCSET and SBCSET.
3568      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3569      is a pointer argument since we may update it.  */
3570
3571 static reg_errcode_t
3572 #ifdef RE_ENABLE_I18N
3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574                  re_charset_t *mbcset, Idx *char_class_alloc,
3575                  const char *class_name, reg_syntax_t syntax)
3576 #else /* not RE_ENABLE_I18N */
3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578                  const char *class_name, reg_syntax_t syntax)
3579 #endif /* not RE_ENABLE_I18N */
3580 {
3581   int i;
3582   const char *name = class_name;
3583
3584   /* In case of REG_ICASE "upper" and "lower" match the both of
3585      upper and lower cases.  */
3586   if ((syntax & RE_ICASE)
3587       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3588     name = "alpha";
3589
3590 #ifdef RE_ENABLE_I18N
3591   /* Check the space of the arrays.  */
3592   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3593     {
3594       /* Not enough, realloc it.  */
3595       /* +1 in case of mbcset->nchar_classes is 0.  */
3596       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3597       /* Use realloc since array is NULL if *alloc == 0.  */
3598       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3599                                                new_char_class_alloc);
3600       if (BE (new_char_classes == NULL, 0))
3601         return REG_ESPACE;
3602       mbcset->char_classes = new_char_classes;
3603       *char_class_alloc = new_char_class_alloc;
3604     }
3605   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3606 #endif /* RE_ENABLE_I18N */
3607
3608 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3609   do {                                          \
3610     if (BE (trans != NULL, 0))                  \
3611       {                                         \
3612         for (i = 0; i < SBC_MAX; ++i)           \
3613           if (ctype_func (i))                   \
3614             bitset_set (sbcset, trans[i]);      \
3615       }                                         \
3616     else                                        \
3617       {                                         \
3618         for (i = 0; i < SBC_MAX; ++i)           \
3619           if (ctype_func (i))                   \
3620             bitset_set (sbcset, i);             \
3621       }                                         \
3622   } while (0)
3623
3624   if (strcmp (name, "alnum") == 0)
3625     BUILD_CHARCLASS_LOOP (isalnum);
3626   else if (strcmp (name, "cntrl") == 0)
3627     BUILD_CHARCLASS_LOOP (iscntrl);
3628   else if (strcmp (name, "lower") == 0)
3629     BUILD_CHARCLASS_LOOP (islower);
3630   else if (strcmp (name, "space") == 0)
3631     BUILD_CHARCLASS_LOOP (isspace);
3632   else if (strcmp (name, "alpha") == 0)
3633     BUILD_CHARCLASS_LOOP (isalpha);
3634   else if (strcmp (name, "digit") == 0)
3635     BUILD_CHARCLASS_LOOP (isdigit);
3636   else if (strcmp (name, "print") == 0)
3637     BUILD_CHARCLASS_LOOP (isprint);
3638   else if (strcmp (name, "upper") == 0)
3639     BUILD_CHARCLASS_LOOP (isupper);
3640   else if (strcmp (name, "blank") == 0)
3641     BUILD_CHARCLASS_LOOP (isblank);
3642   else if (strcmp (name, "graph") == 0)
3643     BUILD_CHARCLASS_LOOP (isgraph);
3644   else if (strcmp (name, "punct") == 0)
3645     BUILD_CHARCLASS_LOOP (ispunct);
3646   else if (strcmp (name, "xdigit") == 0)
3647     BUILD_CHARCLASS_LOOP (isxdigit);
3648   else
3649     return REG_ECTYPE;
3650
3651   return REG_NOERROR;
3652 }
3653
3654 static bin_tree_t *
3655 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3656                     const char *class_name,
3657                     const char *extra, bool non_match,
3658                     reg_errcode_t *err)
3659 {
3660   re_bitset_ptr_t sbcset;
3661 #ifdef RE_ENABLE_I18N
3662   re_charset_t *mbcset;
3663   Idx alloc = 0;
3664 #endif /* not RE_ENABLE_I18N */
3665   reg_errcode_t ret;
3666   re_token_t br_token;
3667   bin_tree_t *tree;
3668
3669   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3670   if (BE (sbcset == NULL, 0))
3671     {
3672       *err = REG_ESPACE;
3673       return NULL;
3674     }
3675 #ifdef RE_ENABLE_I18N
3676   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3677   if (BE (mbcset == NULL, 0))
3678     {
3679       re_free (sbcset);
3680       *err = REG_ESPACE;
3681       return NULL;
3682     }
3683   mbcset->non_match = non_match;
3684 #endif /* RE_ENABLE_I18N */
3685
3686   /* We don't care the syntax in this case.  */
3687   ret = build_charclass (trans, sbcset,
3688 #ifdef RE_ENABLE_I18N
3689                          mbcset, &alloc,
3690 #endif /* RE_ENABLE_I18N */
3691                          class_name, 0);
3692
3693   if (BE (ret != REG_NOERROR, 0))
3694     {
3695       re_free (sbcset);
3696 #ifdef RE_ENABLE_I18N
3697       free_charset (mbcset);
3698 #endif /* RE_ENABLE_I18N */
3699       *err = ret;
3700       return NULL;
3701     }
3702   /* \w match '_' also.  */
3703   for (; *extra; extra++)
3704     bitset_set (sbcset, *extra);
3705
3706   /* If it is non-matching list.  */
3707   if (non_match)
3708     bitset_not (sbcset);
3709
3710 #ifdef RE_ENABLE_I18N
3711   /* Ensure only single byte characters are set.  */
3712   if (dfa->mb_cur_max > 1)
3713     bitset_mask (sbcset, dfa->sb_char);
3714 #endif
3715
3716   /* Build a tree for simple bracket.  */
3717 #if defined GCC_LINT || defined lint
3718   memset (&br_token, 0, sizeof br_token);
3719 #endif
3720   br_token.type = SIMPLE_BRACKET;
3721   br_token.opr.sbcset = sbcset;
3722   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723   if (BE (tree == NULL, 0))
3724     goto build_word_op_espace;
3725
3726 #ifdef RE_ENABLE_I18N
3727   if (dfa->mb_cur_max > 1)
3728     {
3729       bin_tree_t *mbc_tree;
3730       /* Build a tree for complex bracket.  */
3731       br_token.type = COMPLEX_BRACKET;
3732       br_token.opr.mbcset = mbcset;
3733       dfa->has_mb_node = 1;
3734       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3735       if (BE (mbc_tree == NULL, 0))
3736         goto build_word_op_espace;
3737       /* Then join them by ALT node.  */
3738       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3739       if (BE (mbc_tree != NULL, 1))
3740         return tree;
3741     }
3742   else
3743     {
3744       free_charset (mbcset);
3745       return tree;
3746     }
3747 #else /* not RE_ENABLE_I18N */
3748   return tree;
3749 #endif /* not RE_ENABLE_I18N */
3750
3751  build_word_op_espace:
3752   re_free (sbcset);
3753 #ifdef RE_ENABLE_I18N
3754   free_charset (mbcset);
3755 #endif /* RE_ENABLE_I18N */
3756   *err = REG_ESPACE;
3757   return NULL;
3758 }
3759
3760 /* This is intended for the expressions like "a{1,3}".
3761    Fetch a number from 'input', and return the number.
3762    Return -1 if the number field is empty like "{,1}".
3763    Return RE_DUP_MAX + 1 if the number field is too large.
3764    Return -2 if an error occurred.  */
3765
3766 static Idx
3767 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3768 {
3769   Idx num = -1;
3770   unsigned char c;
3771   while (1)
3772     {
3773       fetch_token (token, input, syntax);
3774       c = token->opr.c;
3775       if (BE (token->type == END_OF_RE, 0))
3776         return -2;
3777       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3778         break;
3779       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3780              ? -2
3781              : num == -1
3782              ? c - '0'
3783              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3784     }
3785   return num;
3786 }
3787 \f
3788 #ifdef RE_ENABLE_I18N
3789 static void
3790 free_charset (re_charset_t *cset)
3791 {
3792   re_free (cset->mbchars);
3793 # ifdef _LIBC
3794   re_free (cset->coll_syms);
3795   re_free (cset->equiv_classes);
3796 # endif
3797   re_free (cset->range_starts);
3798   re_free (cset->range_ends);
3799   re_free (cset->char_classes);
3800   re_free (cset);
3801 }
3802 #endif /* RE_ENABLE_I18N */
3803 \f
3804 /* Functions for binary tree operation.  */
3805
3806 /* Create a tree node.  */
3807
3808 static bin_tree_t *
3809 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3810              re_token_type_t type)
3811 {
3812   re_token_t t;
3813 #if defined GCC_LINT || defined lint
3814   memset (&t, 0, sizeof t);
3815 #endif
3816   t.type = type;
3817   return create_token_tree (dfa, left, right, &t);
3818 }
3819
3820 static bin_tree_t *
3821 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822                    const re_token_t *token)
3823 {
3824   bin_tree_t *tree;
3825   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3826     {
3827       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3828
3829       if (storage == NULL)
3830         return NULL;
3831       storage->next = dfa->str_tree_storage;
3832       dfa->str_tree_storage = storage;
3833       dfa->str_tree_storage_idx = 0;
3834     }
3835   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3836
3837   tree->parent = NULL;
3838   tree->left = left;
3839   tree->right = right;
3840   tree->token = *token;
3841   tree->token.duplicated = 0;
3842   tree->token.opt_subexp = 0;
3843   tree->first = NULL;
3844   tree->next = NULL;
3845   tree->node_idx = -1;
3846
3847   if (left != NULL)
3848     left->parent = tree;
3849   if (right != NULL)
3850     right->parent = tree;
3851   return tree;
3852 }
3853
3854 /* Mark the tree SRC as an optional subexpression.
3855    To be called from preorder or postorder.  */
3856
3857 static reg_errcode_t
3858 mark_opt_subexp (void *extra, bin_tree_t *node)
3859 {
3860   Idx idx = (uintptr_t) extra;
3861   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862     node->token.opt_subexp = 1;
3863
3864   return REG_NOERROR;
3865 }
3866
3867 /* Free the allocated memory inside NODE. */
3868
3869 static void
3870 free_token (re_token_t *node)
3871 {
3872 #ifdef RE_ENABLE_I18N
3873   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874     free_charset (node->opr.mbcset);
3875   else
3876 #endif /* RE_ENABLE_I18N */
3877     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878       re_free (node->opr.sbcset);
3879 }
3880
3881 /* Worker function for tree walking.  Free the allocated memory inside NODE
3882    and its children. */
3883
3884 static reg_errcode_t
3885 free_tree (void *extra, bin_tree_t *node)
3886 {
3887   free_token (&node->token);
3888   return REG_NOERROR;
3889 }
3890
3891
3892 /* Duplicate the node SRC, and return new node.  This is a preorder
3893    visit similar to the one implemented by the generic visitor, but
3894    we need more infrastructure to maintain two parallel trees --- so,
3895    it's easier to duplicate.  */
3896
3897 static bin_tree_t *
3898 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3899 {
3900   const bin_tree_t *node;
3901   bin_tree_t *dup_root;
3902   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3903
3904   for (node = root; ; )
3905     {
3906       /* Create a new tree and link it back to the current parent.  */
3907       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3908       if (*p_new == NULL)
3909         return NULL;
3910       (*p_new)->parent = dup_node;
3911       (*p_new)->token.duplicated = 1;
3912       dup_node = *p_new;
3913
3914       /* Go to the left node, or up and to the right.  */
3915       if (node->left)
3916         {
3917           node = node->left;
3918           p_new = &dup_node->left;
3919         }
3920       else
3921         {
3922           const bin_tree_t *prev = NULL;
3923           while (node->right == prev || node->right == NULL)
3924             {
3925               prev = node;
3926               node = node->parent;
3927               dup_node = dup_node->parent;
3928               if (!node)
3929                 return dup_root;
3930             }
3931           node = node->right;
3932           p_new = &dup_node->right;
3933         }
3934     }
3935 }