lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax) internal_function;
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
 132 static const char __re_error_msgid[] =
 133   {
 134 #define REG_NOERROR_IDX 0
 135     gettext_noop ("Success")    /* REG_NOERROR */
 136     "\0"
 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 138     gettext_noop ("No match")   /* REG_NOMATCH */
 139     "\0"
 140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 142     "\0"
 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 145     "\0"
 146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 148     "\0"
 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 151     "\0"
 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 154     "\0"
 155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 157     "\0"
 158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 160     "\0"
 161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 163     "\0"
 164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 166     "\0"
 167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 169     "\0"
 170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 172     "\0"
 173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 175     "\0"
 176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 178     "\0"
 179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 181     "\0"
 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 184   };
 185
 186 static const size_t __re_error_msgid_idx[] =
 187   {
 188     REG_NOERROR_IDX,
 189     REG_NOMATCH_IDX,
 190     REG_BADPAT_IDX,
 191     REG_ECOLLATE_IDX,
 192     REG_ECTYPE_IDX,
 193     REG_EESCAPE_IDX,
 194     REG_ESUBREG_IDX,
 195     REG_EBRACK_IDX,
 196     REG_EPAREN_IDX,
 197     REG_EBRACE_IDX,
 198     REG_BADBR_IDX,
 199     REG_ERANGE_IDX,
 200     REG_ESPACE_IDX,
 201     REG_BADRPT_IDX,
 202     REG_EEND_IDX,
 203     REG_ESIZE_IDX,
 204     REG_ERPAREN_IDX
 205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
 240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 241    also be assigned to arbitrarily: each pattern buffer stores its own
 242    syntax, so it can be changed between regex compilations.  */
 243 /* This has no initializer because initialized variables in Emacs
 244    become read-only after dumping.  */
 245 reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (reg_syntax_t syntax)
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (struct re_pattern_buffer *bufp)
 269 {
 270   re_dfa_t *dfa = bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute__ ((always_inline))
 290 re_set_fastmap (char *fastmap, bool icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = bufp->buffer;
 305   Idx node_cnt;
 306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       Idx node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char buf[MB_LEN_MAX];
 319               unsigned char *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, false, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           Idx i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      'buffer' to the compiled pattern;
 437      'used' to the length of the compiled pattern;
 438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      'fastmap' to an allocated space for the fastmap;
 443      'fastmap_accurate' to zero;
 444      're_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
 468 {
 469   reg_errcode_t ret;
 470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 471                          : RE_SYNTAX_POSIX_BASIC);
 472
 473   preg->buffer = NULL;
 474   preg->allocated = 0;
 475   preg->used = 0;
 476
 477   /* Try to allocate space for the fastmap.  */
 478   preg->fastmap = re_malloc (char, SBC_MAX);
 479   if (BE (preg->fastmap == NULL, 0))
 480     return REG_ESPACE;
 481
 482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 483
 484   /* If REG_NEWLINE is set, newlines are treated differently.  */
 485   if (cflags & REG_NEWLINE)
 486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 487       syntax &= ~RE_DOT_NEWLINE;
 488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 489       /* It also changes the matching behavior.  */
 490       preg->newline_anchor = 1;
 491     }
 492   else
 493     preg->newline_anchor = 0;
 494   preg->no_sub = !!(cflags & REG_NOSUB);
 495   preg->translate = NULL;
 496
 497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 498
 499   /* POSIX doesn't distinguish between an unmatched open-group and an
 500      unmatched close-group: both are REG_EPAREN.  */
 501   if (ret == REG_ERPAREN)
 502     ret = REG_EPAREN;
 503
 504   /* We have already checked preg->fastmap != NULL.  */
 505   if (BE (ret == REG_NOERROR, 1))
 506     /* Compute the fastmap now, since regexec cannot modify the pattern
 507        buffer.  This function never fails in this implementation.  */
 508     (void) re_compile_fastmap (preg);
 509   else
 510     {
 511       /* Some error occurred while compiling the expression.  */
 512       re_free (preg->fastmap);
 513       preg->fastmap = NULL;
 514     }
 515
 516   return (int) ret;
 517 }
 518 #ifdef _LIBC
 519 weak_alias (__regcomp, regcomp)
 520 #endif
 521
 522 /* Returns a message corresponding to an error code, ERRCODE, returned
 523    from either regcomp or regexec.   We don't use PREG here.  */
 524
 525 size_t
 526 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
 527           size_t errbuf_size)
 528 {
 529   const char *msg;
 530   size_t msg_size;
 531
 532   if (BE (errcode < 0
 533           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 534                                / sizeof (__re_error_msgid_idx[0])), 0))
 535     /* Only error codes returned by the rest of the code should be passed
 536        to this routine.  If we are given anything else, or if other regex
 537        code generates an invalid error code, then the program has a bug.
 538        Dump core so we can fix it.  */
 539     abort ();
 540
 541   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 542
 543   msg_size = strlen (msg) + 1; /* Includes the null.  */
 544
 545   if (BE (errbuf_size != 0, 1))
 546     {
 547       size_t cpy_size = msg_size;
 548       if (BE (msg_size > errbuf_size, 0))
 549         {
 550           cpy_size = errbuf_size - 1;
 551           errbuf[cpy_size] = '\0';
 552         }
 553       memcpy (errbuf, msg, cpy_size);
 554     }
 555
 556   return msg_size;
 557 }
 558 #ifdef _LIBC
 559 weak_alias (__regerror, regerror)
 560 #endif
 561
 562
 563 #ifdef RE_ENABLE_I18N
 564 /* This static array is used for the map to single-byte characters when
 565    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 566    it the same all the time.  UTF-8 is the preferred encoding so this is
 567    a worthwhile optimization.  */
 568 static const bitset_t utf8_sb_map =
 569 {
 570   /* Set the first 128 bits.  */
 571 # if defined __GNUC__ && !defined __STRICT_ANSI__
 572   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 573 # else
 574 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 575 #   error "bitset_word_t is narrower than 32 bits"
 576 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 577   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 578 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 579   BITSET_WORD_MAX, BITSET_WORD_MAX,
 580 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 581   BITSET_WORD_MAX,
 582 #  endif
 583   (BITSET_WORD_MAX
 584    >> (SBC_MAX % BITSET_WORD_BITS == 0
 585        ? 0
 586        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 587 # endif
 588 };
 589 #endif
 590
 591
 592 static void
 593 free_dfa_content (re_dfa_t *dfa)
 594 {
 595   Idx i, j;
 596
 597   if (dfa->nodes)
 598     for (i = 0; i < dfa->nodes_len; ++i)
 599       free_token (dfa->nodes + i);
 600   re_free (dfa->nexts);
 601   for (i = 0; i < dfa->nodes_len; ++i)
 602     {
 603       if (dfa->eclosures != NULL)
 604         re_node_set_free (dfa->eclosures + i);
 605       if (dfa->inveclosures != NULL)
 606         re_node_set_free (dfa->inveclosures + i);
 607       if (dfa->edests != NULL)
 608         re_node_set_free (dfa->edests + i);
 609     }
 610   re_free (dfa->edests);
 611   re_free (dfa->eclosures);
 612   re_free (dfa->inveclosures);
 613   re_free (dfa->nodes);
 614
 615   if (dfa->state_table)
 616     for (i = 0; i <= dfa->state_hash_mask; ++i)
 617       {
 618         struct re_state_table_entry *entry = dfa->state_table + i;
 619         for (j = 0; j < entry->num; ++j)
 620           {
 621             re_dfastate_t *state = entry->array[j];
 622             free_state (state);
 623           }
 624         re_free (entry->array);
 625       }
 626   re_free (dfa->state_table);
 627 #ifdef RE_ENABLE_I18N
 628   if (dfa->sb_char != utf8_sb_map)
 629     re_free (dfa->sb_char);
 630 #endif
 631   re_free (dfa->subexp_map);
 632 #ifdef DEBUG
 633   re_free (dfa->re_str);
 634 #endif
 635
 636   re_free (dfa);
 637 }
 638
 639
 640 /* Free dynamically allocated space used by PREG.  */
 641
 642 void
 643 regfree (regex_t *preg)
 644 {
 645   re_dfa_t *dfa = preg->buffer;
 646   if (BE (dfa != NULL, 1))
 647     {
 648       lock_fini (dfa->lock);
 649       free_dfa_content (dfa);
 650     }
 651   preg->buffer = NULL;
 652   preg->allocated = 0;
 653
 654   re_free (preg->fastmap);
 655   preg->fastmap = NULL;
 656
 657   re_free (preg->translate);
 658   preg->translate = NULL;
 659 }
 660 #ifdef _LIBC
 661 weak_alias (__regfree, regfree)
 662 #endif
 663 \f
 664 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 665    them unless specifically requested.  */
 666
 667 #if defined _REGEX_RE_COMP || defined _LIBC
 668
 669 /* BSD has one and only one pattern buffer.  */
 670 static struct re_pattern_buffer re_comp_buf;
 671
 672 char *
 673 # ifdef _LIBC
 674 /* Make these definitions weak in libc, so POSIX programs can redefine
 675    these names if they don't use our functions, and still use
 676    regcomp/regexec above without link errors.  */
 677 weak_function
 678 # endif
 679 re_comp (const char *s)
 680 {
 681   reg_errcode_t ret;
 682   char *fastmap;
 683
 684   if (!s)
 685     {
 686       if (!re_comp_buf.buffer)
 687         return gettext ("No previous regular expression");
 688       return 0;
 689     }
 690
 691   if (re_comp_buf.buffer)
 692     {
 693       fastmap = re_comp_buf.fastmap;
 694       re_comp_buf.fastmap = NULL;
 695       __regfree (&re_comp_buf);
 696       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 697       re_comp_buf.fastmap = fastmap;
 698     }
 699
 700   if (re_comp_buf.fastmap == NULL)
 701     {
 702       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 703       if (re_comp_buf.fastmap == NULL)
 704         return (char *) gettext (__re_error_msgid
 705                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 706     }
 707
 708   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 709      don't need to initialize the pattern buffer fields which affect it.  */
 710
 711   /* Match anchors at newlines.  */
 712   re_comp_buf.newline_anchor = 1;
 713
 714   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 715
 716   if (!ret)
 717     return NULL;
 718
 719   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 720   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 721 }
 722
 723 #ifdef _LIBC
 724 libc_freeres_fn (free_mem)
 725 {
 726   __regfree (&re_comp_buf);
 727 }
 728 #endif
 729
 730 #endif /* _REGEX_RE_COMP */
 731 \f
 732 /* Internal entry point.
 733    Compile the regular expression PATTERN, whose length is LENGTH.
 734    SYNTAX indicate regular expression's syntax.  */
 735
 736 static reg_errcode_t
 737 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 738                      reg_syntax_t syntax)
 739 {
 740   reg_errcode_t err = REG_NOERROR;
 741   re_dfa_t *dfa;
 742   re_string_t regexp;
 743
 744   /* Initialize the pattern buffer.  */
 745   preg->fastmap_accurate = 0;
 746   preg->syntax = syntax;
 747   preg->not_bol = preg->not_eol = 0;
 748   preg->used = 0;
 749   preg->re_nsub = 0;
 750   preg->can_be_null = 0;
 751   preg->regs_allocated = REGS_UNALLOCATED;
 752
 753   /* Initialize the dfa.  */
 754   dfa = preg->buffer;
 755   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 756     {
 757       /* If zero allocated, but buffer is non-null, try to realloc
 758          enough space.  This loses if buffer's address is bogus, but
 759          that is the user's responsibility.  If ->buffer is NULL this
 760          is a simple allocation.  */
 761       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 762       if (dfa == NULL)
 763         return REG_ESPACE;
 764       preg->allocated = sizeof (re_dfa_t);
 765       preg->buffer = dfa;
 766     }
 767   preg->used = sizeof (re_dfa_t);
 768
 769   err = init_dfa (dfa, length);
 770   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 771     err = REG_ESPACE;
 772   if (BE (err != REG_NOERROR, 0))
 773     {
 774       free_dfa_content (dfa);
 775       preg->buffer = NULL;
 776       preg->allocated = 0;
 777       return err;
 778     }
 779 #ifdef DEBUG
 780   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 781   dfa->re_str = re_malloc (char, length + 1);
 782   strncpy (dfa->re_str, pattern, length + 1);
 783 #endif
 784
 785   err = re_string_construct (&regexp, pattern, length, preg->translate,
 786                              (syntax & RE_ICASE) != 0, dfa);
 787   if (BE (err != REG_NOERROR, 0))
 788     {
 789     re_compile_internal_free_return:
 790       free_workarea_compile (preg);
 791       re_string_destruct (&regexp);
 792       lock_fini (dfa->lock);
 793       free_dfa_content (dfa);
 794       preg->buffer = NULL;
 795       preg->allocated = 0;
 796       return err;
 797     }
 798
 799   /* Parse the regular expression, and build a structure tree.  */
 800   preg->re_nsub = 0;
 801   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 802   if (BE (dfa->str_tree == NULL, 0))
 803     goto re_compile_internal_free_return;
 804
 805   /* Analyze the tree and create the nfa.  */
 806   err = analyze (preg);
 807   if (BE (err != REG_NOERROR, 0))
 808     goto re_compile_internal_free_return;
 809
 810 #ifdef RE_ENABLE_I18N
 811   /* If possible, do searching in single byte encoding to speed things up.  */
 812   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 813     optimize_utf8 (dfa);
 814 #endif
 815
 816   /* Then create the initial state of the dfa.  */
 817   err = create_initial_state (dfa);
 818
 819   /* Release work areas.  */
 820   free_workarea_compile (preg);
 821   re_string_destruct (&regexp);
 822
 823   if (BE (err != REG_NOERROR, 0))
 824     {
 825       lock_fini (dfa->lock);
 826       free_dfa_content (dfa);
 827       preg->buffer = NULL;
 828       preg->allocated = 0;
 829     }
 830
 831   return err;
 832 }
 833
 834 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 835    as the initial length of some arrays.  */
 836
 837 static reg_errcode_t
 838 init_dfa (re_dfa_t *dfa, size_t pat_len)
 839 {
 840   __re_size_t table_size;
 841 #ifndef _LIBC
 842   const char *codeset_name;
 843 #endif
 844 #ifdef RE_ENABLE_I18N
 845   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 846 #else
 847   size_t max_i18n_object_size = 0;
 848 #endif
 849   size_t max_object_size =
 850     MAX (sizeof (struct re_state_table_entry),
 851          MAX (sizeof (re_token_t),
 852               MAX (sizeof (re_node_set),
 853                    MAX (sizeof (regmatch_t),
 854                         max_i18n_object_size))));
 855
 856   memset (dfa, '\0', sizeof (re_dfa_t));
 857
 858   /* Force allocation of str_tree_storage the first time.  */
 859   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 860
 861   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 862      calculation below, and for similar doubling calculations
 863      elsewhere.  And it's <= rather than <, because some of the
 864      doubling calculations add 1 afterwards.  */
 865   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 866     return REG_ESPACE;
 867
 868   dfa->nodes_alloc = pat_len + 1;
 869   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 870
 871   /*  table_size = 2 ^ ceil(log pat_len) */
 872   for (table_size = 1; ; table_size <<= 1)
 873     if (table_size > pat_len)
 874       break;
 875
 876   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 877   dfa->state_hash_mask = table_size - 1;
 878
 879   dfa->mb_cur_max = MB_CUR_MAX;
 880 #ifdef _LIBC
 881   if (dfa->mb_cur_max == 6
 882       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 883     dfa->is_utf8 = 1;
 884   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 885                        != 0);
 886 #else
 887   codeset_name = nl_langinfo (CODESET);
 888   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 889       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 890       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 891       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 892     dfa->is_utf8 = 1;
 893
 894   /* We check exhaustively in the loop below if this charset is a
 895      superset of ASCII.  */
 896   dfa->map_notascii = 0;
 897 #endif
 898
 899 #ifdef RE_ENABLE_I18N
 900   if (dfa->mb_cur_max > 1)
 901     {
 902       if (dfa->is_utf8)
 903         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 904       else
 905         {
 906           int i, j, ch;
 907
 908           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 909           if (BE (dfa->sb_char == NULL, 0))
 910             return REG_ESPACE;
 911
 912           /* Set the bits corresponding to single byte chars.  */
 913           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 914             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 915               {
 916                 wint_t wch = __btowc (ch);
 917                 if (wch != WEOF)
 918                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 919 # ifndef _LIBC
 920                 if (isascii (ch) && wch != ch)
 921                   dfa->map_notascii = 1;
 922 # endif
 923               }
 924         }
 925     }
 926 #endif
 927
 928   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 929     return REG_ESPACE;
 930   return REG_NOERROR;
 931 }
 932
 933 /* Initialize WORD_CHAR table, which indicate which character is
 934    "word".  In this case "word" means that it is the word construction
 935    character used by some operators like "\<", "\>", etc.  */
 936
 937 static void
 938 internal_function
 939 init_word_char (re_dfa_t *dfa)
 940 {
 941   int i = 0;
 942   int j;
 943   int ch = 0;
 944   dfa->word_ops_used = 1;
 945   if (BE (dfa->map_notascii == 0, 1))
 946     {
 947       bitset_word_t bits0 = 0x00000000;
 948       bitset_word_t bits1 = 0x03ff0000;
 949       bitset_word_t bits2 = 0x87fffffe;
 950       bitset_word_t bits3 = 0x07fffffe;
 951       if (BITSET_WORD_BITS == 64)
 952         {
 953           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 954           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 955           i = 2;
 956         }
 957       else if (BITSET_WORD_BITS == 32)
 958         {
 959           dfa->word_char[0] = bits0;
 960           dfa->word_char[1] = bits1;
 961           dfa->word_char[2] = bits2;
 962           dfa->word_char[3] = bits3;
 963           i = 4;
 964         }
 965       else
 966         goto general_case;
 967       ch = 128;
 968
 969       if (BE (dfa->is_utf8, 1))
 970         {
 971           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 972           return;
 973         }
 974     }
 975
 976  general_case:
 977   for (; i < BITSET_WORDS; ++i)
 978     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 979       if (isalnum (ch) || ch == '_')
 980         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 981 }
 982
 983 /* Free the work area which are only used while compiling.  */
 984
 985 static void
 986 free_workarea_compile (regex_t *preg)
 987 {
 988   re_dfa_t *dfa = preg->buffer;
 989   bin_tree_storage_t *storage, *next;
 990   for (storage = dfa->str_tree_storage; storage; storage = next)
 991     {
 992       next = storage->next;
 993       re_free (storage);
 994     }
 995   dfa->str_tree_storage = NULL;
 996   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 997   dfa->str_tree = NULL;
 998   re_free (dfa->org_indices);
 999   dfa->org_indices = NULL;
1000 }
1001
1002 /* Create initial states for all contexts.  */
1003
1004 static reg_errcode_t
1005 create_initial_state (re_dfa_t *dfa)
1006 {
1007   Idx first, i;
1008   reg_errcode_t err;
1009   re_node_set init_nodes;
1010
1011   /* Initial states have the epsilon closure of the node which is
1012      the first node of the regular expression.  */
1013   first = dfa->str_tree->first->node_idx;
1014   dfa->init_node = first;
1015   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1016   if (BE (err != REG_NOERROR, 0))
1017     return err;
1018
1019   /* The back-references which are in initial states can epsilon transit,
1020      since in this case all of the subexpressions can be null.
1021      Then we add epsilon closures of the nodes which are the next nodes of
1022      the back-references.  */
1023   if (dfa->nbackref > 0)
1024     for (i = 0; i < init_nodes.nelem; ++i)
1025       {
1026         Idx node_idx = init_nodes.elems[i];
1027         re_token_type_t type = dfa->nodes[node_idx].type;
1028
1029         Idx clexp_idx;
1030         if (type != OP_BACK_REF)
1031           continue;
1032         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1033           {
1034             re_token_t *clexp_node;
1035             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1036             if (clexp_node->type == OP_CLOSE_SUBEXP
1037                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1038               break;
1039           }
1040         if (clexp_idx == init_nodes.nelem)
1041           continue;
1042
1043         if (type == OP_BACK_REF)
1044           {
1045             Idx dest_idx = dfa->edests[node_idx].elems[0];
1046             if (!re_node_set_contains (&init_nodes, dest_idx))
1047               {
1048                 reg_errcode_t merge_err
1049                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1050                 if (merge_err != REG_NOERROR)
1051                   return merge_err;
1052                 i = 0;
1053               }
1054           }
1055       }
1056
1057   /* It must be the first time to invoke acquire_state.  */
1058   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1059   /* We don't check ERR here, since the initial state must not be NULL.  */
1060   if (BE (dfa->init_state == NULL, 0))
1061     return err;
1062   if (dfa->init_state->has_constraint)
1063     {
1064       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1065                                                        CONTEXT_WORD);
1066       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1067                                                      CONTEXT_NEWLINE);
1068       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1069                                                          &init_nodes,
1070                                                          CONTEXT_NEWLINE
1071                                                          | CONTEXT_BEGBUF);
1072       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1073               || dfa->init_state_begbuf == NULL, 0))
1074         return err;
1075     }
1076   else
1077     dfa->init_state_word = dfa->init_state_nl
1078       = dfa->init_state_begbuf = dfa->init_state;
1079
1080   re_node_set_free (&init_nodes);
1081   return REG_NOERROR;
1082 }
1083 \f
1084 #ifdef RE_ENABLE_I18N
1085 /* If it is possible to do searching in single byte encoding instead of UTF-8
1086    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1087    DFA nodes where needed.  */
1088
1089 static void
1090 optimize_utf8 (re_dfa_t *dfa)
1091 {
1092   Idx node;
1093   int i;
1094   bool mb_chars = false;
1095   bool has_period = false;
1096
1097   for (node = 0; node < dfa->nodes_len; ++node)
1098     switch (dfa->nodes[node].type)
1099       {
1100       case CHARACTER:
1101         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1102           mb_chars = true;
1103         break;
1104       case ANCHOR:
1105         switch (dfa->nodes[node].opr.ctx_type)
1106           {
1107           case LINE_FIRST:
1108           case LINE_LAST:
1109           case BUF_FIRST:
1110           case BUF_LAST:
1111             break;
1112           default:
1113             /* Word anchors etc. cannot be handled.  It's okay to test
1114                opr.ctx_type since constraints (for all DFA nodes) are
1115                created by ORing one or more opr.ctx_type values.  */
1116             return;
1117           }
1118         break;
1119       case OP_PERIOD:
1120         has_period = true;
1121         break;
1122       case OP_BACK_REF:
1123       case OP_ALT:
1124       case END_OF_RE:
1125       case OP_DUP_ASTERISK:
1126       case OP_OPEN_SUBEXP:
1127       case OP_CLOSE_SUBEXP:
1128         break;
1129       case COMPLEX_BRACKET:
1130         return;
1131       case SIMPLE_BRACKET:
1132         /* Just double check.  */
1133         {
1134           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1135                         ? 0
1136                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1137           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1138             {
1139               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1140                 return;
1141               rshift = 0;
1142             }
1143         }
1144         break;
1145       default:
1146         abort ();
1147       }
1148
1149   if (mb_chars || has_period)
1150     for (node = 0; node < dfa->nodes_len; ++node)
1151       {
1152         if (dfa->nodes[node].type == CHARACTER
1153             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1154           dfa->nodes[node].mb_partial = 0;
1155         else if (dfa->nodes[node].type == OP_PERIOD)
1156           dfa->nodes[node].type = OP_UTF8_PERIOD;
1157       }
1158
1159   /* The search can be in single byte locale.  */
1160   dfa->mb_cur_max = 1;
1161   dfa->is_utf8 = 0;
1162   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1163 }
1164 #endif
1165 \f
1166 /* Analyze the structure tree, and calculate "first", "next", "edest",
1167    "eclosure", and "inveclosure".  */
1168
1169 static reg_errcode_t
1170 analyze (regex_t *preg)
1171 {
1172   re_dfa_t *dfa = preg->buffer;
1173   reg_errcode_t ret;
1174
1175   /* Allocate arrays.  */
1176   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1177   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1178   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1179   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1180   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1181           || dfa->eclosures == NULL, 0))
1182     return REG_ESPACE;
1183
1184   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1185   if (dfa->subexp_map != NULL)
1186     {
1187       Idx i;
1188       for (i = 0; i < preg->re_nsub; i++)
1189         dfa->subexp_map[i] = i;
1190       preorder (dfa->str_tree, optimize_subexps, dfa);
1191       for (i = 0; i < preg->re_nsub; i++)
1192         if (dfa->subexp_map[i] != i)
1193           break;
1194       if (i == preg->re_nsub)
1195         {
1196           free (dfa->subexp_map);
1197           dfa->subexp_map = NULL;
1198         }
1199     }
1200
1201   ret = postorder (dfa->str_tree, lower_subexps, preg);
1202   if (BE (ret != REG_NOERROR, 0))
1203     return ret;
1204   ret = postorder (dfa->str_tree, calc_first, dfa);
1205   if (BE (ret != REG_NOERROR, 0))
1206     return ret;
1207   preorder (dfa->str_tree, calc_next, dfa);
1208   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1209   if (BE (ret != REG_NOERROR, 0))
1210     return ret;
1211   ret = calc_eclosure (dfa);
1212   if (BE (ret != REG_NOERROR, 0))
1213     return ret;
1214
1215   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1216      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1217   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1218       || dfa->nbackref)
1219     {
1220       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1221       if (BE (dfa->inveclosures == NULL, 0))
1222         return REG_ESPACE;
1223       ret = calc_inveclosure (dfa);
1224     }
1225
1226   return ret;
1227 }
1228
1229 /* Our parse trees are very unbalanced, so we cannot use a stack to
1230    implement parse tree visits.  Instead, we use parent pointers and
1231    some hairy code in these two functions.  */
1232 static reg_errcode_t
1233 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234            void *extra)
1235 {
1236   bin_tree_t *node, *prev;
1237
1238   for (node = root; ; )
1239     {
1240       /* Descend down the tree, preferably to the left (or to the right
1241          if that's the only child).  */
1242       while (node->left || node->right)
1243         if (node->left)
1244           node = node->left;
1245         else
1246           node = node->right;
1247
1248       do
1249         {
1250           reg_errcode_t err = fn (extra, node);
1251           if (BE (err != REG_NOERROR, 0))
1252             return err;
1253           if (node->parent == NULL)
1254             return REG_NOERROR;
1255           prev = node;
1256           node = node->parent;
1257         }
1258       /* Go up while we have a node that is reached from the right.  */
1259       while (node->right == prev || node->right == NULL);
1260       node = node->right;
1261     }
1262 }
1263
1264 static reg_errcode_t
1265 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1266           void *extra)
1267 {
1268   bin_tree_t *node;
1269
1270   for (node = root; ; )
1271     {
1272       reg_errcode_t err = fn (extra, node);
1273       if (BE (err != REG_NOERROR, 0))
1274         return err;
1275
1276       /* Go to the left node, or up and to the right.  */
1277       if (node->left)
1278         node = node->left;
1279       else
1280         {
1281           bin_tree_t *prev = NULL;
1282           while (node->right == prev || node->right == NULL)
1283             {
1284               prev = node;
1285               node = node->parent;
1286               if (!node)
1287                 return REG_NOERROR;
1288             }
1289           node = node->right;
1290         }
1291     }
1292 }
1293
1294 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1296    backreferences as well.  Requires a preorder visit.  */
1297 static reg_errcode_t
1298 optimize_subexps (void *extra, bin_tree_t *node)
1299 {
1300   re_dfa_t *dfa = (re_dfa_t *) extra;
1301
1302   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1303     {
1304       int idx = node->token.opr.idx;
1305       node->token.opr.idx = dfa->subexp_map[idx];
1306       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1307     }
1308
1309   else if (node->token.type == SUBEXP
1310            && node->left && node->left->token.type == SUBEXP)
1311     {
1312       Idx other_idx = node->left->token.opr.idx;
1313
1314       node->left = node->left->left;
1315       if (node->left)
1316         node->left->parent = node;
1317
1318       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1319       if (other_idx < BITSET_WORD_BITS)
1320         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1321     }
1322
1323   return REG_NOERROR;
1324 }
1325
1326 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1328 static reg_errcode_t
1329 lower_subexps (void *extra, bin_tree_t *node)
1330 {
1331   regex_t *preg = (regex_t *) extra;
1332   reg_errcode_t err = REG_NOERROR;
1333
1334   if (node->left && node->left->token.type == SUBEXP)
1335     {
1336       node->left = lower_subexp (&err, preg, node->left);
1337       if (node->left)
1338         node->left->parent = node;
1339     }
1340   if (node->right && node->right->token.type == SUBEXP)
1341     {
1342       node->right = lower_subexp (&err, preg, node->right);
1343       if (node->right)
1344         node->right->parent = node;
1345     }
1346
1347   return err;
1348 }
1349
1350 static bin_tree_t *
1351 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1352 {
1353   re_dfa_t *dfa = preg->buffer;
1354   bin_tree_t *body = node->left;
1355   bin_tree_t *op, *cls, *tree1, *tree;
1356
1357   if (preg->no_sub
1358       /* We do not optimize empty subexpressions, because otherwise we may
1359          have bad CONCAT nodes with NULL children.  This is obviously not
1360          very common, so we do not lose much.  An example that triggers
1361          this case is the sed "script" /\(\)/x.  */
1362       && node->left != NULL
1363       && (node->token.opr.idx >= BITSET_WORD_BITS
1364           || !(dfa->used_bkref_map
1365                & ((bitset_word_t) 1 << node->token.opr.idx))))
1366     return node->left;
1367
1368   /* Convert the SUBEXP node to the concatenation of an
1369      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1370   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1371   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1372   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1373   tree = create_tree (dfa, op, tree1, CONCAT);
1374   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1375     {
1376       *err = REG_ESPACE;
1377       return NULL;
1378     }
1379
1380   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1381   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1382   return tree;
1383 }
1384
1385 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1386    nodes.  Requires a postorder visit.  */
1387 static reg_errcode_t
1388 calc_first (void *extra, bin_tree_t *node)
1389 {
1390   re_dfa_t *dfa = (re_dfa_t *) extra;
1391   if (node->token.type == CONCAT)
1392     {
1393       node->first = node->left->first;
1394       node->node_idx = node->left->node_idx;
1395     }
1396   else
1397     {
1398       node->first = node;
1399       node->node_idx = re_dfa_add_node (dfa, node->token);
1400       if (BE (node->node_idx == -1, 0))
1401         return REG_ESPACE;
1402       if (node->token.type == ANCHOR)
1403         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1404     }
1405   return REG_NOERROR;
1406 }
1407
1408 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1409 static reg_errcode_t
1410 calc_next (void *extra, bin_tree_t *node)
1411 {
1412   switch (node->token.type)
1413     {
1414     case OP_DUP_ASTERISK:
1415       node->left->next = node;
1416       break;
1417     case CONCAT:
1418       node->left->next = node->right->first;
1419       node->right->next = node->next;
1420       break;
1421     default:
1422       if (node->left)
1423         node->left->next = node->next;
1424       if (node->right)
1425         node->right->next = node->next;
1426       break;
1427     }
1428   return REG_NOERROR;
1429 }
1430
1431 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1432 static reg_errcode_t
1433 link_nfa_nodes (void *extra, bin_tree_t *node)
1434 {
1435   re_dfa_t *dfa = (re_dfa_t *) extra;
1436   Idx idx = node->node_idx;
1437   reg_errcode_t err = REG_NOERROR;
1438
1439   switch (node->token.type)
1440     {
1441     case CONCAT:
1442       break;
1443
1444     case END_OF_RE:
1445       assert (node->next == NULL);
1446       break;
1447
1448     case OP_DUP_ASTERISK:
1449     case OP_ALT:
1450       {
1451         Idx left, right;
1452         dfa->has_plural_match = 1;
1453         if (node->left != NULL)
1454           left = node->left->first->node_idx;
1455         else
1456           left = node->next->node_idx;
1457         if (node->right != NULL)
1458           right = node->right->first->node_idx;
1459         else
1460           right = node->next->node_idx;
1461         assert (left > -1);
1462         assert (right > -1);
1463         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1464       }
1465       break;
1466
1467     case ANCHOR:
1468     case OP_OPEN_SUBEXP:
1469     case OP_CLOSE_SUBEXP:
1470       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1471       break;
1472
1473     case OP_BACK_REF:
1474       dfa->nexts[idx] = node->next->node_idx;
1475       if (node->token.type == OP_BACK_REF)
1476         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1477       break;
1478
1479     default:
1480       assert (!IS_EPSILON_NODE (node->token.type));
1481       dfa->nexts[idx] = node->next->node_idx;
1482       break;
1483     }
1484
1485   return err;
1486 }
1487
1488 /* Duplicate the epsilon closure of the node ROOT_NODE.
1489    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1490    to their own constraint.  */
1491
1492 static reg_errcode_t
1493 internal_function
1494 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1495                         Idx root_node, unsigned int init_constraint)
1496 {
1497   Idx org_node, clone_node;
1498   bool ok;
1499   unsigned int constraint = init_constraint;
1500   for (org_node = top_org_node, clone_node = top_clone_node;;)
1501     {
1502       Idx org_dest, clone_dest;
1503       if (dfa->nodes[org_node].type == OP_BACK_REF)
1504         {
1505           /* If the back reference epsilon-transit, its destination must
1506              also have the constraint.  Then duplicate the epsilon closure
1507              of the destination of the back reference, and store it in
1508              edests of the back reference.  */
1509           org_dest = dfa->nexts[org_node];
1510           re_node_set_empty (dfa->edests + clone_node);
1511           clone_dest = duplicate_node (dfa, org_dest, constraint);
1512           if (BE (clone_dest == -1, 0))
1513             return REG_ESPACE;
1514           dfa->nexts[clone_node] = dfa->nexts[org_node];
1515           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516           if (BE (! ok, 0))
1517             return REG_ESPACE;
1518         }
1519       else if (dfa->edests[org_node].nelem == 0)
1520         {
1521           /* In case of the node can't epsilon-transit, don't duplicate the
1522              destination and store the original destination as the
1523              destination of the node.  */
1524           dfa->nexts[clone_node] = dfa->nexts[org_node];
1525           break;
1526         }
1527       else if (dfa->edests[org_node].nelem == 1)
1528         {
1529           /* In case of the node can epsilon-transit, and it has only one
1530              destination.  */
1531           org_dest = dfa->edests[org_node].elems[0];
1532           re_node_set_empty (dfa->edests + clone_node);
1533           /* If the node is root_node itself, it means the epsilon closure
1534              has a loop.  Then tie it to the destination of the root_node.  */
1535           if (org_node == root_node && clone_node != org_node)
1536             {
1537               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1538               if (BE (! ok, 0))
1539                 return REG_ESPACE;
1540               break;
1541             }
1542           /* In case the node has another constraint, append it.  */
1543           constraint |= dfa->nodes[org_node].constraint;
1544           clone_dest = duplicate_node (dfa, org_dest, constraint);
1545           if (BE (clone_dest == -1, 0))
1546             return REG_ESPACE;
1547           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548           if (BE (! ok, 0))
1549             return REG_ESPACE;
1550         }
1551       else /* dfa->edests[org_node].nelem == 2 */
1552         {
1553           /* In case of the node can epsilon-transit, and it has two
1554              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1555           org_dest = dfa->edests[org_node].elems[0];
1556           re_node_set_empty (dfa->edests + clone_node);
1557           /* Search for a duplicated node which satisfies the constraint.  */
1558           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1559           if (clone_dest == -1)
1560             {
1561               /* There is no such duplicated node, create a new one.  */
1562               reg_errcode_t err;
1563               clone_dest = duplicate_node (dfa, org_dest, constraint);
1564               if (BE (clone_dest == -1, 0))
1565                 return REG_ESPACE;
1566               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567               if (BE (! ok, 0))
1568                 return REG_ESPACE;
1569               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1570                                             root_node, constraint);
1571               if (BE (err != REG_NOERROR, 0))
1572                 return err;
1573             }
1574           else
1575             {
1576               /* There is a duplicated node which satisfies the constraint,
1577                  use it to avoid infinite loop.  */
1578               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579               if (BE (! ok, 0))
1580                 return REG_ESPACE;
1581             }
1582
1583           org_dest = dfa->edests[org_node].elems[1];
1584           clone_dest = duplicate_node (dfa, org_dest, constraint);
1585           if (BE (clone_dest == -1, 0))
1586             return REG_ESPACE;
1587           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1588           if (BE (! ok, 0))
1589             return REG_ESPACE;
1590         }
1591       org_node = org_dest;
1592       clone_node = clone_dest;
1593     }
1594   return REG_NOERROR;
1595 }
1596
1597 /* Search for a node which is duplicated from the node ORG_NODE, and
1598    satisfies the constraint CONSTRAINT.  */
1599
1600 static Idx
1601 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1602                         unsigned int constraint)
1603 {
1604   Idx idx;
1605   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1606     {
1607       if (org_node == dfa->org_indices[idx]
1608           && constraint == dfa->nodes[idx].constraint)
1609         return idx; /* Found.  */
1610     }
1611   return -1; /* Not found.  */
1612 }
1613
1614 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615    Return the index of the new node, or -1 if insufficient storage is
1616    available.  */
1617
1618 static Idx
1619 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1620 {
1621   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1622   if (BE (dup_idx != -1, 1))
1623     {
1624       dfa->nodes[dup_idx].constraint = constraint;
1625       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1626       dfa->nodes[dup_idx].duplicated = 1;
1627
1628       /* Store the index of the original node.  */
1629       dfa->org_indices[dup_idx] = org_idx;
1630     }
1631   return dup_idx;
1632 }
1633
1634 static reg_errcode_t
1635 calc_inveclosure (re_dfa_t *dfa)
1636 {
1637   Idx src, idx;
1638   bool ok;
1639   for (idx = 0; idx < dfa->nodes_len; ++idx)
1640     re_node_set_init_empty (dfa->inveclosures + idx);
1641
1642   for (src = 0; src < dfa->nodes_len; ++src)
1643     {
1644       Idx *elems = dfa->eclosures[src].elems;
1645       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1646         {
1647           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1648           if (BE (! ok, 0))
1649             return REG_ESPACE;
1650         }
1651     }
1652
1653   return REG_NOERROR;
1654 }
1655
1656 /* Calculate "eclosure" for all the node in DFA.  */
1657
1658 static reg_errcode_t
1659 calc_eclosure (re_dfa_t *dfa)
1660 {
1661   Idx node_idx;
1662   bool incomplete;
1663 #ifdef DEBUG
1664   assert (dfa->nodes_len > 0);
1665 #endif
1666   incomplete = false;
1667   /* For each nodes, calculate epsilon closure.  */
1668   for (node_idx = 0; ; ++node_idx)
1669     {
1670       reg_errcode_t err;
1671       re_node_set eclosure_elem;
1672       if (node_idx == dfa->nodes_len)
1673         {
1674           if (!incomplete)
1675             break;
1676           incomplete = false;
1677           node_idx = 0;
1678         }
1679
1680 #ifdef DEBUG
1681       assert (dfa->eclosures[node_idx].nelem != -1);
1682 #endif
1683
1684       /* If we have already calculated, skip it.  */
1685       if (dfa->eclosures[node_idx].nelem != 0)
1686         continue;
1687       /* Calculate epsilon closure of 'node_idx'.  */
1688       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1689       if (BE (err != REG_NOERROR, 0))
1690         return err;
1691
1692       if (dfa->eclosures[node_idx].nelem == 0)
1693         {
1694           incomplete = true;
1695           re_node_set_free (&eclosure_elem);
1696         }
1697     }
1698   return REG_NOERROR;
1699 }
1700
1701 /* Calculate epsilon closure of NODE.  */
1702
1703 static reg_errcode_t
1704 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1705 {
1706   reg_errcode_t err;
1707   Idx i;
1708   re_node_set eclosure;
1709   bool ok;
1710   bool incomplete = false;
1711   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1712   if (BE (err != REG_NOERROR, 0))
1713     return err;
1714
1715   /* This indicates that we are calculating this node now.
1716      We reference this value to avoid infinite loop.  */
1717   dfa->eclosures[node].nelem = -1;
1718
1719   /* If the current node has constraints, duplicate all nodes
1720      since they must inherit the constraints.  */
1721   if (dfa->nodes[node].constraint
1722       && dfa->edests[node].nelem
1723       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1724     {
1725       err = duplicate_node_closure (dfa, node, node, node,
1726                                     dfa->nodes[node].constraint);
1727       if (BE (err != REG_NOERROR, 0))
1728         return err;
1729     }
1730
1731   /* Expand each epsilon destination nodes.  */
1732   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1733     for (i = 0; i < dfa->edests[node].nelem; ++i)
1734       {
1735         re_node_set eclosure_elem;
1736         Idx edest = dfa->edests[node].elems[i];
1737         /* If calculating the epsilon closure of 'edest' is in progress,
1738            return intermediate result.  */
1739         if (dfa->eclosures[edest].nelem == -1)
1740           {
1741             incomplete = true;
1742             continue;
1743           }
1744         /* If we haven't calculated the epsilon closure of 'edest' yet,
1745            calculate now. Otherwise use calculated epsilon closure.  */
1746         if (dfa->eclosures[edest].nelem == 0)
1747           {
1748             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1749             if (BE (err != REG_NOERROR, 0))
1750               return err;
1751           }
1752         else
1753           eclosure_elem = dfa->eclosures[edest];
1754         /* Merge the epsilon closure of 'edest'.  */
1755         err = re_node_set_merge (&eclosure, &eclosure_elem);
1756         if (BE (err != REG_NOERROR, 0))
1757           return err;
1758         /* If the epsilon closure of 'edest' is incomplete,
1759            the epsilon closure of this node is also incomplete.  */
1760         if (dfa->eclosures[edest].nelem == 0)
1761           {
1762             incomplete = true;
1763             re_node_set_free (&eclosure_elem);
1764           }
1765       }
1766
1767   /* An epsilon closure includes itself.  */
1768   ok = re_node_set_insert (&eclosure, node);
1769   if (BE (! ok, 0))
1770     return REG_ESPACE;
1771   if (incomplete && !root)
1772     dfa->eclosures[node].nelem = 0;
1773   else
1774     dfa->eclosures[node] = eclosure;
1775   *new_set = eclosure;
1776   return REG_NOERROR;
1777 }
1778 \f
1779 /* Functions for token which are used in the parser.  */
1780
1781 /* Fetch a token from INPUT.
1782    We must not use this function inside bracket expressions.  */
1783
1784 static void
1785 internal_function
1786 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1787 {
1788   re_string_skip_bytes (input, peek_token (result, input, syntax));
1789 }
1790
1791 /* Peek a token from INPUT, and return the length of the token.
1792    We must not use this function inside bracket expressions.  */
1793
1794 static int
1795 internal_function
1796 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1797 {
1798   unsigned char c;
1799
1800   if (re_string_eoi (input))
1801     {
1802       token->type = END_OF_RE;
1803       return 0;
1804     }
1805
1806   c = re_string_peek_byte (input, 0);
1807   token->opr.c = c;
1808
1809   token->word_char = 0;
1810 #ifdef RE_ENABLE_I18N
1811   token->mb_partial = 0;
1812   if (input->mb_cur_max > 1 &&
1813       !re_string_first_byte (input, re_string_cur_idx (input)))
1814     {
1815       token->type = CHARACTER;
1816       token->mb_partial = 1;
1817       return 1;
1818     }
1819 #endif
1820   if (c == '\\')
1821     {
1822       unsigned char c2;
1823       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1824         {
1825           token->type = BACK_SLASH;
1826           return 1;
1827         }
1828
1829       c2 = re_string_peek_byte_case (input, 1);
1830       token->opr.c = c2;
1831       token->type = CHARACTER;
1832 #ifdef RE_ENABLE_I18N
1833       if (input->mb_cur_max > 1)
1834         {
1835           wint_t wc = re_string_wchar_at (input,
1836                                           re_string_cur_idx (input) + 1);
1837           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1838         }
1839       else
1840 #endif
1841         token->word_char = IS_WORD_CHAR (c2) != 0;
1842
1843       switch (c2)
1844         {
1845         case '|':
1846           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1847             token->type = OP_ALT;
1848           break;
1849         case '1': case '2': case '3': case '4': case '5':
1850         case '6': case '7': case '8': case '9':
1851           if (!(syntax & RE_NO_BK_REFS))
1852             {
1853               token->type = OP_BACK_REF;
1854               token->opr.idx = c2 - '1';
1855             }
1856           break;
1857         case '<':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = WORD_FIRST;
1862             }
1863           break;
1864         case '>':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             {
1867               token->type = ANCHOR;
1868               token->opr.ctx_type = WORD_LAST;
1869             }
1870           break;
1871         case 'b':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             {
1874               token->type = ANCHOR;
1875               token->opr.ctx_type = WORD_DELIM;
1876             }
1877           break;
1878         case 'B':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = NOT_WORD_DELIM;
1883             }
1884           break;
1885         case 'w':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             token->type = OP_WORD;
1888           break;
1889         case 'W':
1890           if (!(syntax & RE_NO_GNU_OPS))
1891             token->type = OP_NOTWORD;
1892           break;
1893         case 's':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             token->type = OP_SPACE;
1896           break;
1897         case 'S':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             token->type = OP_NOTSPACE;
1900           break;
1901         case '`':
1902           if (!(syntax & RE_NO_GNU_OPS))
1903             {
1904               token->type = ANCHOR;
1905               token->opr.ctx_type = BUF_FIRST;
1906             }
1907           break;
1908         case '\'':
1909           if (!(syntax & RE_NO_GNU_OPS))
1910             {
1911               token->type = ANCHOR;
1912               token->opr.ctx_type = BUF_LAST;
1913             }
1914           break;
1915         case '(':
1916           if (!(syntax & RE_NO_BK_PARENS))
1917             token->type = OP_OPEN_SUBEXP;
1918           break;
1919         case ')':
1920           if (!(syntax & RE_NO_BK_PARENS))
1921             token->type = OP_CLOSE_SUBEXP;
1922           break;
1923         case '+':
1924           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1925             token->type = OP_DUP_PLUS;
1926           break;
1927         case '?':
1928           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1929             token->type = OP_DUP_QUESTION;
1930           break;
1931         case '{':
1932           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1933             token->type = OP_OPEN_DUP_NUM;
1934           break;
1935         case '}':
1936           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1937             token->type = OP_CLOSE_DUP_NUM;
1938           break;
1939         default:
1940           break;
1941         }
1942       return 2;
1943     }
1944
1945   token->type = CHARACTER;
1946 #ifdef RE_ENABLE_I18N
1947   if (input->mb_cur_max > 1)
1948     {
1949       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1950       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1951     }
1952   else
1953 #endif
1954     token->word_char = IS_WORD_CHAR (token->opr.c);
1955
1956   switch (c)
1957     {
1958     case '\n':
1959       if (syntax & RE_NEWLINE_ALT)
1960         token->type = OP_ALT;
1961       break;
1962     case '|':
1963       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1964         token->type = OP_ALT;
1965       break;
1966     case '*':
1967       token->type = OP_DUP_ASTERISK;
1968       break;
1969     case '+':
1970       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1971         token->type = OP_DUP_PLUS;
1972       break;
1973     case '?':
1974       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1975         token->type = OP_DUP_QUESTION;
1976       break;
1977     case '{':
1978       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1979         token->type = OP_OPEN_DUP_NUM;
1980       break;
1981     case '}':
1982       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1983         token->type = OP_CLOSE_DUP_NUM;
1984       break;
1985     case '(':
1986       if (syntax & RE_NO_BK_PARENS)
1987         token->type = OP_OPEN_SUBEXP;
1988       break;
1989     case ')':
1990       if (syntax & RE_NO_BK_PARENS)
1991         token->type = OP_CLOSE_SUBEXP;
1992       break;
1993     case '[':
1994       token->type = OP_OPEN_BRACKET;
1995       break;
1996     case '.':
1997       token->type = OP_PERIOD;
1998       break;
1999     case '^':
2000       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2001           re_string_cur_idx (input) != 0)
2002         {
2003           char prev = re_string_peek_byte (input, -1);
2004           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2005             break;
2006         }
2007       token->type = ANCHOR;
2008       token->opr.ctx_type = LINE_FIRST;
2009       break;
2010     case '$':
2011       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2012           re_string_cur_idx (input) + 1 != re_string_length (input))
2013         {
2014           re_token_t next;
2015           re_string_skip_bytes (input, 1);
2016           peek_token (&next, input, syntax);
2017           re_string_skip_bytes (input, -1);
2018           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2019             break;
2020         }
2021       token->type = ANCHOR;
2022       token->opr.ctx_type = LINE_LAST;
2023       break;
2024     default:
2025       break;
2026     }
2027   return 1;
2028 }
2029
2030 /* Peek a token from INPUT, and return the length of the token.
2031    We must not use this function out of bracket expressions.  */
2032
2033 static int
2034 internal_function
2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036 {
2037   unsigned char c;
2038   if (re_string_eoi (input))
2039     {
2040       token->type = END_OF_RE;
2041       return 0;
2042     }
2043   c = re_string_peek_byte (input, 0);
2044   token->opr.c = c;
2045
2046 #ifdef RE_ENABLE_I18N
2047   if (input->mb_cur_max > 1 &&
2048       !re_string_first_byte (input, re_string_cur_idx (input)))
2049     {
2050       token->type = CHARACTER;
2051       return 1;
2052     }
2053 #endif /* RE_ENABLE_I18N */
2054
2055   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056       && re_string_cur_idx (input) + 1 < re_string_length (input))
2057     {
2058       /* In this case, '\' escape a character.  */
2059       unsigned char c2;
2060       re_string_skip_bytes (input, 1);
2061       c2 = re_string_peek_byte (input, 0);
2062       token->opr.c = c2;
2063       token->type = CHARACTER;
2064       return 1;
2065     }
2066   if (c == '[') /* '[' is a special char in a bracket exps.  */
2067     {
2068       unsigned char c2;
2069       int token_len;
2070       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071         c2 = re_string_peek_byte (input, 1);
2072       else
2073         c2 = 0;
2074       token->opr.c = c2;
2075       token_len = 2;
2076       switch (c2)
2077         {
2078         case '.':
2079           token->type = OP_OPEN_COLL_ELEM;
2080           break;
2081
2082         case '=':
2083           token->type = OP_OPEN_EQUIV_CLASS;
2084           break;
2085
2086         case ':':
2087           if (syntax & RE_CHAR_CLASSES)
2088             {
2089               token->type = OP_OPEN_CHAR_CLASS;
2090               break;
2091             }
2092           FALLTHROUGH;
2093         default:
2094           token->type = CHARACTER;
2095           token->opr.c = c;
2096           token_len = 1;
2097           break;
2098         }
2099       return token_len;
2100     }
2101   switch (c)
2102     {
2103     case '-':
2104       token->type = OP_CHARSET_RANGE;
2105       break;
2106     case ']':
2107       token->type = OP_CLOSE_BRACKET;
2108       break;
2109     case '^':
2110       token->type = OP_NON_MATCH_LIST;
2111       break;
2112     default:
2113       token->type = CHARACTER;
2114     }
2115   return 1;
2116 }
2117 \f
2118 /* Functions for parser.  */
2119
2120 /* Entry point of the parser.
2121    Parse the regular expression REGEXP and return the structure tree.
2122    If an error occurs, ERR is set by error code, and return NULL.
2123    This function build the following tree, from regular expression <reg_exp>:
2124            CAT
2125            / \
2126           /   \
2127    <reg_exp>  EOR
2128
2129    CAT means concatenation.
2130    EOR means end of regular expression.  */
2131
2132 static bin_tree_t *
2133 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134        reg_errcode_t *err)
2135 {
2136   re_dfa_t *dfa = preg->buffer;
2137   bin_tree_t *tree, *eor, *root;
2138   re_token_t current_token;
2139   dfa->syntax = syntax;
2140   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2142   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143     return NULL;
2144   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2145   if (tree != NULL)
2146     root = create_tree (dfa, tree, eor, CONCAT);
2147   else
2148     root = eor;
2149   if (BE (eor == NULL || root == NULL, 0))
2150     {
2151       *err = REG_ESPACE;
2152       return NULL;
2153     }
2154   return root;
2155 }
2156
2157 /* This function build the following tree, from regular expression
2158    <branch1>|<branch2>:
2159            ALT
2160            / \
2161           /   \
2162    <branch1> <branch2>
2163
2164    ALT means alternative, which represents the operator '|'.  */
2165
2166 static bin_tree_t *
2167 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2168                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2169 {
2170   re_dfa_t *dfa = preg->buffer;
2171   bin_tree_t *tree, *branch = NULL;
2172   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2173   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2174   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175     return NULL;
2176
2177   while (token->type == OP_ALT)
2178     {
2179       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2180       if (token->type != OP_ALT && token->type != END_OF_RE
2181           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2182         {
2183           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184           dfa->completed_bkref_map = initial_bkref_map;
2185           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2187             {
2188               if (tree != NULL)
2189                 postorder (tree, free_tree, NULL);
2190               return NULL;
2191             }
2192           dfa->completed_bkref_map |= accumulated_bkref_map;
2193         }
2194       else
2195         branch = NULL;
2196       tree = create_tree (dfa, tree, branch, OP_ALT);
2197       if (BE (tree == NULL, 0))
2198         {
2199           *err = REG_ESPACE;
2200           return NULL;
2201         }
2202     }
2203   return tree;
2204 }
2205
2206 /* This function build the following tree, from regular expression
2207    <exp1><exp2>:
2208         CAT
2209         / \
2210        /   \
2211    <exp1> <exp2>
2212
2213    CAT means concatenation.  */
2214
2215 static bin_tree_t *
2216 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 {
2219   bin_tree_t *tree, *expr;
2220   re_dfa_t *dfa = preg->buffer;
2221   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2222   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223     return NULL;
2224
2225   while (token->type != OP_ALT && token->type != END_OF_RE
2226          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2227     {
2228       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230         {
2231           if (tree != NULL)
2232             postorder (tree, free_tree, NULL);
2233           return NULL;
2234         }
2235       if (tree != NULL && expr != NULL)
2236         {
2237           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2238           if (newtree == NULL)
2239             {
2240               postorder (expr, free_tree, NULL);
2241               postorder (tree, free_tree, NULL);
2242               *err = REG_ESPACE;
2243               return NULL;
2244             }
2245           tree = newtree;
2246         }
2247       else if (tree == NULL)
2248         tree = expr;
2249       /* Otherwise expr == NULL, we don't need to create new tree.  */
2250     }
2251   return tree;
2252 }
2253
2254 /* This function build the following tree, from regular expression a*:
2255          *
2256          |
2257          a
2258 */
2259
2260 static bin_tree_t *
2261 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2262                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2263 {
2264   re_dfa_t *dfa = preg->buffer;
2265   bin_tree_t *tree;
2266   switch (token->type)
2267     {
2268     case CHARACTER:
2269       tree = create_token_tree (dfa, NULL, NULL, token);
2270       if (BE (tree == NULL, 0))
2271         {
2272           *err = REG_ESPACE;
2273           return NULL;
2274         }
2275 #ifdef RE_ENABLE_I18N
2276       if (dfa->mb_cur_max > 1)
2277         {
2278           while (!re_string_eoi (regexp)
2279                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280             {
2281               bin_tree_t *mbc_remain;
2282               fetch_token (token, regexp, syntax);
2283               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2285               if (BE (mbc_remain == NULL || tree == NULL, 0))
2286                 {
2287                   *err = REG_ESPACE;
2288                   return NULL;
2289                 }
2290             }
2291         }
2292 #endif
2293       break;
2294
2295     case OP_OPEN_SUBEXP:
2296       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2297       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298         return NULL;
2299       break;
2300
2301     case OP_OPEN_BRACKET:
2302       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304         return NULL;
2305       break;
2306
2307     case OP_BACK_REF:
2308       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2309         {
2310           *err = REG_ESUBREG;
2311           return NULL;
2312         }
2313       dfa->used_bkref_map |= 1 << token->opr.idx;
2314       tree = create_token_tree (dfa, NULL, NULL, token);
2315       if (BE (tree == NULL, 0))
2316         {
2317           *err = REG_ESPACE;
2318           return NULL;
2319         }
2320       ++dfa->nbackref;
2321       dfa->has_mb_node = 1;
2322       break;
2323
2324     case OP_OPEN_DUP_NUM:
2325       if (syntax & RE_CONTEXT_INVALID_DUP)
2326         {
2327           *err = REG_BADRPT;
2328           return NULL;
2329         }
2330       FALLTHROUGH;
2331     case OP_DUP_ASTERISK:
2332     case OP_DUP_PLUS:
2333     case OP_DUP_QUESTION:
2334       if (syntax & RE_CONTEXT_INVALID_OPS)
2335         {
2336           *err = REG_BADRPT;
2337           return NULL;
2338         }
2339       else if (syntax & RE_CONTEXT_INDEP_OPS)
2340         {
2341           fetch_token (token, regexp, syntax);
2342           return parse_expression (regexp, preg, token, syntax, nest, err);
2343         }
2344       FALLTHROUGH;
2345     case OP_CLOSE_SUBEXP:
2346       if ((token->type == OP_CLOSE_SUBEXP) &&
2347           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348         {
2349           *err = REG_ERPAREN;
2350           return NULL;
2351         }
2352       FALLTHROUGH;
2353     case OP_CLOSE_DUP_NUM:
2354       /* We treat it as a normal character.  */
2355
2356       /* Then we can these characters as normal characters.  */
2357       token->type = CHARACTER;
2358       /* mb_partial and word_char bits should be initialized already
2359          by peek_token.  */
2360       tree = create_token_tree (dfa, NULL, NULL, token);
2361       if (BE (tree == NULL, 0))
2362         {
2363           *err = REG_ESPACE;
2364           return NULL;
2365         }
2366       break;
2367
2368     case ANCHOR:
2369       if ((token->opr.ctx_type
2370            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371           && dfa->word_ops_used == 0)
2372         init_word_char (dfa);
2373       if (token->opr.ctx_type == WORD_DELIM
2374           || token->opr.ctx_type == NOT_WORD_DELIM)
2375         {
2376           bin_tree_t *tree_first, *tree_last;
2377           if (token->opr.ctx_type == WORD_DELIM)
2378             {
2379               token->opr.ctx_type = WORD_FIRST;
2380               tree_first = create_token_tree (dfa, NULL, NULL, token);
2381               token->opr.ctx_type = WORD_LAST;
2382             }
2383           else
2384             {
2385               token->opr.ctx_type = INSIDE_WORD;
2386               tree_first = create_token_tree (dfa, NULL, NULL, token);
2387               token->opr.ctx_type = INSIDE_NOTWORD;
2388             }
2389           tree_last = create_token_tree (dfa, NULL, NULL, token);
2390           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392             {
2393               *err = REG_ESPACE;
2394               return NULL;
2395             }
2396         }
2397       else
2398         {
2399           tree = create_token_tree (dfa, NULL, NULL, token);
2400           if (BE (tree == NULL, 0))
2401             {
2402               *err = REG_ESPACE;
2403               return NULL;
2404             }
2405         }
2406       /* We must return here, since ANCHORs can't be followed
2407          by repetition operators.
2408          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2410       fetch_token (token, regexp, syntax);
2411       return tree;
2412
2413     case OP_PERIOD:
2414       tree = create_token_tree (dfa, NULL, NULL, token);
2415       if (BE (tree == NULL, 0))
2416         {
2417           *err = REG_ESPACE;
2418           return NULL;
2419         }
2420       if (dfa->mb_cur_max > 1)
2421         dfa->has_mb_node = 1;
2422       break;
2423
2424     case OP_WORD:
2425     case OP_NOTWORD:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427                                  "alnum",
2428                                  "_",
2429                                  token->type == OP_NOTWORD, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431         return NULL;
2432       break;
2433
2434     case OP_SPACE:
2435     case OP_NOTSPACE:
2436       tree = build_charclass_op (dfa, regexp->trans,
2437                                  "space",
2438                                  "",
2439                                  token->type == OP_NOTSPACE, err);
2440       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441         return NULL;
2442       break;
2443
2444     case OP_ALT:
2445     case END_OF_RE:
2446       return NULL;
2447
2448     case BACK_SLASH:
2449       *err = REG_EESCAPE;
2450       return NULL;
2451
2452     default:
2453       /* Must not happen?  */
2454 #ifdef DEBUG
2455       assert (0);
2456 #endif
2457       return NULL;
2458     }
2459   fetch_token (token, regexp, syntax);
2460
2461   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2462          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2463     {
2464       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465                                            syntax, err);
2466       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2467         {
2468           if (tree != NULL)
2469             postorder (tree, free_tree, NULL);
2470           return NULL;
2471         }
2472       tree = dup_tree;
2473       /* In BRE consecutive duplications are not allowed.  */
2474       if ((syntax & RE_CONTEXT_INVALID_DUP)
2475           && (token->type == OP_DUP_ASTERISK
2476               || token->type == OP_OPEN_DUP_NUM))
2477         {
2478           if (tree != NULL)
2479             postorder (tree, free_tree, NULL);
2480           *err = REG_BADRPT;
2481           return NULL;
2482         }
2483     }
2484
2485   return tree;
2486 }
2487
2488 /* This function build the following tree, from regular expression
2489    (<reg_exp>):
2490          SUBEXP
2491             |
2492         <reg_exp>
2493 */
2494
2495 static bin_tree_t *
2496 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2497                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2498 {
2499   re_dfa_t *dfa = preg->buffer;
2500   bin_tree_t *tree;
2501   size_t cur_nsub;
2502   cur_nsub = preg->re_nsub++;
2503
2504   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2505
2506   /* The subexpression may be a null string.  */
2507   if (token->type == OP_CLOSE_SUBEXP)
2508     tree = NULL;
2509   else
2510     {
2511       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2512       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2513         {
2514           if (tree != NULL)
2515             postorder (tree, free_tree, NULL);
2516           *err = REG_EPAREN;
2517         }
2518       if (BE (*err != REG_NOERROR, 0))
2519         return NULL;
2520     }
2521
2522   if (cur_nsub <= '9' - '1')
2523     dfa->completed_bkref_map |= 1 << cur_nsub;
2524
2525   tree = create_tree (dfa, tree, NULL, SUBEXP);
2526   if (BE (tree == NULL, 0))
2527     {
2528       *err = REG_ESPACE;
2529       return NULL;
2530     }
2531   tree->token.opr.idx = cur_nsub;
2532   return tree;
2533 }
2534
2535 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2536
2537 static bin_tree_t *
2538 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2540 {
2541   bin_tree_t *tree = NULL, *old_tree = NULL;
2542   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2543   re_token_t start_token = *token;
2544
2545   if (token->type == OP_OPEN_DUP_NUM)
2546     {
2547       end = 0;
2548       start = fetch_number (regexp, token, syntax);
2549       if (start == -1)
2550         {
2551           if (token->type == CHARACTER && token->opr.c == ',')
2552             start = 0; /* We treat "{,m}" as "{0,m}".  */
2553           else
2554             {
2555               *err = REG_BADBR; /* <re>{} is invalid.  */
2556               return NULL;
2557             }
2558         }
2559       if (BE (start != -2, 1))
2560         {
2561           /* We treat "{n}" as "{n,n}".  */
2562           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563                  : ((token->type == CHARACTER && token->opr.c == ',')
2564                     ? fetch_number (regexp, token, syntax) : -2));
2565         }
2566       if (BE (start == -2 || end == -2, 0))
2567         {
2568           /* Invalid sequence.  */
2569           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2570             {
2571               if (token->type == END_OF_RE)
2572                 *err = REG_EBRACE;
2573               else
2574                 *err = REG_BADBR;
2575
2576               return NULL;
2577             }
2578
2579           /* If the syntax bit is set, rollback.  */
2580           re_string_set_index (regexp, start_idx);
2581           *token = start_token;
2582           token->type = CHARACTER;
2583           /* mb_partial and word_char bits should be already initialized by
2584              peek_token.  */
2585           return elem;
2586         }
2587
2588       if (BE ((end != -1 && start > end)
2589               || token->type != OP_CLOSE_DUP_NUM, 0))
2590         {
2591           /* First number greater than second.  */
2592           *err = REG_BADBR;
2593           return NULL;
2594         }
2595
2596       if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2597         {
2598           *err = REG_ESIZE;
2599           return NULL;
2600         }
2601     }
2602   else
2603     {
2604       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606     }
2607
2608   fetch_token (token, regexp, syntax);
2609
2610   if (BE (elem == NULL, 0))
2611     return NULL;
2612   if (BE (start == 0 && end == 0, 0))
2613     {
2614       postorder (elem, free_tree, NULL);
2615       return NULL;
2616     }
2617
2618   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2619   if (BE (start > 0, 0))
2620     {
2621       tree = elem;
2622       for (i = 2; i <= start; ++i)
2623         {
2624           elem = duplicate_tree (elem, dfa);
2625           tree = create_tree (dfa, tree, elem, CONCAT);
2626           if (BE (elem == NULL || tree == NULL, 0))
2627             goto parse_dup_op_espace;
2628         }
2629
2630       if (start == end)
2631         return tree;
2632
2633       /* Duplicate ELEM before it is marked optional.  */
2634       elem = duplicate_tree (elem, dfa);
2635       if (BE (elem == NULL, 0))
2636         goto parse_dup_op_espace;
2637       old_tree = tree;
2638     }
2639   else
2640     old_tree = NULL;
2641
2642   if (elem->token.type == SUBEXP)
2643     {
2644       uintptr_t subidx = elem->token.opr.idx;
2645       postorder (elem, mark_opt_subexp, (void *) subidx);
2646     }
2647
2648   tree = create_tree (dfa, elem, NULL,
2649                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2650   if (BE (tree == NULL, 0))
2651     goto parse_dup_op_espace;
2652
2653 /* From gnulib's "intprops.h":
2654    True if the arithmetic type T is signed.  */
2655 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2656
2657   /* This loop is actually executed only when end != -1,
2658      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2659      already created the start+1-th copy.  */
2660   if (TYPE_SIGNED (Idx) || end != -1)
2661     for (i = start + 2; i <= end; ++i)
2662       {
2663         elem = duplicate_tree (elem, dfa);
2664         tree = create_tree (dfa, tree, elem, CONCAT);
2665         if (BE (elem == NULL || tree == NULL, 0))
2666           goto parse_dup_op_espace;
2667
2668         tree = create_tree (dfa, tree, NULL, OP_ALT);
2669         if (BE (tree == NULL, 0))
2670           goto parse_dup_op_espace;
2671       }
2672
2673   if (old_tree)
2674     tree = create_tree (dfa, old_tree, tree, CONCAT);
2675
2676   return tree;
2677
2678  parse_dup_op_espace:
2679   *err = REG_ESPACE;
2680   return NULL;
2681 }
2682
2683 /* Size of the names for collating symbol/equivalence_class/character_class.
2684    I'm not sure, but maybe enough.  */
2685 #define BRACKET_NAME_BUF_SIZE 32
2686
2687 #ifndef _LIBC
2688
2689 # ifdef RE_ENABLE_I18N
2690 /* Convert the byte B to the corresponding wide character.  In a
2691    unibyte locale, treat B as itself if it is an encoding error.
2692    In a multibyte locale, return WEOF if B is an encoding error.  */
2693 static wint_t
2694 parse_byte (unsigned char b, re_charset_t *mbcset)
2695 {
2696   wint_t wc = __btowc (b);
2697   return wc == WEOF && !mbcset ? b : wc;
2698 }
2699 #endif
2700
2701   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2702      Build the range expression which starts from START_ELEM, and ends
2703      at END_ELEM.  The result are written to MBCSET and SBCSET.
2704      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2705      mbcset->range_ends, is a pointer argument since we may
2706      update it.  */
2707
2708 static reg_errcode_t
2709 internal_function
2710 # ifdef RE_ENABLE_I18N
2711 build_range_exp (const reg_syntax_t syntax,
2712                  bitset_t sbcset,
2713                  re_charset_t *mbcset,
2714                  Idx *range_alloc,
2715                  const bracket_elem_t *start_elem,
2716                  const bracket_elem_t *end_elem)
2717 # else /* not RE_ENABLE_I18N */
2718 build_range_exp (const reg_syntax_t syntax,
2719                  bitset_t sbcset,
2720                  const bracket_elem_t *start_elem,
2721                  const bracket_elem_t *end_elem)
2722 # endif /* not RE_ENABLE_I18N */
2723 {
2724   unsigned int start_ch, end_ch;
2725   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2726   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2727           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2728           0))
2729     return REG_ERANGE;
2730
2731   /* We can handle no multi character collating elements without libc
2732      support.  */
2733   if (BE ((start_elem->type == COLL_SYM
2734            && strlen ((char *) start_elem->opr.name) > 1)
2735           || (end_elem->type == COLL_SYM
2736               && strlen ((char *) end_elem->opr.name) > 1), 0))
2737     return REG_ECOLLATE;
2738
2739 # ifdef RE_ENABLE_I18N
2740   {
2741     wchar_t wc;
2742     wint_t start_wc;
2743     wint_t end_wc;
2744
2745     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2746                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2747                    : 0));
2748     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2749               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2750                  : 0));
2751     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2752                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2753     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2754               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2755     if (start_wc == WEOF || end_wc == WEOF)
2756       return REG_ECOLLATE;
2757     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2758       return REG_ERANGE;
2759
2760     /* Got valid collation sequence values, add them as a new entry.
2761        However, for !_LIBC we have no collation elements: if the
2762        character set is single byte, the single byte character set
2763        that we build below suffices.  parse_bracket_exp passes
2764        no MBCSET if dfa->mb_cur_max == 1.  */
2765     if (mbcset)
2766       {
2767         /* Check the space of the arrays.  */
2768         if (BE (*range_alloc == mbcset->nranges, 0))
2769           {
2770             /* There is not enough space, need realloc.  */
2771             wchar_t *new_array_start, *new_array_end;
2772             Idx new_nranges;
2773
2774             /* +1 in case of mbcset->nranges is 0.  */
2775             new_nranges = 2 * mbcset->nranges + 1;
2776             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2777                are NULL if *range_alloc == 0.  */
2778             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2779                                           new_nranges);
2780             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2781                                         new_nranges);
2782
2783             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2784               {
2785                 re_free (new_array_start);
2786                 re_free (new_array_end);
2787                 return REG_ESPACE;
2788               }
2789
2790             mbcset->range_starts = new_array_start;
2791             mbcset->range_ends = new_array_end;
2792             *range_alloc = new_nranges;
2793           }
2794
2795         mbcset->range_starts[mbcset->nranges] = start_wc;
2796         mbcset->range_ends[mbcset->nranges++] = end_wc;
2797       }
2798
2799     /* Build the table for single byte characters.  */
2800     for (wc = 0; wc < SBC_MAX; ++wc)
2801       {
2802         if (start_wc <= wc && wc <= end_wc)
2803           bitset_set (sbcset, wc);
2804       }
2805   }
2806 # else /* not RE_ENABLE_I18N */
2807   {
2808     unsigned int ch;
2809     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2810                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2811                    : 0));
2812     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2813               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2814                  : 0));
2815     if (start_ch > end_ch)
2816       return REG_ERANGE;
2817     /* Build the table for single byte characters.  */
2818     for (ch = 0; ch < SBC_MAX; ++ch)
2819       if (start_ch <= ch  && ch <= end_ch)
2820         bitset_set (sbcset, ch);
2821   }
2822 # endif /* not RE_ENABLE_I18N */
2823   return REG_NOERROR;
2824 }
2825 #endif /* not _LIBC */
2826
2827 #ifndef _LIBC
2828 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2829    Build the collating element which is represented by NAME.
2830    The result are written to MBCSET and SBCSET.
2831    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2832    pointer argument since we may update it.  */
2833
2834 static reg_errcode_t
2835 internal_function
2836 # ifdef RE_ENABLE_I18N
2837 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2838                         Idx *coll_sym_alloc, const unsigned char *name)
2839 # else /* not RE_ENABLE_I18N */
2840 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2841 # endif /* not RE_ENABLE_I18N */
2842 {
2843   size_t name_len = strlen ((const char *) name);
2844   if (BE (name_len != 1, 0))
2845     return REG_ECOLLATE;
2846   else
2847     {
2848       bitset_set (sbcset, name[0]);
2849       return REG_NOERROR;
2850     }
2851 }
2852 #endif /* not _LIBC */
2853
2854 /* This function parse bracket expression like "[abc]", "[a-c]",
2855    "[[.a-a.]]" etc.  */
2856
2857 static bin_tree_t *
2858 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2859                    reg_syntax_t syntax, reg_errcode_t *err)
2860 {
2861 #ifdef _LIBC
2862   const unsigned char *collseqmb;
2863   const char *collseqwc;
2864   uint32_t nrules;
2865   int32_t table_size;
2866   const int32_t *symb_table;
2867   const unsigned char *extra;
2868
2869   /* Local function for parse_bracket_exp used in _LIBC environment.
2870      Seek the collating symbol entry corresponding to NAME.
2871      Return the index of the symbol in the SYMB_TABLE,
2872      or -1 if not found.  */
2873
2874   auto inline int32_t
2875   __attribute__ ((always_inline))
2876   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2877     {
2878       int32_t elem;
2879
2880       for (elem = 0; elem < table_size; elem++)
2881         if (symb_table[2 * elem] != 0)
2882           {
2883             int32_t idx = symb_table[2 * elem + 1];
2884             /* Skip the name of collating element name.  */
2885             idx += 1 + extra[idx];
2886             if (/* Compare the length of the name.  */
2887                 name_len == extra[idx]
2888                 /* Compare the name.  */
2889                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2890               /* Yep, this is the entry.  */
2891               return elem;
2892           }
2893       return -1;
2894     }
2895
2896   /* Local function for parse_bracket_exp used in _LIBC environment.
2897      Look up the collation sequence value of BR_ELEM.
2898      Return the value if succeeded, UINT_MAX otherwise.  */
2899
2900   auto inline unsigned int
2901   __attribute__ ((always_inline))
2902   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2903     {
2904       if (br_elem->type == SB_CHAR)
2905         {
2906           /*
2907           if (MB_CUR_MAX == 1)
2908           */
2909           if (nrules == 0)
2910             return collseqmb[br_elem->opr.ch];
2911           else
2912             {
2913               wint_t wc = __btowc (br_elem->opr.ch);
2914               return __collseq_table_lookup (collseqwc, wc);
2915             }
2916         }
2917       else if (br_elem->type == MB_CHAR)
2918         {
2919           if (nrules != 0)
2920             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2921         }
2922       else if (br_elem->type == COLL_SYM)
2923         {
2924           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2925           if (nrules != 0)
2926             {
2927               int32_t elem, idx;
2928               elem = seek_collating_symbol_entry (br_elem->opr.name,
2929                                                   sym_name_len);
2930               if (elem != -1)
2931                 {
2932                   /* We found the entry.  */
2933                   idx = symb_table[2 * elem + 1];
2934                   /* Skip the name of collating element name.  */
2935                   idx += 1 + extra[idx];
2936                   /* Skip the byte sequence of the collating element.  */
2937                   idx += 1 + extra[idx];
2938                   /* Adjust for the alignment.  */
2939                   idx = (idx + 3) & ~3;
2940                   /* Skip the multibyte collation sequence value.  */
2941                   idx += sizeof (unsigned int);
2942                   /* Skip the wide char sequence of the collating element.  */
2943                   idx += sizeof (unsigned int) *
2944                     (1 + *(unsigned int *) (extra + idx));
2945                   /* Return the collation sequence value.  */
2946                   return *(unsigned int *) (extra + idx);
2947                 }
2948               else if (sym_name_len == 1)
2949                 {
2950                   /* No valid character.  Match it as a single byte
2951                      character.  */
2952                   return collseqmb[br_elem->opr.name[0]];
2953                 }
2954             }
2955           else if (sym_name_len == 1)
2956             return collseqmb[br_elem->opr.name[0]];
2957         }
2958       return UINT_MAX;
2959     }
2960
2961   /* Local function for parse_bracket_exp used in _LIBC environment.
2962      Build the range expression which starts from START_ELEM, and ends
2963      at END_ELEM.  The result are written to MBCSET and SBCSET.
2964      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2965      mbcset->range_ends, is a pointer argument since we may
2966      update it.  */
2967
2968   auto inline reg_errcode_t
2969   __attribute__ ((always_inline))
2970   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2971                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2972     {
2973       unsigned int ch;
2974       uint32_t start_collseq;
2975       uint32_t end_collseq;
2976
2977       /* Equivalence Classes and Character Classes can't be a range
2978          start/end.  */
2979       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2980               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2981               0))
2982         return REG_ERANGE;
2983
2984       /* FIXME: Implement rational ranges here, too.  */
2985       start_collseq = lookup_collation_sequence_value (start_elem);
2986       end_collseq = lookup_collation_sequence_value (end_elem);
2987       /* Check start/end collation sequence values.  */
2988       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2989         return REG_ECOLLATE;
2990       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2991         return REG_ERANGE;
2992
2993       /* Got valid collation sequence values, add them as a new entry.
2994          However, if we have no collation elements, and the character set
2995          is single byte, the single byte character set that we
2996          build below suffices. */
2997       if (nrules > 0 || dfa->mb_cur_max > 1)
2998         {
2999           /* Check the space of the arrays.  */
3000           if (BE (*range_alloc == mbcset->nranges, 0))
3001             {
3002               /* There is not enough space, need realloc.  */
3003               uint32_t *new_array_start;
3004               uint32_t *new_array_end;
3005               Idx new_nranges;
3006
3007               /* +1 in case of mbcset->nranges is 0.  */
3008               new_nranges = 2 * mbcset->nranges + 1;
3009               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3010                                             new_nranges);
3011               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3012                                           new_nranges);
3013
3014               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3015                 return REG_ESPACE;
3016
3017               mbcset->range_starts = new_array_start;
3018               mbcset->range_ends = new_array_end;
3019               *range_alloc = new_nranges;
3020             }
3021
3022           mbcset->range_starts[mbcset->nranges] = start_collseq;
3023           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3024         }
3025
3026       /* Build the table for single byte characters.  */
3027       for (ch = 0; ch < SBC_MAX; ch++)
3028         {
3029           uint32_t ch_collseq;
3030           /*
3031           if (MB_CUR_MAX == 1)
3032           */
3033           if (nrules == 0)
3034             ch_collseq = collseqmb[ch];
3035           else
3036             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3037           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3038             bitset_set (sbcset, ch);
3039         }
3040       return REG_NOERROR;
3041     }
3042
3043   /* Local function for parse_bracket_exp used in _LIBC environment.
3044      Build the collating element which is represented by NAME.
3045      The result are written to MBCSET and SBCSET.
3046      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3047      pointer argument since we may update it.  */
3048
3049   auto inline reg_errcode_t
3050   __attribute__ ((always_inline))
3051   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3052                           Idx *coll_sym_alloc, const unsigned char *name)
3053     {
3054       int32_t elem, idx;
3055       size_t name_len = strlen ((const char *) name);
3056       if (nrules != 0)
3057         {
3058           elem = seek_collating_symbol_entry (name, name_len);
3059           if (elem != -1)
3060             {
3061               /* We found the entry.  */
3062               idx = symb_table[2 * elem + 1];
3063               /* Skip the name of collating element name.  */
3064               idx += 1 + extra[idx];
3065             }
3066           else if (name_len == 1)
3067             {
3068               /* No valid character, treat it as a normal
3069                  character.  */
3070               bitset_set (sbcset, name[0]);
3071               return REG_NOERROR;
3072             }
3073           else
3074             return REG_ECOLLATE;
3075
3076           /* Got valid collation sequence, add it as a new entry.  */
3077           /* Check the space of the arrays.  */
3078           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3079             {
3080               /* Not enough, realloc it.  */
3081               /* +1 in case of mbcset->ncoll_syms is 0.  */
3082               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3083               /* Use realloc since mbcset->coll_syms is NULL
3084                  if *alloc == 0.  */
3085               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3086                                                    new_coll_sym_alloc);
3087               if (BE (new_coll_syms == NULL, 0))
3088                 return REG_ESPACE;
3089               mbcset->coll_syms = new_coll_syms;
3090               *coll_sym_alloc = new_coll_sym_alloc;
3091             }
3092           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3093           return REG_NOERROR;
3094         }
3095       else
3096         {
3097           if (BE (name_len != 1, 0))
3098             return REG_ECOLLATE;
3099           else
3100             {
3101               bitset_set (sbcset, name[0]);
3102               return REG_NOERROR;
3103             }
3104         }
3105     }
3106 #endif
3107
3108   re_token_t br_token;
3109   re_bitset_ptr_t sbcset;
3110 #ifdef RE_ENABLE_I18N
3111   re_charset_t *mbcset;
3112   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3113   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3114 #endif /* not RE_ENABLE_I18N */
3115   bool non_match = false;
3116   bin_tree_t *work_tree;
3117   int token_len;
3118   bool first_round = true;
3119 #ifdef _LIBC
3120   collseqmb = (const unsigned char *)
3121     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3122   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123   if (nrules)
3124     {
3125       /*
3126       if (MB_CUR_MAX > 1)
3127       */
3128       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3129       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3130       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3131                                                   _NL_COLLATE_SYMB_TABLEMB);
3132       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3133                                                    _NL_COLLATE_SYMB_EXTRAMB);
3134     }
3135 #endif
3136   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3137 #ifdef RE_ENABLE_I18N
3138   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3139 #endif /* RE_ENABLE_I18N */
3140 #ifdef RE_ENABLE_I18N
3141   if (BE (sbcset == NULL || mbcset == NULL, 0))
3142 #else
3143   if (BE (sbcset == NULL, 0))
3144 #endif /* RE_ENABLE_I18N */
3145     {
3146       re_free (sbcset);
3147 #ifdef RE_ENABLE_I18N
3148       re_free (mbcset);
3149 #endif
3150       *err = REG_ESPACE;
3151       return NULL;
3152     }
3153
3154   token_len = peek_token_bracket (token, regexp, syntax);
3155   if (BE (token->type == END_OF_RE, 0))
3156     {
3157       *err = REG_BADPAT;
3158       goto parse_bracket_exp_free_return;
3159     }
3160   if (token->type == OP_NON_MATCH_LIST)
3161     {
3162 #ifdef RE_ENABLE_I18N
3163       mbcset->non_match = 1;
3164 #endif /* not RE_ENABLE_I18N */
3165       non_match = true;
3166       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3167         bitset_set (sbcset, '\n');
3168       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3169       token_len = peek_token_bracket (token, regexp, syntax);
3170       if (BE (token->type == END_OF_RE, 0))
3171         {
3172           *err = REG_BADPAT;
3173           goto parse_bracket_exp_free_return;
3174         }
3175     }
3176
3177   /* We treat the first ']' as a normal character.  */
3178   if (token->type == OP_CLOSE_BRACKET)
3179     token->type = CHARACTER;
3180
3181   while (1)
3182     {
3183       bracket_elem_t start_elem, end_elem;
3184       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3185       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3186       reg_errcode_t ret;
3187       int token_len2 = 0;
3188       bool is_range_exp = false;
3189       re_token_t token2;
3190
3191       start_elem.opr.name = start_name_buf;
3192       start_elem.type = COLL_SYM;
3193       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3194                                    syntax, first_round);
3195       if (BE (ret != REG_NOERROR, 0))
3196         {
3197           *err = ret;
3198           goto parse_bracket_exp_free_return;
3199         }
3200       first_round = false;
3201
3202       /* Get information about the next token.  We need it in any case.  */
3203       token_len = peek_token_bracket (token, regexp, syntax);
3204
3205       /* Do not check for ranges if we know they are not allowed.  */
3206       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3207         {
3208           if (BE (token->type == END_OF_RE, 0))
3209             {
3210               *err = REG_EBRACK;
3211               goto parse_bracket_exp_free_return;
3212             }
3213           if (token->type == OP_CHARSET_RANGE)
3214             {
3215               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3216               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3217               if (BE (token2.type == END_OF_RE, 0))
3218                 {
3219                   *err = REG_EBRACK;
3220                   goto parse_bracket_exp_free_return;
3221                 }
3222               if (token2.type == OP_CLOSE_BRACKET)
3223                 {
3224                   /* We treat the last '-' as a normal character.  */
3225                   re_string_skip_bytes (regexp, -token_len);
3226                   token->type = CHARACTER;
3227                 }
3228               else
3229                 is_range_exp = true;
3230             }
3231         }
3232
3233       if (is_range_exp == true)
3234         {
3235           end_elem.opr.name = end_name_buf;
3236           end_elem.type = COLL_SYM;
3237           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3238                                        dfa, syntax, true);
3239           if (BE (ret != REG_NOERROR, 0))
3240             {
3241               *err = ret;
3242               goto parse_bracket_exp_free_return;
3243             }
3244
3245           token_len = peek_token_bracket (token, regexp, syntax);
3246
3247 #ifdef _LIBC
3248           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3249                                   &start_elem, &end_elem);
3250 #else
3251 # ifdef RE_ENABLE_I18N
3252           *err = build_range_exp (syntax, sbcset,
3253                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3254                                   &range_alloc, &start_elem, &end_elem);
3255 # else
3256           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3257 # endif
3258 #endif /* RE_ENABLE_I18N */
3259           if (BE (*err != REG_NOERROR, 0))
3260             goto parse_bracket_exp_free_return;
3261         }
3262       else
3263         {
3264           switch (start_elem.type)
3265             {
3266             case SB_CHAR:
3267               bitset_set (sbcset, start_elem.opr.ch);
3268               break;
3269 #ifdef RE_ENABLE_I18N
3270             case MB_CHAR:
3271               /* Check whether the array has enough space.  */
3272               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3273                 {
3274                   wchar_t *new_mbchars;
3275                   /* Not enough, realloc it.  */
3276                   /* +1 in case of mbcset->nmbchars is 0.  */
3277                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3278                   /* Use realloc since array is NULL if *alloc == 0.  */
3279                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3280                                             mbchar_alloc);
3281                   if (BE (new_mbchars == NULL, 0))
3282                     goto parse_bracket_exp_espace;
3283                   mbcset->mbchars = new_mbchars;
3284                 }
3285               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3286               break;
3287 #endif /* RE_ENABLE_I18N */
3288             case EQUIV_CLASS:
3289               *err = build_equiv_class (sbcset,
3290 #ifdef RE_ENABLE_I18N
3291                                         mbcset, &equiv_class_alloc,
3292 #endif /* RE_ENABLE_I18N */
3293                                         start_elem.opr.name);
3294               if (BE (*err != REG_NOERROR, 0))
3295                 goto parse_bracket_exp_free_return;
3296               break;
3297             case COLL_SYM:
3298               *err = build_collating_symbol (sbcset,
3299 #ifdef RE_ENABLE_I18N
3300                                              mbcset, &coll_sym_alloc,
3301 #endif /* RE_ENABLE_I18N */
3302                                              start_elem.opr.name);
3303               if (BE (*err != REG_NOERROR, 0))
3304                 goto parse_bracket_exp_free_return;
3305               break;
3306             case CHAR_CLASS:
3307               *err = build_charclass (regexp->trans, sbcset,
3308 #ifdef RE_ENABLE_I18N
3309                                       mbcset, &char_class_alloc,
3310 #endif /* RE_ENABLE_I18N */
3311                                       (const char *) start_elem.opr.name,
3312                                       syntax);
3313               if (BE (*err != REG_NOERROR, 0))
3314                goto parse_bracket_exp_free_return;
3315               break;
3316             default:
3317               assert (0);
3318               break;
3319             }
3320         }
3321       if (BE (token->type == END_OF_RE, 0))
3322         {
3323           *err = REG_EBRACK;
3324           goto parse_bracket_exp_free_return;
3325         }
3326       if (token->type == OP_CLOSE_BRACKET)
3327         break;
3328     }
3329
3330   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3331
3332   /* If it is non-matching list.  */
3333   if (non_match)
3334     bitset_not (sbcset);
3335
3336 #ifdef RE_ENABLE_I18N
3337   /* Ensure only single byte characters are set.  */
3338   if (dfa->mb_cur_max > 1)
3339     bitset_mask (sbcset, dfa->sb_char);
3340
3341   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3342       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3343                                                      || mbcset->non_match)))
3344     {
3345       bin_tree_t *mbc_tree;
3346       int sbc_idx;
3347       /* Build a tree for complex bracket.  */
3348       dfa->has_mb_node = 1;
3349       br_token.type = COMPLEX_BRACKET;
3350       br_token.opr.mbcset = mbcset;
3351       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352       if (BE (mbc_tree == NULL, 0))
3353         goto parse_bracket_exp_espace;
3354       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3355         if (sbcset[sbc_idx])
3356           break;
3357       /* If there are no bits set in sbcset, there is no point
3358          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3359       if (sbc_idx < BITSET_WORDS)
3360         {
3361           /* Build a tree for simple bracket.  */
3362           br_token.type = SIMPLE_BRACKET;
3363           br_token.opr.sbcset = sbcset;
3364           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3365           if (BE (work_tree == NULL, 0))
3366             goto parse_bracket_exp_espace;
3367
3368           /* Then join them by ALT node.  */
3369           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3370           if (BE (work_tree == NULL, 0))
3371             goto parse_bracket_exp_espace;
3372         }
3373       else
3374         {
3375           re_free (sbcset);
3376           work_tree = mbc_tree;
3377         }
3378     }
3379   else
3380 #endif /* not RE_ENABLE_I18N */
3381     {
3382 #ifdef RE_ENABLE_I18N
3383       free_charset (mbcset);
3384 #endif
3385       /* Build a tree for simple bracket.  */
3386       br_token.type = SIMPLE_BRACKET;
3387       br_token.opr.sbcset = sbcset;
3388       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3389       if (BE (work_tree == NULL, 0))
3390         goto parse_bracket_exp_espace;
3391     }
3392   return work_tree;
3393
3394  parse_bracket_exp_espace:
3395   *err = REG_ESPACE;
3396  parse_bracket_exp_free_return:
3397   re_free (sbcset);
3398 #ifdef RE_ENABLE_I18N
3399   free_charset (mbcset);
3400 #endif /* RE_ENABLE_I18N */
3401   return NULL;
3402 }
3403
3404 /* Parse an element in the bracket expression.  */
3405
3406 static reg_errcode_t
3407 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3408                        re_token_t *token, int token_len, re_dfa_t *dfa,
3409                        reg_syntax_t syntax, bool accept_hyphen)
3410 {
3411 #ifdef RE_ENABLE_I18N
3412   int cur_char_size;
3413   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3414   if (cur_char_size > 1)
3415     {
3416       elem->type = MB_CHAR;
3417       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3418       re_string_skip_bytes (regexp, cur_char_size);
3419       return REG_NOERROR;
3420     }
3421 #endif /* RE_ENABLE_I18N */
3422   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3423   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3424       || token->type == OP_OPEN_EQUIV_CLASS)
3425     return parse_bracket_symbol (elem, regexp, token);
3426   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3427     {
3428       /* A '-' must only appear as anything but a range indicator before
3429          the closing bracket.  Everything else is an error.  */
3430       re_token_t token2;
3431       (void) peek_token_bracket (&token2, regexp, syntax);
3432       if (token2.type != OP_CLOSE_BRACKET)
3433         /* The actual error value is not standardized since this whole
3434            case is undefined.  But ERANGE makes good sense.  */
3435         return REG_ERANGE;
3436     }
3437   elem->type = SB_CHAR;
3438   elem->opr.ch = token->opr.c;
3439   return REG_NOERROR;
3440 }
3441
3442 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3443    such as [:<character_class>:], [.<collating_element>.], and
3444    [=<equivalent_class>=].  */
3445
3446 static reg_errcode_t
3447 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3448                       re_token_t *token)
3449 {
3450   unsigned char ch, delim = token->opr.c;
3451   int i = 0;
3452   if (re_string_eoi(regexp))
3453     return REG_EBRACK;
3454   for (;; ++i)
3455     {
3456       if (i >= BRACKET_NAME_BUF_SIZE)
3457         return REG_EBRACK;
3458       if (token->type == OP_OPEN_CHAR_CLASS)
3459         ch = re_string_fetch_byte_case (regexp);
3460       else
3461         ch = re_string_fetch_byte (regexp);
3462       if (re_string_eoi(regexp))
3463         return REG_EBRACK;
3464       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3465         break;
3466       elem->opr.name[i] = ch;
3467     }
3468   re_string_skip_bytes (regexp, 1);
3469   elem->opr.name[i] = '\0';
3470   switch (token->type)
3471     {
3472     case OP_OPEN_COLL_ELEM:
3473       elem->type = COLL_SYM;
3474       break;
3475     case OP_OPEN_EQUIV_CLASS:
3476       elem->type = EQUIV_CLASS;
3477       break;
3478     case OP_OPEN_CHAR_CLASS:
3479       elem->type = CHAR_CLASS;
3480       break;
3481     default:
3482       break;
3483     }
3484   return REG_NOERROR;
3485 }
3486
3487   /* Helper function for parse_bracket_exp.
3488      Build the equivalence class which is represented by NAME.
3489      The result are written to MBCSET and SBCSET.
3490      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3491      is a pointer argument since we may update it.  */
3492
3493 static reg_errcode_t
3494 #ifdef RE_ENABLE_I18N
3495 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3496                    Idx *equiv_class_alloc, const unsigned char *name)
3497 #else /* not RE_ENABLE_I18N */
3498 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3499 #endif /* not RE_ENABLE_I18N */
3500 {
3501 #ifdef _LIBC
3502   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3503   if (nrules != 0)
3504     {
3505       const int32_t *table, *indirect;
3506       const unsigned char *weights, *extra, *cp;
3507       unsigned char char_buf[2];
3508       int32_t idx1, idx2;
3509       unsigned int ch;
3510       size_t len;
3511       /* Calculate the index for equivalence class.  */
3512       cp = name;
3513       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3514       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3515                                                _NL_COLLATE_WEIGHTMB);
3516       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3517                                                    _NL_COLLATE_EXTRAMB);
3518       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3519                                                 _NL_COLLATE_INDIRECTMB);
3520       idx1 = findidx (table, indirect, extra, &cp, -1);
3521       if (BE (idx1 == 0 || *cp != '\0', 0))
3522         /* This isn't a valid character.  */
3523         return REG_ECOLLATE;
3524
3525       /* Build single byte matching table for this equivalence class.  */
3526       len = weights[idx1 & 0xffffff];
3527       for (ch = 0; ch < SBC_MAX; ++ch)
3528         {
3529           char_buf[0] = ch;
3530           cp = char_buf;
3531           idx2 = findidx (table, indirect, extra, &cp, 1);
3532 /*
3533           idx2 = table[ch];
3534 */
3535           if (idx2 == 0)
3536             /* This isn't a valid character.  */
3537             continue;
3538           /* Compare only if the length matches and the collation rule
3539              index is the same.  */
3540           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3541             {
3542               int cnt = 0;
3543
3544               while (cnt <= len &&
3545                      weights[(idx1 & 0xffffff) + 1 + cnt]
3546                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3547                 ++cnt;
3548
3549               if (cnt > len)
3550                 bitset_set (sbcset, ch);
3551             }
3552         }
3553       /* Check whether the array has enough space.  */
3554       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3555         {
3556           /* Not enough, realloc it.  */
3557           /* +1 in case of mbcset->nequiv_classes is 0.  */
3558           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3559           /* Use realloc since the array is NULL if *alloc == 0.  */
3560           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3561                                                    int32_t,
3562                                                    new_equiv_class_alloc);
3563           if (BE (new_equiv_classes == NULL, 0))
3564             return REG_ESPACE;
3565           mbcset->equiv_classes = new_equiv_classes;
3566           *equiv_class_alloc = new_equiv_class_alloc;
3567         }
3568       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3569     }
3570   else
3571 #endif /* _LIBC */
3572     {
3573       if (BE (strlen ((const char *) name) != 1, 0))
3574         return REG_ECOLLATE;
3575       bitset_set (sbcset, *name);
3576     }
3577   return REG_NOERROR;
3578 }
3579
3580   /* Helper function for parse_bracket_exp.
3581      Build the character class which is represented by NAME.
3582      The result are written to MBCSET and SBCSET.
3583      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3584      is a pointer argument since we may update it.  */
3585
3586 static reg_errcode_t
3587 #ifdef RE_ENABLE_I18N
3588 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3589                  re_charset_t *mbcset, Idx *char_class_alloc,
3590                  const char *class_name, reg_syntax_t syntax)
3591 #else /* not RE_ENABLE_I18N */
3592 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3593                  const char *class_name, reg_syntax_t syntax)
3594 #endif /* not RE_ENABLE_I18N */
3595 {
3596   int i;
3597   const char *name = class_name;
3598
3599   /* In case of REG_ICASE "upper" and "lower" match the both of
3600      upper and lower cases.  */
3601   if ((syntax & RE_ICASE)
3602       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3603     name = "alpha";
3604
3605 #ifdef RE_ENABLE_I18N
3606   /* Check the space of the arrays.  */
3607   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3608     {
3609       /* Not enough, realloc it.  */
3610       /* +1 in case of mbcset->nchar_classes is 0.  */
3611       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3612       /* Use realloc since array is NULL if *alloc == 0.  */
3613       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3614                                                new_char_class_alloc);
3615       if (BE (new_char_classes == NULL, 0))
3616         return REG_ESPACE;
3617       mbcset->char_classes = new_char_classes;
3618       *char_class_alloc = new_char_class_alloc;
3619     }
3620   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3621 #endif /* RE_ENABLE_I18N */
3622
3623 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3624   do {                                          \
3625     if (BE (trans != NULL, 0))                  \
3626       {                                         \
3627         for (i = 0; i < SBC_MAX; ++i)           \
3628           if (ctype_func (i))                   \
3629             bitset_set (sbcset, trans[i]);      \
3630       }                                         \
3631     else                                        \
3632       {                                         \
3633         for (i = 0; i < SBC_MAX; ++i)           \
3634           if (ctype_func (i))                   \
3635             bitset_set (sbcset, i);             \
3636       }                                         \
3637   } while (0)
3638
3639   if (strcmp (name, "alnum") == 0)
3640     BUILD_CHARCLASS_LOOP (isalnum);
3641   else if (strcmp (name, "cntrl") == 0)
3642     BUILD_CHARCLASS_LOOP (iscntrl);
3643   else if (strcmp (name, "lower") == 0)
3644     BUILD_CHARCLASS_LOOP (islower);
3645   else if (strcmp (name, "space") == 0)
3646     BUILD_CHARCLASS_LOOP (isspace);
3647   else if (strcmp (name, "alpha") == 0)
3648     BUILD_CHARCLASS_LOOP (isalpha);
3649   else if (strcmp (name, "digit") == 0)
3650     BUILD_CHARCLASS_LOOP (isdigit);
3651   else if (strcmp (name, "print") == 0)
3652     BUILD_CHARCLASS_LOOP (isprint);
3653   else if (strcmp (name, "upper") == 0)
3654     BUILD_CHARCLASS_LOOP (isupper);
3655   else if (strcmp (name, "blank") == 0)
3656     BUILD_CHARCLASS_LOOP (isblank);
3657   else if (strcmp (name, "graph") == 0)
3658     BUILD_CHARCLASS_LOOP (isgraph);
3659   else if (strcmp (name, "punct") == 0)
3660     BUILD_CHARCLASS_LOOP (ispunct);
3661   else if (strcmp (name, "xdigit") == 0)
3662     BUILD_CHARCLASS_LOOP (isxdigit);
3663   else
3664     return REG_ECTYPE;
3665
3666   return REG_NOERROR;
3667 }
3668
3669 static bin_tree_t *
3670 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3671                     const char *class_name,
3672                     const char *extra, bool non_match,
3673                     reg_errcode_t *err)
3674 {
3675   re_bitset_ptr_t sbcset;
3676 #ifdef RE_ENABLE_I18N
3677   re_charset_t *mbcset;
3678   Idx alloc = 0;
3679 #endif /* not RE_ENABLE_I18N */
3680   reg_errcode_t ret;
3681   re_token_t br_token;
3682   bin_tree_t *tree;
3683
3684   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3685   if (BE (sbcset == NULL, 0))
3686     {
3687       *err = REG_ESPACE;
3688       return NULL;
3689     }
3690 #ifdef RE_ENABLE_I18N
3691   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3692   if (BE (mbcset == NULL, 0))
3693     {
3694       re_free (sbcset);
3695       *err = REG_ESPACE;
3696       return NULL;
3697     }
3698   mbcset->non_match = non_match;
3699 #endif /* RE_ENABLE_I18N */
3700
3701   /* We don't care the syntax in this case.  */
3702   ret = build_charclass (trans, sbcset,
3703 #ifdef RE_ENABLE_I18N
3704                          mbcset, &alloc,
3705 #endif /* RE_ENABLE_I18N */
3706                          class_name, 0);
3707
3708   if (BE (ret != REG_NOERROR, 0))
3709     {
3710       re_free (sbcset);
3711 #ifdef RE_ENABLE_I18N
3712       free_charset (mbcset);
3713 #endif /* RE_ENABLE_I18N */
3714       *err = ret;
3715       return NULL;
3716     }
3717   /* \w match '_' also.  */
3718   for (; *extra; extra++)
3719     bitset_set (sbcset, *extra);
3720
3721   /* If it is non-matching list.  */
3722   if (non_match)
3723     bitset_not (sbcset);
3724
3725 #ifdef RE_ENABLE_I18N
3726   /* Ensure only single byte characters are set.  */
3727   if (dfa->mb_cur_max > 1)
3728     bitset_mask (sbcset, dfa->sb_char);
3729 #endif
3730
3731   /* Build a tree for simple bracket.  */
3732 #if defined GCC_LINT || defined lint
3733   memset (&br_token, 0, sizeof br_token);
3734 #endif
3735   br_token.type = SIMPLE_BRACKET;
3736   br_token.opr.sbcset = sbcset;
3737   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3738   if (BE (tree == NULL, 0))
3739     goto build_word_op_espace;
3740
3741 #ifdef RE_ENABLE_I18N
3742   if (dfa->mb_cur_max > 1)
3743     {
3744       bin_tree_t *mbc_tree;
3745       /* Build a tree for complex bracket.  */
3746       br_token.type = COMPLEX_BRACKET;
3747       br_token.opr.mbcset = mbcset;
3748       dfa->has_mb_node = 1;
3749       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3750       if (BE (mbc_tree == NULL, 0))
3751         goto build_word_op_espace;
3752       /* Then join them by ALT node.  */
3753       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3754       if (BE (mbc_tree != NULL, 1))
3755         return tree;
3756     }
3757   else
3758     {
3759       free_charset (mbcset);
3760       return tree;
3761     }
3762 #else /* not RE_ENABLE_I18N */
3763   return tree;
3764 #endif /* not RE_ENABLE_I18N */
3765
3766  build_word_op_espace:
3767   re_free (sbcset);
3768 #ifdef RE_ENABLE_I18N
3769   free_charset (mbcset);
3770 #endif /* RE_ENABLE_I18N */
3771   *err = REG_ESPACE;
3772   return NULL;
3773 }
3774
3775 /* This is intended for the expressions like "a{1,3}".
3776    Fetch a number from 'input', and return the number.
3777    Return -1 if the number field is empty like "{,1}".
3778    Return RE_DUP_MAX + 1 if the number field is too large.
3779    Return -2 if an error occurred.  */
3780
3781 static Idx
3782 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3783 {
3784   Idx num = -1;
3785   unsigned char c;
3786   while (1)
3787     {
3788       fetch_token (token, input, syntax);
3789       c = token->opr.c;
3790       if (BE (token->type == END_OF_RE, 0))
3791         return -2;
3792       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3793         break;
3794       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3795              ? -2
3796              : num == -1
3797              ? c - '0'
3798              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3799     }
3800   return num;
3801 }
3802 \f
3803 #ifdef RE_ENABLE_I18N
3804 static void
3805 free_charset (re_charset_t *cset)
3806 {
3807   re_free (cset->mbchars);
3808 # ifdef _LIBC
3809   re_free (cset->coll_syms);
3810   re_free (cset->equiv_classes);
3811   re_free (cset->range_starts);
3812   re_free (cset->range_ends);
3813 # endif
3814   re_free (cset->char_classes);
3815   re_free (cset);
3816 }
3817 #endif /* RE_ENABLE_I18N */
3818 \f
3819 /* Functions for binary tree operation.  */
3820
3821 /* Create a tree node.  */
3822
3823 static bin_tree_t *
3824 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3825              re_token_type_t type)
3826 {
3827   re_token_t t;
3828 #if defined GCC_LINT || defined lint
3829   memset (&t, 0, sizeof t);
3830 #endif
3831   t.type = type;
3832   return create_token_tree (dfa, left, right, &t);
3833 }
3834
3835 static bin_tree_t *
3836 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3837                    const re_token_t *token)
3838 {
3839   bin_tree_t *tree;
3840   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3841     {
3842       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3843
3844       if (storage == NULL)
3845         return NULL;
3846       storage->next = dfa->str_tree_storage;
3847       dfa->str_tree_storage = storage;
3848       dfa->str_tree_storage_idx = 0;
3849     }
3850   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3851
3852   tree->parent = NULL;
3853   tree->left = left;
3854   tree->right = right;
3855   tree->token = *token;
3856   tree->token.duplicated = 0;
3857   tree->token.opt_subexp = 0;
3858   tree->first = NULL;
3859   tree->next = NULL;
3860   tree->node_idx = -1;
3861
3862   if (left != NULL)
3863     left->parent = tree;
3864   if (right != NULL)
3865     right->parent = tree;
3866   return tree;
3867 }
3868
3869 /* Mark the tree SRC as an optional subexpression.
3870    To be called from preorder or postorder.  */
3871
3872 static reg_errcode_t
3873 mark_opt_subexp (void *extra, bin_tree_t *node)
3874 {
3875   Idx idx = (uintptr_t) extra;
3876   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3877     node->token.opt_subexp = 1;
3878
3879   return REG_NOERROR;
3880 }
3881
3882 /* Free the allocated memory inside NODE. */
3883
3884 static void
3885 free_token (re_token_t *node)
3886 {
3887 #ifdef RE_ENABLE_I18N
3888   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3889     free_charset (node->opr.mbcset);
3890   else
3891 #endif /* RE_ENABLE_I18N */
3892     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3893       re_free (node->opr.sbcset);
3894 }
3895
3896 /* Worker function for tree walking.  Free the allocated memory inside NODE
3897    and its children. */
3898
3899 static reg_errcode_t
3900 free_tree (void *extra, bin_tree_t *node)
3901 {
3902   free_token (&node->token);
3903   return REG_NOERROR;
3904 }
3905
3906
3907 /* Duplicate the node SRC, and return new node.  This is a preorder
3908    visit similar to the one implemented by the generic visitor, but
3909    we need more infrastructure to maintain two parallel trees --- so,
3910    it's easier to duplicate.  */
3911
3912 static bin_tree_t *
3913 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3914 {
3915   const bin_tree_t *node;
3916   bin_tree_t *dup_root;
3917   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3918
3919   for (node = root; ; )
3920     {
3921       /* Create a new tree and link it back to the current parent.  */
3922       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3923       if (*p_new == NULL)
3924         return NULL;
3925       (*p_new)->parent = dup_node;
3926       (*p_new)->token.duplicated = 1;
3927       dup_node = *p_new;
3928
3929       /* Go to the left node, or up and to the right.  */
3930       if (node->left)
3931         {
3932           node = node->left;
3933           p_new = &dup_node->left;
3934         }
3935       else
3936         {
3937           const bin_tree_t *prev = NULL;
3938           while (node->right == prev || node->right == NULL)
3939             {
3940               prev = node;
3941               node = node->parent;
3942               dup_node = dup_node->parent;
3943               if (!node)
3944                 return dup_root;
3945             }
3946           node = node->right;
3947           p_new = &dup_node->right;
3948         }
3949     }
3950 }