gl/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public
   8    License as published by the Free Software Foundation; either
   9    version 3 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const char *class_name,
 110                                        const char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute__ ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 337                              &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 360           Idx i;
 361
 362 # ifdef _LIBC
 363           /* See if we have to try all bytes which start multiple collation
 364              elements.
 365              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 366                   collation element, and don't catch 'b' since 'b' is
 367                   the only collation element which starts from 'b' (and
 368                   it is caught by SIMPLE_BRACKET).  */
 369               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 370                   && (cset->ncoll_syms || cset->nranges))
 371                 {
 372                   const int32_t *table = (const int32_t *)
 373                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 374                   for (i = 0; i < SBC_MAX; ++i)
 375                     if (table[i] < 0)
 376                       re_set_fastmap (fastmap, icase, i);
 377                 }
 378 # endif /* _LIBC */
 379
 380           /* See if we have to start the match at all multibyte characters,
 381              i.e. where we would not find an invalid sequence.  This only
 382              applies to multibyte character sets; for single byte character
 383              sets, the SIMPLE_BRACKET again suffices.  */
 384           if (dfa->mb_cur_max > 1
 385               && (cset->nchar_classes || cset->non_match || cset->nranges
 386 # ifdef _LIBC
 387                   || cset->nequiv_classes
 388 # endif /* _LIBC */
 389                  ))
 390             {
 391               unsigned char c = 0;
 392               do
 393                 {
 394                   mbstate_t mbs;
 395                   memset (&mbs, 0, sizeof (mbs));
 396                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 397                     re_set_fastmap (fastmap, false, (int) c);
 398                 }
 399               while (++c != 0);
 400             }
 401
 402           else
 403             {
 404               /* ... Else catch all bytes which can start the mbchars.  */
 405               for (i = 0; i < cset->nmbchars; ++i)
 406                 {
 407                   char buf[256];
 408                   mbstate_t state;
 409                   memset (&state, '\0', sizeof (state));
 410                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 411                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                     {
 414                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 415                           != (size_t) -1)
 416                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 417                     }
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      'buffer' to the compiled pattern;
 443      'used' to the length of the compiled pattern;
 444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      'fastmap' to an allocated space for the fastmap;
 449      'fastmap_accurate' to zero;
 450      're_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *_Restrict_ preg;
 475     const char *_Restrict_ pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 #ifdef _LIBC
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *_Restrict_ preg;
 539     char *_Restrict_ errbuf;
 540     size_t errbuf_size;
 541 #else /* size_t might promote */
 542 size_t
 543 regerror (int errcode, const regex_t *_Restrict_ preg,
 544           char *_Restrict_ errbuf, size_t errbuf_size)
 545 #endif
 546 {
 547   const char *msg;
 548   size_t msg_size;
 549
 550   if (BE (errcode < 0
 551           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 552                                / sizeof (__re_error_msgid_idx[0])), 0))
 553     /* Only error codes returned by the rest of the code should be passed
 554        to this routine.  If we are given anything else, or if other regex
 555        code generates an invalid error code, then the program has a bug.
 556        Dump core so we can fix it.  */
 557     abort ();
 558
 559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 560
 561   msg_size = strlen (msg) + 1; /* Includes the null.  */
 562
 563   if (BE (errbuf_size != 0, 1))
 564     {
 565       size_t cpy_size = msg_size;
 566       if (BE (msg_size > errbuf_size, 0))
 567         {
 568           cpy_size = errbuf_size - 1;
 569           errbuf[cpy_size] = '\0';
 570         }
 571       memcpy (errbuf, msg, cpy_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset_t utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589 # if defined __GNUC__ && !defined __STRICT_ANSI__
 590   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 591 # else
 592 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #   error "bitset_word_t is narrower than 32 bits"
 594 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 #  endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 # endif
 606 };
 607 #endif
 608
 609
 610 static void
 611 free_dfa_content (re_dfa_t *dfa)
 612 {
 613   Idx i, j;
 614
 615   if (dfa->nodes)
 616     for (i = 0; i < dfa->nodes_len; ++i)
 617       free_token (dfa->nodes + i);
 618   re_free (dfa->nexts);
 619   for (i = 0; i < dfa->nodes_len; ++i)
 620     {
 621       if (dfa->eclosures != NULL)
 622         re_node_set_free (dfa->eclosures + i);
 623       if (dfa->inveclosures != NULL)
 624         re_node_set_free (dfa->inveclosures + i);
 625       if (dfa->edests != NULL)
 626         re_node_set_free (dfa->edests + i);
 627     }
 628   re_free (dfa->edests);
 629   re_free (dfa->eclosures);
 630   re_free (dfa->inveclosures);
 631   re_free (dfa->nodes);
 632
 633   if (dfa->state_table)
 634     for (i = 0; i <= dfa->state_hash_mask; ++i)
 635       {
 636         struct re_state_table_entry *entry = dfa->state_table + i;
 637         for (j = 0; j < entry->num; ++j)
 638           {
 639             re_dfastate_t *state = entry->array[j];
 640             free_state (state);
 641           }
 642         re_free (entry->array);
 643       }
 644   re_free (dfa->state_table);
 645 #ifdef RE_ENABLE_I18N
 646   if (dfa->sb_char != utf8_sb_map)
 647     re_free (dfa->sb_char);
 648 #endif
 649   re_free (dfa->subexp_map);
 650 #ifdef DEBUG
 651   re_free (dfa->re_str);
 652 #endif
 653
 654   re_free (dfa);
 655 }
 656
 657
 658 /* Free dynamically allocated space used by PREG.  */
 659
 660 void
 661 regfree (preg)
 662     regex_t *preg;
 663 {
 664   re_dfa_t *dfa = preg->buffer;
 665   if (BE (dfa != NULL, 1))
 666     {
 667       lock_fini (dfa->lock);
 668       free_dfa_content (dfa);
 669     }
 670   preg->buffer = NULL;
 671   preg->allocated = 0;
 672
 673   re_free (preg->fastmap);
 674   preg->fastmap = NULL;
 675
 676   re_free (preg->translate);
 677   preg->translate = NULL;
 678 }
 679 #ifdef _LIBC
 680 weak_alias (__regfree, regfree)
 681 #endif
 682 \f
 683 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 684    them unless specifically requested.  */
 685
 686 #if defined _REGEX_RE_COMP || defined _LIBC
 687
 688 /* BSD has one and only one pattern buffer.  */
 689 static struct re_pattern_buffer re_comp_buf;
 690
 691 char *
 692 # ifdef _LIBC
 693 /* Make these definitions weak in libc, so POSIX programs can redefine
 694    these names if they don't use our functions, and still use
 695    regcomp/regexec above without link errors.  */
 696 weak_function
 697 # endif
 698 re_comp (s)
 699      const char *s;
 700 {
 701   reg_errcode_t ret;
 702   char *fastmap;
 703
 704   if (!s)
 705     {
 706       if (!re_comp_buf.buffer)
 707         return gettext ("No previous regular expression");
 708       return 0;
 709     }
 710
 711   if (re_comp_buf.buffer)
 712     {
 713       fastmap = re_comp_buf.fastmap;
 714       re_comp_buf.fastmap = NULL;
 715       __regfree (&re_comp_buf);
 716       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 717       re_comp_buf.fastmap = fastmap;
 718     }
 719
 720   if (re_comp_buf.fastmap == NULL)
 721     {
 722       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 723       if (re_comp_buf.fastmap == NULL)
 724         return (char *) gettext (__re_error_msgid
 725                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 726     }
 727
 728   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 729      don't need to initialize the pattern buffer fields which affect it.  */
 730
 731   /* Match anchors at newlines.  */
 732   re_comp_buf.newline_anchor = 1;
 733
 734   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 735
 736   if (!ret)
 737     return NULL;
 738
 739   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 740   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 741 }
 742
 743 #ifdef _LIBC
 744 libc_freeres_fn (free_mem)
 745 {
 746   __regfree (&re_comp_buf);
 747 }
 748 #endif
 749
 750 #endif /* _REGEX_RE_COMP */
 751 \f
 752 /* Internal entry point.
 753    Compile the regular expression PATTERN, whose length is LENGTH.
 754    SYNTAX indicate regular expression's syntax.  */
 755
 756 static reg_errcode_t
 757 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 758                      reg_syntax_t syntax)
 759 {
 760   reg_errcode_t err = REG_NOERROR;
 761   re_dfa_t *dfa;
 762   re_string_t regexp;
 763
 764   /* Initialize the pattern buffer.  */
 765   preg->fastmap_accurate = 0;
 766   preg->syntax = syntax;
 767   preg->not_bol = preg->not_eol = 0;
 768   preg->used = 0;
 769   preg->re_nsub = 0;
 770   preg->can_be_null = 0;
 771   preg->regs_allocated = REGS_UNALLOCATED;
 772
 773   /* Initialize the dfa.  */
 774   dfa = preg->buffer;
 775   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 776     {
 777       /* If zero allocated, but buffer is non-null, try to realloc
 778          enough space.  This loses if buffer's address is bogus, but
 779          that is the user's responsibility.  If ->buffer is NULL this
 780          is a simple allocation.  */
 781       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 782       if (dfa == NULL)
 783         return REG_ESPACE;
 784       preg->allocated = sizeof (re_dfa_t);
 785       preg->buffer = dfa;
 786     }
 787   preg->used = sizeof (re_dfa_t);
 788
 789   err = init_dfa (dfa, length);
 790   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 791     err = REG_ESPACE;
 792   if (BE (err != REG_NOERROR, 0))
 793     {
 794       free_dfa_content (dfa);
 795       preg->buffer = NULL;
 796       preg->allocated = 0;
 797       return err;
 798     }
 799 #ifdef DEBUG
 800   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 801   dfa->re_str = re_malloc (char, length + 1);
 802   strncpy (dfa->re_str, pattern, length + 1);
 803 #endif
 804
 805   err = re_string_construct (&regexp, pattern, length, preg->translate,
 806                              (syntax & RE_ICASE) != 0, dfa);
 807   if (BE (err != REG_NOERROR, 0))
 808     {
 809     re_compile_internal_free_return:
 810       free_workarea_compile (preg);
 811       re_string_destruct (&regexp);
 812       lock_fini (dfa->lock);
 813       free_dfa_content (dfa);
 814       preg->buffer = NULL;
 815       preg->allocated = 0;
 816       return err;
 817     }
 818
 819   /* Parse the regular expression, and build a structure tree.  */
 820   preg->re_nsub = 0;
 821   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 822   if (BE (dfa->str_tree == NULL, 0))
 823     goto re_compile_internal_free_return;
 824
 825   /* Analyze the tree and create the nfa.  */
 826   err = analyze (preg);
 827   if (BE (err != REG_NOERROR, 0))
 828     goto re_compile_internal_free_return;
 829
 830 #ifdef RE_ENABLE_I18N
 831   /* If possible, do searching in single byte encoding to speed things up.  */
 832   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 833     optimize_utf8 (dfa);
 834 #endif
 835
 836   /* Then create the initial state of the dfa.  */
 837   err = create_initial_state (dfa);
 838
 839   /* Release work areas.  */
 840   free_workarea_compile (preg);
 841   re_string_destruct (&regexp);
 842
 843   if (BE (err != REG_NOERROR, 0))
 844     {
 845       lock_fini (dfa->lock);
 846       free_dfa_content (dfa);
 847       preg->buffer = NULL;
 848       preg->allocated = 0;
 849     }
 850
 851   return err;
 852 }
 853
 854 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 855    as the initial length of some arrays.  */
 856
 857 static reg_errcode_t
 858 init_dfa (re_dfa_t *dfa, size_t pat_len)
 859 {
 860   __re_size_t table_size;
 861 #ifndef _LIBC
 862   const char *codeset_name;
 863 #endif
 864 #ifdef RE_ENABLE_I18N
 865   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 866 #else
 867   size_t max_i18n_object_size = 0;
 868 #endif
 869   size_t max_object_size =
 870     MAX (sizeof (struct re_state_table_entry),
 871          MAX (sizeof (re_token_t),
 872               MAX (sizeof (re_node_set),
 873                    MAX (sizeof (regmatch_t),
 874                         max_i18n_object_size))));
 875
 876   memset (dfa, '\0', sizeof (re_dfa_t));
 877
 878   /* Force allocation of str_tree_storage the first time.  */
 879   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 880
 881   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 882      calculation below, and for similar doubling calculations
 883      elsewhere.  And it's <= rather than <, because some of the
 884      doubling calculations add 1 afterwards.  */
 885   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 886     return REG_ESPACE;
 887
 888   dfa->nodes_alloc = pat_len + 1;
 889   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 890
 891   /*  table_size = 2 ^ ceil(log pat_len) */
 892   for (table_size = 1; ; table_size <<= 1)
 893     if (table_size > pat_len)
 894       break;
 895
 896   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 897   dfa->state_hash_mask = table_size - 1;
 898
 899   dfa->mb_cur_max = MB_CUR_MAX;
 900 #ifdef _LIBC
 901   if (dfa->mb_cur_max == 6
 902       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 903     dfa->is_utf8 = 1;
 904   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 905                        != 0);
 906 #else
 907   codeset_name = nl_langinfo (CODESET);
 908   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 909       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 910       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 911       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 912     dfa->is_utf8 = 1;
 913
 914   /* We check exhaustively in the loop below if this charset is a
 915      superset of ASCII.  */
 916   dfa->map_notascii = 0;
 917 #endif
 918
 919 #ifdef RE_ENABLE_I18N
 920   if (dfa->mb_cur_max > 1)
 921     {
 922       if (dfa->is_utf8)
 923         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 924       else
 925         {
 926           int i, j, ch;
 927
 928           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 929           if (BE (dfa->sb_char == NULL, 0))
 930             return REG_ESPACE;
 931
 932           /* Set the bits corresponding to single byte chars.  */
 933           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 934             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 935               {
 936                 wint_t wch = __btowc (ch);
 937                 if (wch != WEOF)
 938                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 939 # ifndef _LIBC
 940                 if (isascii (ch) && wch != ch)
 941                   dfa->map_notascii = 1;
 942 # endif
 943               }
 944         }
 945     }
 946 #endif
 947
 948   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 949     return REG_ESPACE;
 950   return REG_NOERROR;
 951 }
 952
 953 /* Initialize WORD_CHAR table, which indicate which character is
 954    "word".  In this case "word" means that it is the word construction
 955    character used by some operators like "\<", "\>", etc.  */
 956
 957 static void
 958 internal_function
 959 init_word_char (re_dfa_t *dfa)
 960 {
 961   int i = 0;
 962   int j;
 963   int ch = 0;
 964   dfa->word_ops_used = 1;
 965   if (BE (dfa->map_notascii == 0, 1))
 966     {
 967       bitset_word_t bits0 = 0x00000000;
 968       bitset_word_t bits1 = 0x03ff0000;
 969       bitset_word_t bits2 = 0x87fffffe;
 970       bitset_word_t bits3 = 0x07fffffe;
 971       if (BITSET_WORD_BITS == 64)
 972         {
 973           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 974           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 975           i = 2;
 976         }
 977       else if (BITSET_WORD_BITS == 32)
 978         {
 979           dfa->word_char[0] = bits0;
 980           dfa->word_char[1] = bits1;
 981           dfa->word_char[2] = bits2;
 982           dfa->word_char[3] = bits3;
 983           i = 4;
 984         }
 985       else
 986         goto general_case;
 987       ch = 128;
 988
 989       if (BE (dfa->is_utf8, 1))
 990         {
 991           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 992           return;
 993         }
 994     }
 995
 996  general_case:
 997   for (; i < BITSET_WORDS; ++i)
 998     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 999       if (isalnum (ch) || ch == '_')
1000         dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001 }
1002
1003 /* Free the work area which are only used while compiling.  */
1004
1005 static void
1006 free_workarea_compile (regex_t *preg)
1007 {
1008   re_dfa_t *dfa = preg->buffer;
1009   bin_tree_storage_t *storage, *next;
1010   for (storage = dfa->str_tree_storage; storage; storage = next)
1011     {
1012       next = storage->next;
1013       re_free (storage);
1014     }
1015   dfa->str_tree_storage = NULL;
1016   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017   dfa->str_tree = NULL;
1018   re_free (dfa->org_indices);
1019   dfa->org_indices = NULL;
1020 }
1021
1022 /* Create initial states for all contexts.  */
1023
1024 static reg_errcode_t
1025 create_initial_state (re_dfa_t *dfa)
1026 {
1027   Idx first, i;
1028   reg_errcode_t err;
1029   re_node_set init_nodes;
1030
1031   /* Initial states have the epsilon closure of the node which is
1032      the first node of the regular expression.  */
1033   first = dfa->str_tree->first->node_idx;
1034   dfa->init_node = first;
1035   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036   if (BE (err != REG_NOERROR, 0))
1037     return err;
1038
1039   /* The back-references which are in initial states can epsilon transit,
1040      since in this case all of the subexpressions can be null.
1041      Then we add epsilon closures of the nodes which are the next nodes of
1042      the back-references.  */
1043   if (dfa->nbackref > 0)
1044     for (i = 0; i < init_nodes.nelem; ++i)
1045       {
1046         Idx node_idx = init_nodes.elems[i];
1047         re_token_type_t type = dfa->nodes[node_idx].type;
1048
1049         Idx clexp_idx;
1050         if (type != OP_BACK_REF)
1051           continue;
1052         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053           {
1054             re_token_t *clexp_node;
1055             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056             if (clexp_node->type == OP_CLOSE_SUBEXP
1057                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058               break;
1059           }
1060         if (clexp_idx == init_nodes.nelem)
1061           continue;
1062
1063         if (type == OP_BACK_REF)
1064           {
1065             Idx dest_idx = dfa->edests[node_idx].elems[0];
1066             if (!re_node_set_contains (&init_nodes, dest_idx))
1067               {
1068                 reg_errcode_t merge_err
1069                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070                 if (merge_err != REG_NOERROR)
1071                   return merge_err;
1072                 i = 0;
1073               }
1074           }
1075       }
1076
1077   /* It must be the first time to invoke acquire_state.  */
1078   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079   /* We don't check ERR here, since the initial state must not be NULL.  */
1080   if (BE (dfa->init_state == NULL, 0))
1081     return err;
1082   if (dfa->init_state->has_constraint)
1083     {
1084       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085                                                        CONTEXT_WORD);
1086       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087                                                      CONTEXT_NEWLINE);
1088       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089                                                          &init_nodes,
1090                                                          CONTEXT_NEWLINE
1091                                                          | CONTEXT_BEGBUF);
1092       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093               || dfa->init_state_begbuf == NULL, 0))
1094         return err;
1095     }
1096   else
1097     dfa->init_state_word = dfa->init_state_nl
1098       = dfa->init_state_begbuf = dfa->init_state;
1099
1100   re_node_set_free (&init_nodes);
1101   return REG_NOERROR;
1102 }
1103 \f
1104 #ifdef RE_ENABLE_I18N
1105 /* If it is possible to do searching in single byte encoding instead of UTF-8
1106    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107    DFA nodes where needed.  */
1108
1109 static void
1110 optimize_utf8 (re_dfa_t *dfa)
1111 {
1112   Idx node;
1113   int i;
1114   bool mb_chars = false;
1115   bool has_period = false;
1116
1117   for (node = 0; node < dfa->nodes_len; ++node)
1118     switch (dfa->nodes[node].type)
1119       {
1120       case CHARACTER:
1121         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122           mb_chars = true;
1123         break;
1124       case ANCHOR:
1125         switch (dfa->nodes[node].opr.ctx_type)
1126           {
1127           case LINE_FIRST:
1128           case LINE_LAST:
1129           case BUF_FIRST:
1130           case BUF_LAST:
1131             break;
1132           default:
1133             /* Word anchors etc. cannot be handled.  It's okay to test
1134                opr.ctx_type since constraints (for all DFA nodes) are
1135                created by ORing one or more opr.ctx_type values.  */
1136             return;
1137           }
1138         break;
1139       case OP_PERIOD:
1140         has_period = true;
1141         break;
1142       case OP_BACK_REF:
1143       case OP_ALT:
1144       case END_OF_RE:
1145       case OP_DUP_ASTERISK:
1146       case OP_OPEN_SUBEXP:
1147       case OP_CLOSE_SUBEXP:
1148         break;
1149       case COMPLEX_BRACKET:
1150         return;
1151       case SIMPLE_BRACKET:
1152         /* Just double check.  */
1153         {
1154           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155                         ? 0
1156                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158             {
1159               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160                 return;
1161               rshift = 0;
1162             }
1163         }
1164         break;
1165       default:
1166         abort ();
1167       }
1168
1169   if (mb_chars || has_period)
1170     for (node = 0; node < dfa->nodes_len; ++node)
1171       {
1172         if (dfa->nodes[node].type == CHARACTER
1173             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174           dfa->nodes[node].mb_partial = 0;
1175         else if (dfa->nodes[node].type == OP_PERIOD)
1176           dfa->nodes[node].type = OP_UTF8_PERIOD;
1177       }
1178
1179   /* The search can be in single byte locale.  */
1180   dfa->mb_cur_max = 1;
1181   dfa->is_utf8 = 0;
1182   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183 }
1184 #endif
1185 \f
1186 /* Analyze the structure tree, and calculate "first", "next", "edest",
1187    "eclosure", and "inveclosure".  */
1188
1189 static reg_errcode_t
1190 analyze (regex_t *preg)
1191 {
1192   re_dfa_t *dfa = preg->buffer;
1193   reg_errcode_t ret;
1194
1195   /* Allocate arrays.  */
1196   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201           || dfa->eclosures == NULL, 0))
1202     return REG_ESPACE;
1203
1204   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205   if (dfa->subexp_map != NULL)
1206     {
1207       Idx i;
1208       for (i = 0; i < preg->re_nsub; i++)
1209         dfa->subexp_map[i] = i;
1210       preorder (dfa->str_tree, optimize_subexps, dfa);
1211       for (i = 0; i < preg->re_nsub; i++)
1212         if (dfa->subexp_map[i] != i)
1213           break;
1214       if (i == preg->re_nsub)
1215         {
1216           free (dfa->subexp_map);
1217           dfa->subexp_map = NULL;
1218         }
1219     }
1220
1221   ret = postorder (dfa->str_tree, lower_subexps, preg);
1222   if (BE (ret != REG_NOERROR, 0))
1223     return ret;
1224   ret = postorder (dfa->str_tree, calc_first, dfa);
1225   if (BE (ret != REG_NOERROR, 0))
1226     return ret;
1227   preorder (dfa->str_tree, calc_next, dfa);
1228   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229   if (BE (ret != REG_NOERROR, 0))
1230     return ret;
1231   ret = calc_eclosure (dfa);
1232   if (BE (ret != REG_NOERROR, 0))
1233     return ret;
1234
1235   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1237   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238       || dfa->nbackref)
1239     {
1240       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241       if (BE (dfa->inveclosures == NULL, 0))
1242         return REG_ESPACE;
1243       ret = calc_inveclosure (dfa);
1244     }
1245
1246   return ret;
1247 }
1248
1249 /* Our parse trees are very unbalanced, so we cannot use a stack to
1250    implement parse tree visits.  Instead, we use parent pointers and
1251    some hairy code in these two functions.  */
1252 static reg_errcode_t
1253 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254            void *extra)
1255 {
1256   bin_tree_t *node, *prev;
1257
1258   for (node = root; ; )
1259     {
1260       /* Descend down the tree, preferably to the left (or to the right
1261          if that's the only child).  */
1262       while (node->left || node->right)
1263         if (node->left)
1264           node = node->left;
1265         else
1266           node = node->right;
1267
1268       do
1269         {
1270           reg_errcode_t err = fn (extra, node);
1271           if (BE (err != REG_NOERROR, 0))
1272             return err;
1273           if (node->parent == NULL)
1274             return REG_NOERROR;
1275           prev = node;
1276           node = node->parent;
1277         }
1278       /* Go up while we have a node that is reached from the right.  */
1279       while (node->right == prev || node->right == NULL);
1280       node = node->right;
1281     }
1282 }
1283
1284 static reg_errcode_t
1285 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286           void *extra)
1287 {
1288   bin_tree_t *node;
1289
1290   for (node = root; ; )
1291     {
1292       reg_errcode_t err = fn (extra, node);
1293       if (BE (err != REG_NOERROR, 0))
1294         return err;
1295
1296       /* Go to the left node, or up and to the right.  */
1297       if (node->left)
1298         node = node->left;
1299       else
1300         {
1301           bin_tree_t *prev = NULL;
1302           while (node->right == prev || node->right == NULL)
1303             {
1304               prev = node;
1305               node = node->parent;
1306               if (!node)
1307                 return REG_NOERROR;
1308             }
1309           node = node->right;
1310         }
1311     }
1312 }
1313
1314 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1316    backreferences as well.  Requires a preorder visit.  */
1317 static reg_errcode_t
1318 optimize_subexps (void *extra, bin_tree_t *node)
1319 {
1320   re_dfa_t *dfa = (re_dfa_t *) extra;
1321
1322   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323     {
1324       int idx = node->token.opr.idx;
1325       node->token.opr.idx = dfa->subexp_map[idx];
1326       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327     }
1328
1329   else if (node->token.type == SUBEXP
1330            && node->left && node->left->token.type == SUBEXP)
1331     {
1332       Idx other_idx = node->left->token.opr.idx;
1333
1334       node->left = node->left->left;
1335       if (node->left)
1336         node->left->parent = node;
1337
1338       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339       if (other_idx < BITSET_WORD_BITS)
1340         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341     }
1342
1343   return REG_NOERROR;
1344 }
1345
1346 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1348 static reg_errcode_t
1349 lower_subexps (void *extra, bin_tree_t *node)
1350 {
1351   regex_t *preg = (regex_t *) extra;
1352   reg_errcode_t err = REG_NOERROR;
1353
1354   if (node->left && node->left->token.type == SUBEXP)
1355     {
1356       node->left = lower_subexp (&err, preg, node->left);
1357       if (node->left)
1358         node->left->parent = node;
1359     }
1360   if (node->right && node->right->token.type == SUBEXP)
1361     {
1362       node->right = lower_subexp (&err, preg, node->right);
1363       if (node->right)
1364         node->right->parent = node;
1365     }
1366
1367   return err;
1368 }
1369
1370 static bin_tree_t *
1371 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372 {
1373   re_dfa_t *dfa = preg->buffer;
1374   bin_tree_t *body = node->left;
1375   bin_tree_t *op, *cls, *tree1, *tree;
1376
1377   if (preg->no_sub
1378       /* We do not optimize empty subexpressions, because otherwise we may
1379          have bad CONCAT nodes with NULL children.  This is obviously not
1380          very common, so we do not lose much.  An example that triggers
1381          this case is the sed "script" /\(\)/x.  */
1382       && node->left != NULL
1383       && (node->token.opr.idx >= BITSET_WORD_BITS
1384           || !(dfa->used_bkref_map
1385                & ((bitset_word_t) 1 << node->token.opr.idx))))
1386     return node->left;
1387
1388   /* Convert the SUBEXP node to the concatenation of an
1389      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1390   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393   tree = create_tree (dfa, op, tree1, CONCAT);
1394   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395     {
1396       *err = REG_ESPACE;
1397       return NULL;
1398     }
1399
1400   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402   return tree;
1403 }
1404
1405 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406    nodes.  Requires a postorder visit.  */
1407 static reg_errcode_t
1408 calc_first (void *extra, bin_tree_t *node)
1409 {
1410   re_dfa_t *dfa = (re_dfa_t *) extra;
1411   if (node->token.type == CONCAT)
1412     {
1413       node->first = node->left->first;
1414       node->node_idx = node->left->node_idx;
1415     }
1416   else
1417     {
1418       node->first = node;
1419       node->node_idx = re_dfa_add_node (dfa, node->token);
1420       if (BE (node->node_idx == REG_MISSING, 0))
1421         return REG_ESPACE;
1422       if (node->token.type == ANCHOR)
1423         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1424     }
1425   return REG_NOERROR;
1426 }
1427
1428 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1429 static reg_errcode_t
1430 calc_next (void *extra, bin_tree_t *node)
1431 {
1432   switch (node->token.type)
1433     {
1434     case OP_DUP_ASTERISK:
1435       node->left->next = node;
1436       break;
1437     case CONCAT:
1438       node->left->next = node->right->first;
1439       node->right->next = node->next;
1440       break;
1441     default:
1442       if (node->left)
1443         node->left->next = node->next;
1444       if (node->right)
1445         node->right->next = node->next;
1446       break;
1447     }
1448   return REG_NOERROR;
1449 }
1450
1451 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1452 static reg_errcode_t
1453 link_nfa_nodes (void *extra, bin_tree_t *node)
1454 {
1455   re_dfa_t *dfa = (re_dfa_t *) extra;
1456   Idx idx = node->node_idx;
1457   reg_errcode_t err = REG_NOERROR;
1458
1459   switch (node->token.type)
1460     {
1461     case CONCAT:
1462       break;
1463
1464     case END_OF_RE:
1465       assert (node->next == NULL);
1466       break;
1467
1468     case OP_DUP_ASTERISK:
1469     case OP_ALT:
1470       {
1471         Idx left, right;
1472         dfa->has_plural_match = 1;
1473         if (node->left != NULL)
1474           left = node->left->first->node_idx;
1475         else
1476           left = node->next->node_idx;
1477         if (node->right != NULL)
1478           right = node->right->first->node_idx;
1479         else
1480           right = node->next->node_idx;
1481         assert (REG_VALID_INDEX (left));
1482         assert (REG_VALID_INDEX (right));
1483         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484       }
1485       break;
1486
1487     case ANCHOR:
1488     case OP_OPEN_SUBEXP:
1489     case OP_CLOSE_SUBEXP:
1490       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491       break;
1492
1493     case OP_BACK_REF:
1494       dfa->nexts[idx] = node->next->node_idx;
1495       if (node->token.type == OP_BACK_REF)
1496         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1497       break;
1498
1499     default:
1500       assert (!IS_EPSILON_NODE (node->token.type));
1501       dfa->nexts[idx] = node->next->node_idx;
1502       break;
1503     }
1504
1505   return err;
1506 }
1507
1508 /* Duplicate the epsilon closure of the node ROOT_NODE.
1509    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510    to their own constraint.  */
1511
1512 static reg_errcode_t
1513 internal_function
1514 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515                         Idx root_node, unsigned int init_constraint)
1516 {
1517   Idx org_node, clone_node;
1518   bool ok;
1519   unsigned int constraint = init_constraint;
1520   for (org_node = top_org_node, clone_node = top_clone_node;;)
1521     {
1522       Idx org_dest, clone_dest;
1523       if (dfa->nodes[org_node].type == OP_BACK_REF)
1524         {
1525           /* If the back reference epsilon-transit, its destination must
1526              also have the constraint.  Then duplicate the epsilon closure
1527              of the destination of the back reference, and store it in
1528              edests of the back reference.  */
1529           org_dest = dfa->nexts[org_node];
1530           re_node_set_empty (dfa->edests + clone_node);
1531           clone_dest = duplicate_node (dfa, org_dest, constraint);
1532           if (BE (clone_dest == REG_MISSING, 0))
1533             return REG_ESPACE;
1534           dfa->nexts[clone_node] = dfa->nexts[org_node];
1535           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536           if (BE (! ok, 0))
1537             return REG_ESPACE;
1538         }
1539       else if (dfa->edests[org_node].nelem == 0)
1540         {
1541           /* In case of the node can't epsilon-transit, don't duplicate the
1542              destination and store the original destination as the
1543              destination of the node.  */
1544           dfa->nexts[clone_node] = dfa->nexts[org_node];
1545           break;
1546         }
1547       else if (dfa->edests[org_node].nelem == 1)
1548         {
1549           /* In case of the node can epsilon-transit, and it has only one
1550              destination.  */
1551           org_dest = dfa->edests[org_node].elems[0];
1552           re_node_set_empty (dfa->edests + clone_node);
1553           /* If the node is root_node itself, it means the epsilon closure
1554              has a loop.  Then tie it to the destination of the root_node.  */
1555           if (org_node == root_node && clone_node != org_node)
1556             {
1557               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558               if (BE (! ok, 0))
1559                 return REG_ESPACE;
1560               break;
1561             }
1562           /* In case the node has another constraint, append it.  */
1563           constraint |= dfa->nodes[org_node].constraint;
1564           clone_dest = duplicate_node (dfa, org_dest, constraint);
1565           if (BE (clone_dest == REG_MISSING, 0))
1566             return REG_ESPACE;
1567           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568           if (BE (! ok, 0))
1569             return REG_ESPACE;
1570         }
1571       else /* dfa->edests[org_node].nelem == 2 */
1572         {
1573           /* In case of the node can epsilon-transit, and it has two
1574              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1575           org_dest = dfa->edests[org_node].elems[0];
1576           re_node_set_empty (dfa->edests + clone_node);
1577           /* Search for a duplicated node which satisfies the constraint.  */
1578           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579           if (clone_dest == REG_MISSING)
1580             {
1581               /* There is no such duplicated node, create a new one.  */
1582               reg_errcode_t err;
1583               clone_dest = duplicate_node (dfa, org_dest, constraint);
1584               if (BE (clone_dest == REG_MISSING, 0))
1585                 return REG_ESPACE;
1586               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587               if (BE (! ok, 0))
1588                 return REG_ESPACE;
1589               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590                                             root_node, constraint);
1591               if (BE (err != REG_NOERROR, 0))
1592                 return err;
1593             }
1594           else
1595             {
1596               /* There is a duplicated node which satisfies the constraint,
1597                  use it to avoid infinite loop.  */
1598               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599               if (BE (! ok, 0))
1600                 return REG_ESPACE;
1601             }
1602
1603           org_dest = dfa->edests[org_node].elems[1];
1604           clone_dest = duplicate_node (dfa, org_dest, constraint);
1605           if (BE (clone_dest == REG_MISSING, 0))
1606             return REG_ESPACE;
1607           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608           if (BE (! ok, 0))
1609             return REG_ESPACE;
1610         }
1611       org_node = org_dest;
1612       clone_node = clone_dest;
1613     }
1614   return REG_NOERROR;
1615 }
1616
1617 /* Search for a node which is duplicated from the node ORG_NODE, and
1618    satisfies the constraint CONSTRAINT.  */
1619
1620 static Idx
1621 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622                         unsigned int constraint)
1623 {
1624   Idx idx;
1625   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626     {
1627       if (org_node == dfa->org_indices[idx]
1628           && constraint == dfa->nodes[idx].constraint)
1629         return idx; /* Found.  */
1630     }
1631   return REG_MISSING; /* Not found.  */
1632 }
1633
1634 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635    Return the index of the new node, or REG_MISSING if insufficient storage is
1636    available.  */
1637
1638 static Idx
1639 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640 {
1641   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642   if (BE (dup_idx != REG_MISSING, 1))
1643     {
1644       dfa->nodes[dup_idx].constraint = constraint;
1645       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646       dfa->nodes[dup_idx].duplicated = 1;
1647
1648       /* Store the index of the original node.  */
1649       dfa->org_indices[dup_idx] = org_idx;
1650     }
1651   return dup_idx;
1652 }
1653
1654 static reg_errcode_t
1655 calc_inveclosure (re_dfa_t *dfa)
1656 {
1657   Idx src, idx;
1658   bool ok;
1659   for (idx = 0; idx < dfa->nodes_len; ++idx)
1660     re_node_set_init_empty (dfa->inveclosures + idx);
1661
1662   for (src = 0; src < dfa->nodes_len; ++src)
1663     {
1664       Idx *elems = dfa->eclosures[src].elems;
1665       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666         {
1667           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668           if (BE (! ok, 0))
1669             return REG_ESPACE;
1670         }
1671     }
1672
1673   return REG_NOERROR;
1674 }
1675
1676 /* Calculate "eclosure" for all the node in DFA.  */
1677
1678 static reg_errcode_t
1679 calc_eclosure (re_dfa_t *dfa)
1680 {
1681   Idx node_idx;
1682   bool incomplete;
1683 #ifdef DEBUG
1684   assert (dfa->nodes_len > 0);
1685 #endif
1686   incomplete = false;
1687   /* For each nodes, calculate epsilon closure.  */
1688   for (node_idx = 0; ; ++node_idx)
1689     {
1690       reg_errcode_t err;
1691       re_node_set eclosure_elem;
1692       if (node_idx == dfa->nodes_len)
1693         {
1694           if (!incomplete)
1695             break;
1696           incomplete = false;
1697           node_idx = 0;
1698         }
1699
1700 #ifdef DEBUG
1701       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702 #endif
1703
1704       /* If we have already calculated, skip it.  */
1705       if (dfa->eclosures[node_idx].nelem != 0)
1706         continue;
1707       /* Calculate epsilon closure of 'node_idx'.  */
1708       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709       if (BE (err != REG_NOERROR, 0))
1710         return err;
1711
1712       if (dfa->eclosures[node_idx].nelem == 0)
1713         {
1714           incomplete = true;
1715           re_node_set_free (&eclosure_elem);
1716         }
1717     }
1718   return REG_NOERROR;
1719 }
1720
1721 /* Calculate epsilon closure of NODE.  */
1722
1723 static reg_errcode_t
1724 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725 {
1726   reg_errcode_t err;
1727   Idx i;
1728   re_node_set eclosure;
1729   bool ok;
1730   bool incomplete = false;
1731   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732   if (BE (err != REG_NOERROR, 0))
1733     return err;
1734
1735   /* This indicates that we are calculating this node now.
1736      We reference this value to avoid infinite loop.  */
1737   dfa->eclosures[node].nelem = REG_MISSING;
1738
1739   /* If the current node has constraints, duplicate all nodes
1740      since they must inherit the constraints.  */
1741   if (dfa->nodes[node].constraint
1742       && dfa->edests[node].nelem
1743       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744     {
1745       err = duplicate_node_closure (dfa, node, node, node,
1746                                     dfa->nodes[node].constraint);
1747       if (BE (err != REG_NOERROR, 0))
1748         return err;
1749     }
1750
1751   /* Expand each epsilon destination nodes.  */
1752   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753     for (i = 0; i < dfa->edests[node].nelem; ++i)
1754       {
1755         re_node_set eclosure_elem;
1756         Idx edest = dfa->edests[node].elems[i];
1757         /* If calculating the epsilon closure of 'edest' is in progress,
1758            return intermediate result.  */
1759         if (dfa->eclosures[edest].nelem == REG_MISSING)
1760           {
1761             incomplete = true;
1762             continue;
1763           }
1764         /* If we haven't calculated the epsilon closure of 'edest' yet,
1765            calculate now. Otherwise use calculated epsilon closure.  */
1766         if (dfa->eclosures[edest].nelem == 0)
1767           {
1768             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769             if (BE (err != REG_NOERROR, 0))
1770               return err;
1771           }
1772         else
1773           eclosure_elem = dfa->eclosures[edest];
1774         /* Merge the epsilon closure of 'edest'.  */
1775         err = re_node_set_merge (&eclosure, &eclosure_elem);
1776         if (BE (err != REG_NOERROR, 0))
1777           return err;
1778         /* If the epsilon closure of 'edest' is incomplete,
1779            the epsilon closure of this node is also incomplete.  */
1780         if (dfa->eclosures[edest].nelem == 0)
1781           {
1782             incomplete = true;
1783             re_node_set_free (&eclosure_elem);
1784           }
1785       }
1786
1787   /* An epsilon closure includes itself.  */
1788   ok = re_node_set_insert (&eclosure, node);
1789   if (BE (! ok, 0))
1790     return REG_ESPACE;
1791   if (incomplete && !root)
1792     dfa->eclosures[node].nelem = 0;
1793   else
1794     dfa->eclosures[node] = eclosure;
1795   *new_set = eclosure;
1796   return REG_NOERROR;
1797 }
1798 \f
1799 /* Functions for token which are used in the parser.  */
1800
1801 /* Fetch a token from INPUT.
1802    We must not use this function inside bracket expressions.  */
1803
1804 static void
1805 internal_function
1806 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807 {
1808   re_string_skip_bytes (input, peek_token (result, input, syntax));
1809 }
1810
1811 /* Peek a token from INPUT, and return the length of the token.
1812    We must not use this function inside bracket expressions.  */
1813
1814 static int
1815 internal_function
1816 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817 {
1818   unsigned char c;
1819
1820   if (re_string_eoi (input))
1821     {
1822       token->type = END_OF_RE;
1823       return 0;
1824     }
1825
1826   c = re_string_peek_byte (input, 0);
1827   token->opr.c = c;
1828
1829   token->word_char = 0;
1830 #ifdef RE_ENABLE_I18N
1831   token->mb_partial = 0;
1832   if (input->mb_cur_max > 1 &&
1833       !re_string_first_byte (input, re_string_cur_idx (input)))
1834     {
1835       token->type = CHARACTER;
1836       token->mb_partial = 1;
1837       return 1;
1838     }
1839 #endif
1840   if (c == '\\')
1841     {
1842       unsigned char c2;
1843       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844         {
1845           token->type = BACK_SLASH;
1846           return 1;
1847         }
1848
1849       c2 = re_string_peek_byte_case (input, 1);
1850       token->opr.c = c2;
1851       token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853       if (input->mb_cur_max > 1)
1854         {
1855           wint_t wc = re_string_wchar_at (input,
1856                                           re_string_cur_idx (input) + 1);
1857           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858         }
1859       else
1860 #endif
1861         token->word_char = IS_WORD_CHAR (c2) != 0;
1862
1863       switch (c2)
1864         {
1865         case '|':
1866           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867             token->type = OP_ALT;
1868           break;
1869         case '1': case '2': case '3': case '4': case '5':
1870         case '6': case '7': case '8': case '9':
1871           if (!(syntax & RE_NO_BK_REFS))
1872             {
1873               token->type = OP_BACK_REF;
1874               token->opr.idx = c2 - '1';
1875             }
1876           break;
1877         case '<':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = WORD_FIRST;
1882             }
1883           break;
1884         case '>':
1885           if (!(syntax & RE_NO_GNU_OPS))
1886             {
1887               token->type = ANCHOR;
1888               token->opr.ctx_type = WORD_LAST;
1889             }
1890           break;
1891         case 'b':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = WORD_DELIM;
1896             }
1897           break;
1898         case 'B':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             {
1901               token->type = ANCHOR;
1902               token->opr.ctx_type = NOT_WORD_DELIM;
1903             }
1904           break;
1905         case 'w':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_WORD;
1908           break;
1909         case 'W':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             token->type = OP_NOTWORD;
1912           break;
1913         case 's':
1914           if (!(syntax & RE_NO_GNU_OPS))
1915             token->type = OP_SPACE;
1916           break;
1917         case 'S':
1918           if (!(syntax & RE_NO_GNU_OPS))
1919             token->type = OP_NOTSPACE;
1920           break;
1921         case '`':
1922           if (!(syntax & RE_NO_GNU_OPS))
1923             {
1924               token->type = ANCHOR;
1925               token->opr.ctx_type = BUF_FIRST;
1926             }
1927           break;
1928         case '\'':
1929           if (!(syntax & RE_NO_GNU_OPS))
1930             {
1931               token->type = ANCHOR;
1932               token->opr.ctx_type = BUF_LAST;
1933             }
1934           break;
1935         case '(':
1936           if (!(syntax & RE_NO_BK_PARENS))
1937             token->type = OP_OPEN_SUBEXP;
1938           break;
1939         case ')':
1940           if (!(syntax & RE_NO_BK_PARENS))
1941             token->type = OP_CLOSE_SUBEXP;
1942           break;
1943         case '+':
1944           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945             token->type = OP_DUP_PLUS;
1946           break;
1947         case '?':
1948           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949             token->type = OP_DUP_QUESTION;
1950           break;
1951         case '{':
1952           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953             token->type = OP_OPEN_DUP_NUM;
1954           break;
1955         case '}':
1956           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957             token->type = OP_CLOSE_DUP_NUM;
1958           break;
1959         default:
1960           break;
1961         }
1962       return 2;
1963     }
1964
1965   token->type = CHARACTER;
1966 #ifdef RE_ENABLE_I18N
1967   if (input->mb_cur_max > 1)
1968     {
1969       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971     }
1972   else
1973 #endif
1974     token->word_char = IS_WORD_CHAR (token->opr.c);
1975
1976   switch (c)
1977     {
1978     case '\n':
1979       if (syntax & RE_NEWLINE_ALT)
1980         token->type = OP_ALT;
1981       break;
1982     case '|':
1983       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984         token->type = OP_ALT;
1985       break;
1986     case '*':
1987       token->type = OP_DUP_ASTERISK;
1988       break;
1989     case '+':
1990       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991         token->type = OP_DUP_PLUS;
1992       break;
1993     case '?':
1994       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995         token->type = OP_DUP_QUESTION;
1996       break;
1997     case '{':
1998       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999         token->type = OP_OPEN_DUP_NUM;
2000       break;
2001     case '}':
2002       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003         token->type = OP_CLOSE_DUP_NUM;
2004       break;
2005     case '(':
2006       if (syntax & RE_NO_BK_PARENS)
2007         token->type = OP_OPEN_SUBEXP;
2008       break;
2009     case ')':
2010       if (syntax & RE_NO_BK_PARENS)
2011         token->type = OP_CLOSE_SUBEXP;
2012       break;
2013     case '[':
2014       token->type = OP_OPEN_BRACKET;
2015       break;
2016     case '.':
2017       token->type = OP_PERIOD;
2018       break;
2019     case '^':
2020       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021           re_string_cur_idx (input) != 0)
2022         {
2023           char prev = re_string_peek_byte (input, -1);
2024           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025             break;
2026         }
2027       token->type = ANCHOR;
2028       token->opr.ctx_type = LINE_FIRST;
2029       break;
2030     case '$':
2031       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032           re_string_cur_idx (input) + 1 != re_string_length (input))
2033         {
2034           re_token_t next;
2035           re_string_skip_bytes (input, 1);
2036           peek_token (&next, input, syntax);
2037           re_string_skip_bytes (input, -1);
2038           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039             break;
2040         }
2041       token->type = ANCHOR;
2042       token->opr.ctx_type = LINE_LAST;
2043       break;
2044     default:
2045       break;
2046     }
2047   return 1;
2048 }
2049
2050 /* Peek a token from INPUT, and return the length of the token.
2051    We must not use this function out of bracket expressions.  */
2052
2053 static int
2054 internal_function
2055 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056 {
2057   unsigned char c;
2058   if (re_string_eoi (input))
2059     {
2060       token->type = END_OF_RE;
2061       return 0;
2062     }
2063   c = re_string_peek_byte (input, 0);
2064   token->opr.c = c;
2065
2066 #ifdef RE_ENABLE_I18N
2067   if (input->mb_cur_max > 1 &&
2068       !re_string_first_byte (input, re_string_cur_idx (input)))
2069     {
2070       token->type = CHARACTER;
2071       return 1;
2072     }
2073 #endif /* RE_ENABLE_I18N */
2074
2075   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076       && re_string_cur_idx (input) + 1 < re_string_length (input))
2077     {
2078       /* In this case, '\' escape a character.  */
2079       unsigned char c2;
2080       re_string_skip_bytes (input, 1);
2081       c2 = re_string_peek_byte (input, 0);
2082       token->opr.c = c2;
2083       token->type = CHARACTER;
2084       return 1;
2085     }
2086   if (c == '[') /* '[' is a special char in a bracket exps.  */
2087     {
2088       unsigned char c2;
2089       int token_len;
2090       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091         c2 = re_string_peek_byte (input, 1);
2092       else
2093         c2 = 0;
2094       token->opr.c = c2;
2095       token_len = 2;
2096       switch (c2)
2097         {
2098         case '.':
2099           token->type = OP_OPEN_COLL_ELEM;
2100           break;
2101         case '=':
2102           token->type = OP_OPEN_EQUIV_CLASS;
2103           break;
2104         case ':':
2105           if (syntax & RE_CHAR_CLASSES)
2106             {
2107               token->type = OP_OPEN_CHAR_CLASS;
2108               break;
2109             }
2110           /* else fall through.  */
2111         default:
2112           token->type = CHARACTER;
2113           token->opr.c = c;
2114           token_len = 1;
2115           break;
2116         }
2117       return token_len;
2118     }
2119   switch (c)
2120     {
2121     case '-':
2122       token->type = OP_CHARSET_RANGE;
2123       break;
2124     case ']':
2125       token->type = OP_CLOSE_BRACKET;
2126       break;
2127     case '^':
2128       token->type = OP_NON_MATCH_LIST;
2129       break;
2130     default:
2131       token->type = CHARACTER;
2132     }
2133   return 1;
2134 }
2135 \f
2136 /* Functions for parser.  */
2137
2138 /* Entry point of the parser.
2139    Parse the regular expression REGEXP and return the structure tree.
2140    If an error occurs, ERR is set by error code, and return NULL.
2141    This function build the following tree, from regular expression <reg_exp>:
2142            CAT
2143            / \
2144           /   \
2145    <reg_exp>  EOR
2146
2147    CAT means concatenation.
2148    EOR means end of regular expression.  */
2149
2150 static bin_tree_t *
2151 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152        reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = preg->buffer;
2155   bin_tree_t *tree, *eor, *root;
2156   re_token_t current_token;
2157   dfa->syntax = syntax;
2158   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161     return NULL;
2162   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163   if (tree != NULL)
2164     root = create_tree (dfa, tree, eor, CONCAT);
2165   else
2166     root = eor;
2167   if (BE (eor == NULL || root == NULL, 0))
2168     {
2169       *err = REG_ESPACE;
2170       return NULL;
2171     }
2172   return root;
2173 }
2174
2175 /* This function build the following tree, from regular expression
2176    <branch1>|<branch2>:
2177            ALT
2178            / \
2179           /   \
2180    <branch1> <branch2>
2181
2182    ALT means alternative, which represents the operator '|'.  */
2183
2184 static bin_tree_t *
2185 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 {
2188   re_dfa_t *dfa = preg->buffer;
2189   bin_tree_t *tree, *branch = NULL;
2190   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2191   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192     return NULL;
2193
2194   while (token->type == OP_ALT)
2195     {
2196       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2197       if (token->type != OP_ALT && token->type != END_OF_RE
2198           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199         {
2200           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2201           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202             return NULL;
2203         }
2204       else
2205         branch = NULL;
2206       tree = create_tree (dfa, tree, branch, OP_ALT);
2207       if (BE (tree == NULL, 0))
2208         {
2209           *err = REG_ESPACE;
2210           return NULL;
2211         }
2212     }
2213   return tree;
2214 }
2215
2216 /* This function build the following tree, from regular expression
2217    <exp1><exp2>:
2218         CAT
2219         / \
2220        /   \
2221    <exp1> <exp2>
2222
2223    CAT means concatenation.  */
2224
2225 static bin_tree_t *
2226 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228 {
2229   bin_tree_t *tree, *expr;
2230   re_dfa_t *dfa = preg->buffer;
2231   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2232   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2233     return NULL;
2234
2235   while (token->type != OP_ALT && token->type != END_OF_RE
2236          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2237     {
2238       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2239       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2240         {
2241           if (tree != NULL)
2242             postorder (tree, free_tree, NULL);
2243           return NULL;
2244         }
2245       if (tree != NULL && expr != NULL)
2246         {
2247           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2248           if (newtree == NULL)
2249             {
2250               postorder (expr, free_tree, NULL);
2251               postorder (tree, free_tree, NULL);
2252               *err = REG_ESPACE;
2253               return NULL;
2254             }
2255           tree = newtree;
2256         }
2257       else if (tree == NULL)
2258         tree = expr;
2259       /* Otherwise expr == NULL, we don't need to create new tree.  */
2260     }
2261   return tree;
2262 }
2263
2264 /* This function build the following tree, from regular expression a*:
2265          *
2266          |
2267          a
2268 */
2269
2270 static bin_tree_t *
2271 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2272                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2273 {
2274   re_dfa_t *dfa = preg->buffer;
2275   bin_tree_t *tree;
2276   switch (token->type)
2277     {
2278     case CHARACTER:
2279       tree = create_token_tree (dfa, NULL, NULL, token);
2280       if (BE (tree == NULL, 0))
2281         {
2282           *err = REG_ESPACE;
2283           return NULL;
2284         }
2285 #ifdef RE_ENABLE_I18N
2286       if (dfa->mb_cur_max > 1)
2287         {
2288           while (!re_string_eoi (regexp)
2289                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2290             {
2291               bin_tree_t *mbc_remain;
2292               fetch_token (token, regexp, syntax);
2293               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2294               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2295               if (BE (mbc_remain == NULL || tree == NULL, 0))
2296                 {
2297                   *err = REG_ESPACE;
2298                   return NULL;
2299                 }
2300             }
2301         }
2302 #endif
2303       break;
2304     case OP_OPEN_SUBEXP:
2305       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2306       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307         return NULL;
2308       break;
2309     case OP_OPEN_BRACKET:
2310       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2311       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312         return NULL;
2313       break;
2314     case OP_BACK_REF:
2315       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2316         {
2317           *err = REG_ESUBREG;
2318           return NULL;
2319         }
2320       dfa->used_bkref_map |= 1 << token->opr.idx;
2321       tree = create_token_tree (dfa, NULL, NULL, token);
2322       if (BE (tree == NULL, 0))
2323         {
2324           *err = REG_ESPACE;
2325           return NULL;
2326         }
2327       ++dfa->nbackref;
2328       dfa->has_mb_node = 1;
2329       break;
2330     case OP_OPEN_DUP_NUM:
2331       if (syntax & RE_CONTEXT_INVALID_DUP)
2332         {
2333           *err = REG_BADRPT;
2334           return NULL;
2335         }
2336       /* FALLTHROUGH */
2337     case OP_DUP_ASTERISK:
2338     case OP_DUP_PLUS:
2339     case OP_DUP_QUESTION:
2340       if (syntax & RE_CONTEXT_INVALID_OPS)
2341         {
2342           *err = REG_BADRPT;
2343           return NULL;
2344         }
2345       else if (syntax & RE_CONTEXT_INDEP_OPS)
2346         {
2347           fetch_token (token, regexp, syntax);
2348           return parse_expression (regexp, preg, token, syntax, nest, err);
2349         }
2350       /* else fall through  */
2351     case OP_CLOSE_SUBEXP:
2352       if ((token->type == OP_CLOSE_SUBEXP) &&
2353           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2354         {
2355           *err = REG_ERPAREN;
2356           return NULL;
2357         }
2358       /* else fall through  */
2359     case OP_CLOSE_DUP_NUM:
2360       /* We treat it as a normal character.  */
2361
2362       /* Then we can these characters as normal characters.  */
2363       token->type = CHARACTER;
2364       /* mb_partial and word_char bits should be initialized already
2365          by peek_token.  */
2366       tree = create_token_tree (dfa, NULL, NULL, token);
2367       if (BE (tree == NULL, 0))
2368         {
2369           *err = REG_ESPACE;
2370           return NULL;
2371         }
2372       break;
2373     case ANCHOR:
2374       if ((token->opr.ctx_type
2375            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2376           && dfa->word_ops_used == 0)
2377         init_word_char (dfa);
2378       if (token->opr.ctx_type == WORD_DELIM
2379           || token->opr.ctx_type == NOT_WORD_DELIM)
2380         {
2381           bin_tree_t *tree_first, *tree_last;
2382           if (token->opr.ctx_type == WORD_DELIM)
2383             {
2384               token->opr.ctx_type = WORD_FIRST;
2385               tree_first = create_token_tree (dfa, NULL, NULL, token);
2386               token->opr.ctx_type = WORD_LAST;
2387             }
2388           else
2389             {
2390               token->opr.ctx_type = INSIDE_WORD;
2391               tree_first = create_token_tree (dfa, NULL, NULL, token);
2392               token->opr.ctx_type = INSIDE_NOTWORD;
2393             }
2394           tree_last = create_token_tree (dfa, NULL, NULL, token);
2395           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2396           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2397             {
2398               *err = REG_ESPACE;
2399               return NULL;
2400             }
2401         }
2402       else
2403         {
2404           tree = create_token_tree (dfa, NULL, NULL, token);
2405           if (BE (tree == NULL, 0))
2406             {
2407               *err = REG_ESPACE;
2408               return NULL;
2409             }
2410         }
2411       /* We must return here, since ANCHORs can't be followed
2412          by repetition operators.
2413          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2414              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2415       fetch_token (token, regexp, syntax);
2416       return tree;
2417     case OP_PERIOD:
2418       tree = create_token_tree (dfa, NULL, NULL, token);
2419       if (BE (tree == NULL, 0))
2420         {
2421           *err = REG_ESPACE;
2422           return NULL;
2423         }
2424       if (dfa->mb_cur_max > 1)
2425         dfa->has_mb_node = 1;
2426       break;
2427     case OP_WORD:
2428     case OP_NOTWORD:
2429       tree = build_charclass_op (dfa, regexp->trans,
2430                                  "alnum",
2431                                  "_",
2432                                  token->type == OP_NOTWORD, err);
2433       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2434         return NULL;
2435       break;
2436     case OP_SPACE:
2437     case OP_NOTSPACE:
2438       tree = build_charclass_op (dfa, regexp->trans,
2439                                  "space",
2440                                  "",
2441                                  token->type == OP_NOTSPACE, err);
2442       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443         return NULL;
2444       break;
2445     case OP_ALT:
2446     case END_OF_RE:
2447       return NULL;
2448     case BACK_SLASH:
2449       *err = REG_EESCAPE;
2450       return NULL;
2451     default:
2452       /* Must not happen?  */
2453 #ifdef DEBUG
2454       assert (0);
2455 #endif
2456       return NULL;
2457     }
2458   fetch_token (token, regexp, syntax);
2459
2460   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2461          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2462     {
2463       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2464       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2465         return NULL;
2466       /* In BRE consecutive duplications are not allowed.  */
2467       if ((syntax & RE_CONTEXT_INVALID_DUP)
2468           && (token->type == OP_DUP_ASTERISK
2469               || token->type == OP_OPEN_DUP_NUM))
2470         {
2471           *err = REG_BADRPT;
2472           return NULL;
2473         }
2474     }
2475
2476   return tree;
2477 }
2478
2479 /* This function build the following tree, from regular expression
2480    (<reg_exp>):
2481          SUBEXP
2482             |
2483         <reg_exp>
2484 */
2485
2486 static bin_tree_t *
2487 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2488                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489 {
2490   re_dfa_t *dfa = preg->buffer;
2491   bin_tree_t *tree;
2492   size_t cur_nsub;
2493   cur_nsub = preg->re_nsub++;
2494
2495   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496
2497   /* The subexpression may be a null string.  */
2498   if (token->type == OP_CLOSE_SUBEXP)
2499     tree = NULL;
2500   else
2501     {
2502       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2504         {
2505           if (tree != NULL)
2506             postorder (tree, free_tree, NULL);
2507           *err = REG_EPAREN;
2508         }
2509       if (BE (*err != REG_NOERROR, 0))
2510         return NULL;
2511     }
2512
2513   if (cur_nsub <= '9' - '1')
2514     dfa->completed_bkref_map |= 1 << cur_nsub;
2515
2516   tree = create_tree (dfa, tree, NULL, SUBEXP);
2517   if (BE (tree == NULL, 0))
2518     {
2519       *err = REG_ESPACE;
2520       return NULL;
2521     }
2522   tree->token.opr.idx = cur_nsub;
2523   return tree;
2524 }
2525
2526 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2527
2528 static bin_tree_t *
2529 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2530               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2531 {
2532   bin_tree_t *tree = NULL, *old_tree = NULL;
2533   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2534   re_token_t start_token = *token;
2535
2536   if (token->type == OP_OPEN_DUP_NUM)
2537     {
2538       end = 0;
2539       start = fetch_number (regexp, token, syntax);
2540       if (start == REG_MISSING)
2541         {
2542           if (token->type == CHARACTER && token->opr.c == ',')
2543             start = 0; /* We treat "{,m}" as "{0,m}".  */
2544           else
2545             {
2546               *err = REG_BADBR; /* <re>{} is invalid.  */
2547               return NULL;
2548             }
2549         }
2550       if (BE (start != REG_ERROR, 1))
2551         {
2552           /* We treat "{n}" as "{n,n}".  */
2553           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2554                  : ((token->type == CHARACTER && token->opr.c == ',')
2555                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2556         }
2557       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2558         {
2559           /* Invalid sequence.  */
2560           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2561             {
2562               if (token->type == END_OF_RE)
2563                 *err = REG_EBRACE;
2564               else
2565                 *err = REG_BADBR;
2566
2567               return NULL;
2568             }
2569
2570           /* If the syntax bit is set, rollback.  */
2571           re_string_set_index (regexp, start_idx);
2572           *token = start_token;
2573           token->type = CHARACTER;
2574           /* mb_partial and word_char bits should be already initialized by
2575              peek_token.  */
2576           return elem;
2577         }
2578
2579       if (BE ((end != REG_MISSING && start > end)
2580               || token->type != OP_CLOSE_DUP_NUM, 0))
2581         {
2582           /* First number greater than second.  */
2583           *err = REG_BADBR;
2584           return NULL;
2585         }
2586
2587       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2588         {
2589           *err = REG_ESIZE;
2590           return NULL;
2591         }
2592     }
2593   else
2594     {
2595       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2596       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2597     }
2598
2599   fetch_token (token, regexp, syntax);
2600
2601   if (BE (elem == NULL, 0))
2602     return NULL;
2603   if (BE (start == 0 && end == 0, 0))
2604     {
2605       postorder (elem, free_tree, NULL);
2606       return NULL;
2607     }
2608
2609   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2610   if (BE (start > 0, 0))
2611     {
2612       tree = elem;
2613       for (i = 2; i <= start; ++i)
2614         {
2615           elem = duplicate_tree (elem, dfa);
2616           tree = create_tree (dfa, tree, elem, CONCAT);
2617           if (BE (elem == NULL || tree == NULL, 0))
2618             goto parse_dup_op_espace;
2619         }
2620
2621       if (start == end)
2622         return tree;
2623
2624       /* Duplicate ELEM before it is marked optional.  */
2625       elem = duplicate_tree (elem, dfa);
2626       old_tree = tree;
2627     }
2628   else
2629     old_tree = NULL;
2630
2631   if (elem->token.type == SUBEXP)
2632     {
2633       uintptr_t subidx = elem->token.opr.idx;
2634       postorder (elem, mark_opt_subexp, (void *) subidx);
2635     }
2636
2637   tree = create_tree (dfa, elem, NULL,
2638                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2639   if (BE (tree == NULL, 0))
2640     goto parse_dup_op_espace;
2641
2642 /* From gnulib's "intprops.h":
2643    True if the arithmetic type T is signed.  */
2644 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2645
2646   /* This loop is actually executed only when end != REG_MISSING,
2647      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2648      already created the start+1-th copy.  */
2649   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2650     for (i = start + 2; i <= end; ++i)
2651       {
2652         elem = duplicate_tree (elem, dfa);
2653         tree = create_tree (dfa, tree, elem, CONCAT);
2654         if (BE (elem == NULL || tree == NULL, 0))
2655           goto parse_dup_op_espace;
2656
2657         tree = create_tree (dfa, tree, NULL, OP_ALT);
2658         if (BE (tree == NULL, 0))
2659           goto parse_dup_op_espace;
2660       }
2661
2662   if (old_tree)
2663     tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665   return tree;
2666
2667  parse_dup_op_espace:
2668   *err = REG_ESPACE;
2669   return NULL;
2670 }
2671
2672 /* Size of the names for collating symbol/equivalence_class/character_class.
2673    I'm not sure, but maybe enough.  */
2674 #define BRACKET_NAME_BUF_SIZE 32
2675
2676 #ifndef _LIBC
2677   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678      Build the range expression which starts from START_ELEM, and ends
2679      at END_ELEM.  The result are written to MBCSET and SBCSET.
2680      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681      mbcset->range_ends, is a pointer argument since we may
2682      update it.  */
2683
2684 static reg_errcode_t
2685 internal_function
2686 # ifdef RE_ENABLE_I18N
2687 build_range_exp (const reg_syntax_t syntax,
2688                  bitset_t sbcset,
2689                  re_charset_t *mbcset,
2690                  Idx *range_alloc,
2691                  const bracket_elem_t *start_elem,
2692                  const bracket_elem_t *end_elem)
2693 # else /* not RE_ENABLE_I18N */
2694 build_range_exp (const reg_syntax_t syntax,
2695                  bitset_t sbcset,
2696                  const bracket_elem_t *start_elem,
2697                  const bracket_elem_t *end_elem)
2698 # endif /* not RE_ENABLE_I18N */
2699 {
2700   unsigned int start_ch, end_ch;
2701   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2702   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2703           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2704           0))
2705     return REG_ERANGE;
2706
2707   /* We can handle no multi character collating elements without libc
2708      support.  */
2709   if (BE ((start_elem->type == COLL_SYM
2710            && strlen ((char *) start_elem->opr.name) > 1)
2711           || (end_elem->type == COLL_SYM
2712               && strlen ((char *) end_elem->opr.name) > 1), 0))
2713     return REG_ECOLLATE;
2714
2715 # ifdef RE_ENABLE_I18N
2716   {
2717     wchar_t wc;
2718     wint_t start_wc;
2719     wint_t end_wc;
2720
2721     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2722                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2723                    : 0));
2724     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2725               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2726                  : 0));
2727     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2728                 ? __btowc (start_ch) : start_elem->opr.wch);
2729     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2730               ? __btowc (end_ch) : end_elem->opr.wch);
2731     if (start_wc == WEOF || end_wc == WEOF)
2732       return REG_ECOLLATE;
2733     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2734       return REG_ERANGE;
2735
2736     /* Got valid collation sequence values, add them as a new entry.
2737        However, for !_LIBC we have no collation elements: if the
2738        character set is single byte, the single byte character set
2739        that we build below suffices.  parse_bracket_exp passes
2740        no MBCSET if dfa->mb_cur_max == 1.  */
2741     if (mbcset)
2742       {
2743         /* Check the space of the arrays.  */
2744         if (BE (*range_alloc == mbcset->nranges, 0))
2745           {
2746             /* There is not enough space, need realloc.  */
2747             wchar_t *new_array_start, *new_array_end;
2748             Idx new_nranges;
2749
2750             /* +1 in case of mbcset->nranges is 0.  */
2751             new_nranges = 2 * mbcset->nranges + 1;
2752             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753                are NULL if *range_alloc == 0.  */
2754             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2755                                           new_nranges);
2756             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2757                                         new_nranges);
2758
2759             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2760               return REG_ESPACE;
2761
2762             mbcset->range_starts = new_array_start;
2763             mbcset->range_ends = new_array_end;
2764             *range_alloc = new_nranges;
2765           }
2766
2767         mbcset->range_starts[mbcset->nranges] = start_wc;
2768         mbcset->range_ends[mbcset->nranges++] = end_wc;
2769       }
2770
2771     /* Build the table for single byte characters.  */
2772     for (wc = 0; wc < SBC_MAX; ++wc)
2773       {
2774         if (start_wc <= wc && wc <= end_wc)
2775           bitset_set (sbcset, wc);
2776       }
2777   }
2778 # else /* not RE_ENABLE_I18N */
2779   {
2780     unsigned int ch;
2781     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2782                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2783                    : 0));
2784     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2785               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2786                  : 0));
2787     if (start_ch > end_ch)
2788       return REG_ERANGE;
2789     /* Build the table for single byte characters.  */
2790     for (ch = 0; ch < SBC_MAX; ++ch)
2791       if (start_ch <= ch  && ch <= end_ch)
2792         bitset_set (sbcset, ch);
2793   }
2794 # endif /* not RE_ENABLE_I18N */
2795   return REG_NOERROR;
2796 }
2797 #endif /* not _LIBC */
2798
2799 #ifndef _LIBC
2800 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2801    Build the collating element which is represented by NAME.
2802    The result are written to MBCSET and SBCSET.
2803    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2804    pointer argument since we may update it.  */
2805
2806 static reg_errcode_t
2807 internal_function
2808 # ifdef RE_ENABLE_I18N
2809 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2810                         Idx *coll_sym_alloc, const unsigned char *name)
2811 # else /* not RE_ENABLE_I18N */
2812 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2813 # endif /* not RE_ENABLE_I18N */
2814 {
2815   size_t name_len = strlen ((const char *) name);
2816   if (BE (name_len != 1, 0))
2817     return REG_ECOLLATE;
2818   else
2819     {
2820       bitset_set (sbcset, name[0]);
2821       return REG_NOERROR;
2822     }
2823 }
2824 #endif /* not _LIBC */
2825
2826 /* This function parse bracket expression like "[abc]", "[a-c]",
2827    "[[.a-a.]]" etc.  */
2828
2829 static bin_tree_t *
2830 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2831                    reg_syntax_t syntax, reg_errcode_t *err)
2832 {
2833 #ifdef _LIBC
2834   const unsigned char *collseqmb;
2835   const char *collseqwc;
2836   uint32_t nrules;
2837   int32_t table_size;
2838   const int32_t *symb_table;
2839   const unsigned char *extra;
2840
2841   /* Local function for parse_bracket_exp used in _LIBC environment.
2842      Seek the collating symbol entry corresponding to NAME.
2843      Return the index of the symbol in the SYMB_TABLE,
2844      or -1 if not found.  */
2845
2846   auto inline int32_t
2847   __attribute__ ((always_inline))
2848   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2849     {
2850       int32_t elem;
2851
2852       for (elem = 0; elem < table_size; elem++)
2853         if (symb_table[2 * elem] != 0)
2854           {
2855             int32_t idx = symb_table[2 * elem + 1];
2856             /* Skip the name of collating element name.  */
2857             idx += 1 + extra[idx];
2858             if (/* Compare the length of the name.  */
2859                 name_len == extra[idx]
2860                 /* Compare the name.  */
2861                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2862               /* Yep, this is the entry.  */
2863               return elem;
2864           }
2865       return -1;
2866     }
2867
2868   /* Local function for parse_bracket_exp used in _LIBC environment.
2869      Look up the collation sequence value of BR_ELEM.
2870      Return the value if succeeded, UINT_MAX otherwise.  */
2871
2872   auto inline unsigned int
2873   __attribute__ ((always_inline))
2874   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2875     {
2876       if (br_elem->type == SB_CHAR)
2877         {
2878           /*
2879           if (MB_CUR_MAX == 1)
2880           */
2881           if (nrules == 0)
2882             return collseqmb[br_elem->opr.ch];
2883           else
2884             {
2885               wint_t wc = __btowc (br_elem->opr.ch);
2886               return __collseq_table_lookup (collseqwc, wc);
2887             }
2888         }
2889       else if (br_elem->type == MB_CHAR)
2890         {
2891           if (nrules != 0)
2892             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893         }
2894       else if (br_elem->type == COLL_SYM)
2895         {
2896           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897           if (nrules != 0)
2898             {
2899               int32_t elem, idx;
2900               elem = seek_collating_symbol_entry (br_elem->opr.name,
2901                                                   sym_name_len);
2902               if (elem != -1)
2903                 {
2904                   /* We found the entry.  */
2905                   idx = symb_table[2 * elem + 1];
2906                   /* Skip the name of collating element name.  */
2907                   idx += 1 + extra[idx];
2908                   /* Skip the byte sequence of the collating element.  */
2909                   idx += 1 + extra[idx];
2910                   /* Adjust for the alignment.  */
2911                   idx = (idx + 3) & ~3;
2912                   /* Skip the multibyte collation sequence value.  */
2913                   idx += sizeof (unsigned int);
2914                   /* Skip the wide char sequence of the collating element.  */
2915                   idx += sizeof (unsigned int) *
2916                     (1 + *(unsigned int *) (extra + idx));
2917                   /* Return the collation sequence value.  */
2918                   return *(unsigned int *) (extra + idx);
2919                 }
2920               else if (sym_name_len == 1)
2921                 {
2922                   /* No valid character.  Match it as a single byte
2923                      character.  */
2924                   return collseqmb[br_elem->opr.name[0]];
2925                 }
2926             }
2927           else if (sym_name_len == 1)
2928             return collseqmb[br_elem->opr.name[0]];
2929         }
2930       return UINT_MAX;
2931     }
2932
2933   /* Local function for parse_bracket_exp used in _LIBC environment.
2934      Build the range expression which starts from START_ELEM, and ends
2935      at END_ELEM.  The result are written to MBCSET and SBCSET.
2936      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937      mbcset->range_ends, is a pointer argument since we may
2938      update it.  */
2939
2940   auto inline reg_errcode_t
2941   __attribute__ ((always_inline))
2942   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2944     {
2945       unsigned int ch;
2946       uint32_t start_collseq;
2947       uint32_t end_collseq;
2948
2949       /* Equivalence Classes and Character Classes can't be a range
2950          start/end.  */
2951       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2952               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2953               0))
2954         return REG_ERANGE;
2955
2956       /* FIXME: Implement rational ranges here, too.  */
2957       start_collseq = lookup_collation_sequence_value (start_elem);
2958       end_collseq = lookup_collation_sequence_value (end_elem);
2959       /* Check start/end collation sequence values.  */
2960       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961         return REG_ECOLLATE;
2962       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2963         return REG_ERANGE;
2964
2965       /* Got valid collation sequence values, add them as a new entry.
2966          However, if we have no collation elements, and the character set
2967          is single byte, the single byte character set that we
2968          build below suffices. */
2969       if (nrules > 0 || dfa->mb_cur_max > 1)
2970         {
2971           /* Check the space of the arrays.  */
2972           if (BE (*range_alloc == mbcset->nranges, 0))
2973             {
2974               /* There is not enough space, need realloc.  */
2975               uint32_t *new_array_start;
2976               uint32_t *new_array_end;
2977               Idx new_nranges;
2978
2979               /* +1 in case of mbcset->nranges is 0.  */
2980               new_nranges = 2 * mbcset->nranges + 1;
2981               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2982                                             new_nranges);
2983               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2984                                           new_nranges);
2985
2986               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2987                 return REG_ESPACE;
2988
2989               mbcset->range_starts = new_array_start;
2990               mbcset->range_ends = new_array_end;
2991               *range_alloc = new_nranges;
2992             }
2993
2994           mbcset->range_starts[mbcset->nranges] = start_collseq;
2995           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2996         }
2997
2998       /* Build the table for single byte characters.  */
2999       for (ch = 0; ch < SBC_MAX; ch++)
3000         {
3001           uint32_t ch_collseq;
3002           /*
3003           if (MB_CUR_MAX == 1)
3004           */
3005           if (nrules == 0)
3006             ch_collseq = collseqmb[ch];
3007           else
3008             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010             bitset_set (sbcset, ch);
3011         }
3012       return REG_NOERROR;
3013     }
3014
3015   /* Local function for parse_bracket_exp used in _LIBC environment.
3016      Build the collating element which is represented by NAME.
3017      The result are written to MBCSET and SBCSET.
3018      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019      pointer argument since we may update it.  */
3020
3021   auto inline reg_errcode_t
3022   __attribute__ ((always_inline))
3023   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3024                           Idx *coll_sym_alloc, const unsigned char *name)
3025     {
3026       int32_t elem, idx;
3027       size_t name_len = strlen ((const char *) name);
3028       if (nrules != 0)
3029         {
3030           elem = seek_collating_symbol_entry (name, name_len);
3031           if (elem != -1)
3032             {
3033               /* We found the entry.  */
3034               idx = symb_table[2 * elem + 1];
3035               /* Skip the name of collating element name.  */
3036               idx += 1 + extra[idx];
3037             }
3038           else if (name_len == 1)
3039             {
3040               /* No valid character, treat it as a normal
3041                  character.  */
3042               bitset_set (sbcset, name[0]);
3043               return REG_NOERROR;
3044             }
3045           else
3046             return REG_ECOLLATE;
3047
3048           /* Got valid collation sequence, add it as a new entry.  */
3049           /* Check the space of the arrays.  */
3050           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3051             {
3052               /* Not enough, realloc it.  */
3053               /* +1 in case of mbcset->ncoll_syms is 0.  */
3054               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3055               /* Use realloc since mbcset->coll_syms is NULL
3056                  if *alloc == 0.  */
3057               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3058                                                    new_coll_sym_alloc);
3059               if (BE (new_coll_syms == NULL, 0))
3060                 return REG_ESPACE;
3061               mbcset->coll_syms = new_coll_syms;
3062               *coll_sym_alloc = new_coll_sym_alloc;
3063             }
3064           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3065           return REG_NOERROR;
3066         }
3067       else
3068         {
3069           if (BE (name_len != 1, 0))
3070             return REG_ECOLLATE;
3071           else
3072             {
3073               bitset_set (sbcset, name[0]);
3074               return REG_NOERROR;
3075             }
3076         }
3077     }
3078 #endif
3079
3080   re_token_t br_token;
3081   re_bitset_ptr_t sbcset;
3082 #ifdef RE_ENABLE_I18N
3083   re_charset_t *mbcset;
3084   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3085   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3086 #endif /* not RE_ENABLE_I18N */
3087   bool non_match = false;
3088   bin_tree_t *work_tree;
3089   int token_len;
3090   bool first_round = true;
3091 #ifdef _LIBC
3092   collseqmb = (const unsigned char *)
3093     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3094   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3095   if (nrules)
3096     {
3097       /*
3098       if (MB_CUR_MAX > 1)
3099       */
3100       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3101       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3102       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3103                                                   _NL_COLLATE_SYMB_TABLEMB);
3104       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3105                                                    _NL_COLLATE_SYMB_EXTRAMB);
3106     }
3107 #endif
3108   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3109 #ifdef RE_ENABLE_I18N
3110   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3111 #endif /* RE_ENABLE_I18N */
3112 #ifdef RE_ENABLE_I18N
3113   if (BE (sbcset == NULL || mbcset == NULL, 0))
3114 #else
3115   if (BE (sbcset == NULL, 0))
3116 #endif /* RE_ENABLE_I18N */
3117     {
3118       re_free (sbcset);
3119 #ifdef RE_ENABLE_I18N
3120       re_free (mbcset);
3121 #endif
3122       *err = REG_ESPACE;
3123       return NULL;
3124     }
3125
3126   token_len = peek_token_bracket (token, regexp, syntax);
3127   if (BE (token->type == END_OF_RE, 0))
3128     {
3129       *err = REG_BADPAT;
3130       goto parse_bracket_exp_free_return;
3131     }
3132   if (token->type == OP_NON_MATCH_LIST)
3133     {
3134 #ifdef RE_ENABLE_I18N
3135       mbcset->non_match = 1;
3136 #endif /* not RE_ENABLE_I18N */
3137       non_match = true;
3138       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3139         bitset_set (sbcset, '\n');
3140       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3141       token_len = peek_token_bracket (token, regexp, syntax);
3142       if (BE (token->type == END_OF_RE, 0))
3143         {
3144           *err = REG_BADPAT;
3145           goto parse_bracket_exp_free_return;
3146         }
3147     }
3148
3149   /* We treat the first ']' as a normal character.  */
3150   if (token->type == OP_CLOSE_BRACKET)
3151     token->type = CHARACTER;
3152
3153   while (1)
3154     {
3155       bracket_elem_t start_elem, end_elem;
3156       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3157       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3158       reg_errcode_t ret;
3159       int token_len2 = 0;
3160       bool is_range_exp = false;
3161       re_token_t token2;
3162
3163       start_elem.opr.name = start_name_buf;
3164       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165                                    syntax, first_round);
3166       if (BE (ret != REG_NOERROR, 0))
3167         {
3168           *err = ret;
3169           goto parse_bracket_exp_free_return;
3170         }
3171       first_round = false;
3172
3173       /* Get information about the next token.  We need it in any case.  */
3174       token_len = peek_token_bracket (token, regexp, syntax);
3175
3176       /* Do not check for ranges if we know they are not allowed.  */
3177       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178         {
3179           if (BE (token->type == END_OF_RE, 0))
3180             {
3181               *err = REG_EBRACK;
3182               goto parse_bracket_exp_free_return;
3183             }
3184           if (token->type == OP_CHARSET_RANGE)
3185             {
3186               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3187               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188               if (BE (token2.type == END_OF_RE, 0))
3189                 {
3190                   *err = REG_EBRACK;
3191                   goto parse_bracket_exp_free_return;
3192                 }
3193               if (token2.type == OP_CLOSE_BRACKET)
3194                 {
3195                   /* We treat the last '-' as a normal character.  */
3196                   re_string_skip_bytes (regexp, -token_len);
3197                   token->type = CHARACTER;
3198                 }
3199               else
3200                 is_range_exp = true;
3201             }
3202         }
3203
3204       if (is_range_exp == true)
3205         {
3206           end_elem.opr.name = end_name_buf;
3207           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208                                        dfa, syntax, true);
3209           if (BE (ret != REG_NOERROR, 0))
3210             {
3211               *err = ret;
3212               goto parse_bracket_exp_free_return;
3213             }
3214
3215           token_len = peek_token_bracket (token, regexp, syntax);
3216
3217 #ifdef _LIBC
3218           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3219                                   &start_elem, &end_elem);
3220 #else
3221 # ifdef RE_ENABLE_I18N
3222           *err = build_range_exp (syntax, sbcset,
3223                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3224                                   &range_alloc, &start_elem, &end_elem);
3225 # else
3226           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3227 # endif
3228 #endif /* RE_ENABLE_I18N */
3229           if (BE (*err != REG_NOERROR, 0))
3230             goto parse_bracket_exp_free_return;
3231         }
3232       else
3233         {
3234           switch (start_elem.type)
3235             {
3236             case SB_CHAR:
3237               bitset_set (sbcset, start_elem.opr.ch);
3238               break;
3239 #ifdef RE_ENABLE_I18N
3240             case MB_CHAR:
3241               /* Check whether the array has enough space.  */
3242               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243                 {
3244                   wchar_t *new_mbchars;
3245                   /* Not enough, realloc it.  */
3246                   /* +1 in case of mbcset->nmbchars is 0.  */
3247                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248                   /* Use realloc since array is NULL if *alloc == 0.  */
3249                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250                                             mbchar_alloc);
3251                   if (BE (new_mbchars == NULL, 0))
3252                     goto parse_bracket_exp_espace;
3253                   mbcset->mbchars = new_mbchars;
3254                 }
3255               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256               break;
3257 #endif /* RE_ENABLE_I18N */
3258             case EQUIV_CLASS:
3259               *err = build_equiv_class (sbcset,
3260 #ifdef RE_ENABLE_I18N
3261                                         mbcset, &equiv_class_alloc,
3262 #endif /* RE_ENABLE_I18N */
3263                                         start_elem.opr.name);
3264               if (BE (*err != REG_NOERROR, 0))
3265                 goto parse_bracket_exp_free_return;
3266               break;
3267             case COLL_SYM:
3268               *err = build_collating_symbol (sbcset,
3269 #ifdef RE_ENABLE_I18N
3270                                              mbcset, &coll_sym_alloc,
3271 #endif /* RE_ENABLE_I18N */
3272                                              start_elem.opr.name);
3273               if (BE (*err != REG_NOERROR, 0))
3274                 goto parse_bracket_exp_free_return;
3275               break;
3276             case CHAR_CLASS:
3277               *err = build_charclass (regexp->trans, sbcset,
3278 #ifdef RE_ENABLE_I18N
3279                                       mbcset, &char_class_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281                                       (const char *) start_elem.opr.name,
3282                                       syntax);
3283               if (BE (*err != REG_NOERROR, 0))
3284                goto parse_bracket_exp_free_return;
3285               break;
3286             default:
3287               assert (0);
3288               break;
3289             }
3290         }
3291       if (BE (token->type == END_OF_RE, 0))
3292         {
3293           *err = REG_EBRACK;
3294           goto parse_bracket_exp_free_return;
3295         }
3296       if (token->type == OP_CLOSE_BRACKET)
3297         break;
3298     }
3299
3300   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3301
3302   /* If it is non-matching list.  */
3303   if (non_match)
3304     bitset_not (sbcset);
3305
3306 #ifdef RE_ENABLE_I18N
3307   /* Ensure only single byte characters are set.  */
3308   if (dfa->mb_cur_max > 1)
3309     bitset_mask (sbcset, dfa->sb_char);
3310
3311   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3312       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3313                                                      || mbcset->non_match)))
3314     {
3315       bin_tree_t *mbc_tree;
3316       int sbc_idx;
3317       /* Build a tree for complex bracket.  */
3318       dfa->has_mb_node = 1;
3319       br_token.type = COMPLEX_BRACKET;
3320       br_token.opr.mbcset = mbcset;
3321       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322       if (BE (mbc_tree == NULL, 0))
3323         goto parse_bracket_exp_espace;
3324       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3325         if (sbcset[sbc_idx])
3326           break;
3327       /* If there are no bits set in sbcset, there is no point
3328          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3329       if (sbc_idx < BITSET_WORDS)
3330         {
3331           /* Build a tree for simple bracket.  */
3332           br_token.type = SIMPLE_BRACKET;
3333           br_token.opr.sbcset = sbcset;
3334           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3335           if (BE (work_tree == NULL, 0))
3336             goto parse_bracket_exp_espace;
3337
3338           /* Then join them by ALT node.  */
3339           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3340           if (BE (work_tree == NULL, 0))
3341             goto parse_bracket_exp_espace;
3342         }
3343       else
3344         {
3345           re_free (sbcset);
3346           work_tree = mbc_tree;
3347         }
3348     }
3349   else
3350 #endif /* not RE_ENABLE_I18N */
3351     {
3352 #ifdef RE_ENABLE_I18N
3353       free_charset (mbcset);
3354 #endif
3355       /* Build a tree for simple bracket.  */
3356       br_token.type = SIMPLE_BRACKET;
3357       br_token.opr.sbcset = sbcset;
3358       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359       if (BE (work_tree == NULL, 0))
3360         goto parse_bracket_exp_espace;
3361     }
3362   return work_tree;
3363
3364  parse_bracket_exp_espace:
3365   *err = REG_ESPACE;
3366  parse_bracket_exp_free_return:
3367   re_free (sbcset);
3368 #ifdef RE_ENABLE_I18N
3369   free_charset (mbcset);
3370 #endif /* RE_ENABLE_I18N */
3371   return NULL;
3372 }
3373
3374 /* Parse an element in the bracket expression.  */
3375
3376 static reg_errcode_t
3377 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3378                        re_token_t *token, int token_len, re_dfa_t *dfa,
3379                        reg_syntax_t syntax, bool accept_hyphen)
3380 {
3381 #ifdef RE_ENABLE_I18N
3382   int cur_char_size;
3383   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3384   if (cur_char_size > 1)
3385     {
3386       elem->type = MB_CHAR;
3387       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3388       re_string_skip_bytes (regexp, cur_char_size);
3389       return REG_NOERROR;
3390     }
3391 #endif /* RE_ENABLE_I18N */
3392   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3393   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3394       || token->type == OP_OPEN_EQUIV_CLASS)
3395     return parse_bracket_symbol (elem, regexp, token);
3396   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3397     {
3398       /* A '-' must only appear as anything but a range indicator before
3399          the closing bracket.  Everything else is an error.  */
3400       re_token_t token2;
3401       (void) peek_token_bracket (&token2, regexp, syntax);
3402       if (token2.type != OP_CLOSE_BRACKET)
3403         /* The actual error value is not standardized since this whole
3404            case is undefined.  But ERANGE makes good sense.  */
3405         return REG_ERANGE;
3406     }
3407   elem->type = SB_CHAR;
3408   elem->opr.ch = token->opr.c;
3409   return REG_NOERROR;
3410 }
3411
3412 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3413    such as [:<character_class>:], [.<collating_element>.], and
3414    [=<equivalent_class>=].  */
3415
3416 static reg_errcode_t
3417 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3418                       re_token_t *token)
3419 {
3420   unsigned char ch, delim = token->opr.c;
3421   int i = 0;
3422   if (re_string_eoi(regexp))
3423     return REG_EBRACK;
3424   for (;; ++i)
3425     {
3426       if (i >= BRACKET_NAME_BUF_SIZE)
3427         return REG_EBRACK;
3428       if (token->type == OP_OPEN_CHAR_CLASS)
3429         ch = re_string_fetch_byte_case (regexp);
3430       else
3431         ch = re_string_fetch_byte (regexp);
3432       if (re_string_eoi(regexp))
3433         return REG_EBRACK;
3434       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435         break;
3436       elem->opr.name[i] = ch;
3437     }
3438   re_string_skip_bytes (regexp, 1);
3439   elem->opr.name[i] = '\0';
3440   switch (token->type)
3441     {
3442     case OP_OPEN_COLL_ELEM:
3443       elem->type = COLL_SYM;
3444       break;
3445     case OP_OPEN_EQUIV_CLASS:
3446       elem->type = EQUIV_CLASS;
3447       break;
3448     case OP_OPEN_CHAR_CLASS:
3449       elem->type = CHAR_CLASS;
3450       break;
3451     default:
3452       break;
3453     }
3454   return REG_NOERROR;
3455 }
3456
3457   /* Helper function for parse_bracket_exp.
3458      Build the equivalence class which is represented by NAME.
3459      The result are written to MBCSET and SBCSET.
3460      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461      is a pointer argument since we may update it.  */
3462
3463 static reg_errcode_t
3464 #ifdef RE_ENABLE_I18N
3465 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3466                    Idx *equiv_class_alloc, const unsigned char *name)
3467 #else /* not RE_ENABLE_I18N */
3468 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3469 #endif /* not RE_ENABLE_I18N */
3470 {
3471 #ifdef _LIBC
3472   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3473   if (nrules != 0)
3474     {
3475       const int32_t *table, *indirect;
3476       const unsigned char *weights, *extra, *cp;
3477       unsigned char char_buf[2];
3478       int32_t idx1, idx2;
3479       unsigned int ch;
3480       size_t len;
3481       /* This #include defines a local function!  */
3482 # include <locale/weight.h>
3483       /* Calculate the index for equivalence class.  */
3484       cp = name;
3485       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3486       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3487                                                _NL_COLLATE_WEIGHTMB);
3488       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489                                                    _NL_COLLATE_EXTRAMB);
3490       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3491                                                 _NL_COLLATE_INDIRECTMB);
3492       idx1 = findidx (&cp, -1);
3493       if (BE (idx1 == 0 || *cp != '\0', 0))
3494         /* This isn't a valid character.  */
3495         return REG_ECOLLATE;
3496
3497       /* Build single byte matching table for this equivalence class.  */
3498       len = weights[idx1 & 0xffffff];
3499       for (ch = 0; ch < SBC_MAX; ++ch)
3500         {
3501           char_buf[0] = ch;
3502           cp = char_buf;
3503           idx2 = findidx (&cp, 1);
3504 /*
3505           idx2 = table[ch];
3506 */
3507           if (idx2 == 0)
3508             /* This isn't a valid character.  */
3509             continue;
3510           /* Compare only if the length matches and the collation rule
3511              index is the same.  */
3512           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3513             {
3514               int cnt = 0;
3515
3516               while (cnt <= len &&
3517                      weights[(idx1 & 0xffffff) + 1 + cnt]
3518                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3519                 ++cnt;
3520
3521               if (cnt > len)
3522                 bitset_set (sbcset, ch);
3523             }
3524         }
3525       /* Check whether the array has enough space.  */
3526       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527         {
3528           /* Not enough, realloc it.  */
3529           /* +1 in case of mbcset->nequiv_classes is 0.  */
3530           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531           /* Use realloc since the array is NULL if *alloc == 0.  */
3532           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533                                                    int32_t,
3534                                                    new_equiv_class_alloc);
3535           if (BE (new_equiv_classes == NULL, 0))
3536             return REG_ESPACE;
3537           mbcset->equiv_classes = new_equiv_classes;
3538           *equiv_class_alloc = new_equiv_class_alloc;
3539         }
3540       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541     }
3542   else
3543 #endif /* _LIBC */
3544     {
3545       if (BE (strlen ((const char *) name) != 1, 0))
3546         return REG_ECOLLATE;
3547       bitset_set (sbcset, *name);
3548     }
3549   return REG_NOERROR;
3550 }
3551
3552   /* Helper function for parse_bracket_exp.
3553      Build the character class which is represented by NAME.
3554      The result are written to MBCSET and SBCSET.
3555      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556      is a pointer argument since we may update it.  */
3557
3558 static reg_errcode_t
3559 #ifdef RE_ENABLE_I18N
3560 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3561                  re_charset_t *mbcset, Idx *char_class_alloc,
3562                  const char *class_name, reg_syntax_t syntax)
3563 #else /* not RE_ENABLE_I18N */
3564 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565                  const char *class_name, reg_syntax_t syntax)
3566 #endif /* not RE_ENABLE_I18N */
3567 {
3568   int i;
3569   const char *name = class_name;
3570
3571   /* In case of REG_ICASE "upper" and "lower" match the both of
3572      upper and lower cases.  */
3573   if ((syntax & RE_ICASE)
3574       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3575     name = "alpha";
3576
3577 #ifdef RE_ENABLE_I18N
3578   /* Check the space of the arrays.  */
3579   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3580     {
3581       /* Not enough, realloc it.  */
3582       /* +1 in case of mbcset->nchar_classes is 0.  */
3583       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3584       /* Use realloc since array is NULL if *alloc == 0.  */
3585       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3586                                                new_char_class_alloc);
3587       if (BE (new_char_classes == NULL, 0))
3588         return REG_ESPACE;
3589       mbcset->char_classes = new_char_classes;
3590       *char_class_alloc = new_char_class_alloc;
3591     }
3592   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3593 #endif /* RE_ENABLE_I18N */
3594
3595 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3596   do {                                          \
3597     if (BE (trans != NULL, 0))                  \
3598       {                                         \
3599         for (i = 0; i < SBC_MAX; ++i)           \
3600           if (ctype_func (i))                   \
3601             bitset_set (sbcset, trans[i]);      \
3602       }                                         \
3603     else                                        \
3604       {                                         \
3605         for (i = 0; i < SBC_MAX; ++i)           \
3606           if (ctype_func (i))                   \
3607             bitset_set (sbcset, i);             \
3608       }                                         \
3609   } while (0)
3610
3611   if (strcmp (name, "alnum") == 0)
3612     BUILD_CHARCLASS_LOOP (isalnum);
3613   else if (strcmp (name, "cntrl") == 0)
3614     BUILD_CHARCLASS_LOOP (iscntrl);
3615   else if (strcmp (name, "lower") == 0)
3616     BUILD_CHARCLASS_LOOP (islower);
3617   else if (strcmp (name, "space") == 0)
3618     BUILD_CHARCLASS_LOOP (isspace);
3619   else if (strcmp (name, "alpha") == 0)
3620     BUILD_CHARCLASS_LOOP (isalpha);
3621   else if (strcmp (name, "digit") == 0)
3622     BUILD_CHARCLASS_LOOP (isdigit);
3623   else if (strcmp (name, "print") == 0)
3624     BUILD_CHARCLASS_LOOP (isprint);
3625   else if (strcmp (name, "upper") == 0)
3626     BUILD_CHARCLASS_LOOP (isupper);
3627   else if (strcmp (name, "blank") == 0)
3628     BUILD_CHARCLASS_LOOP (isblank);
3629   else if (strcmp (name, "graph") == 0)
3630     BUILD_CHARCLASS_LOOP (isgraph);
3631   else if (strcmp (name, "punct") == 0)
3632     BUILD_CHARCLASS_LOOP (ispunct);
3633   else if (strcmp (name, "xdigit") == 0)
3634     BUILD_CHARCLASS_LOOP (isxdigit);
3635   else
3636     return REG_ECTYPE;
3637
3638   return REG_NOERROR;
3639 }
3640
3641 static bin_tree_t *
3642 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3643                     const char *class_name,
3644                     const char *extra, bool non_match,
3645                     reg_errcode_t *err)
3646 {
3647   re_bitset_ptr_t sbcset;
3648 #ifdef RE_ENABLE_I18N
3649   re_charset_t *mbcset;
3650   Idx alloc = 0;
3651 #endif /* not RE_ENABLE_I18N */
3652   reg_errcode_t ret;
3653   re_token_t br_token;
3654   bin_tree_t *tree;
3655
3656   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3657 #ifdef RE_ENABLE_I18N
3658   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659 #endif /* RE_ENABLE_I18N */
3660
3661 #ifdef RE_ENABLE_I18N
3662   if (BE (sbcset == NULL || mbcset == NULL, 0))
3663 #else /* not RE_ENABLE_I18N */
3664   if (BE (sbcset == NULL, 0))
3665 #endif /* not RE_ENABLE_I18N */
3666     {
3667       *err = REG_ESPACE;
3668       return NULL;
3669     }
3670
3671   if (non_match)
3672     {
3673 #ifdef RE_ENABLE_I18N
3674       mbcset->non_match = 1;
3675 #endif /* not RE_ENABLE_I18N */
3676     }
3677
3678   /* We don't care the syntax in this case.  */
3679   ret = build_charclass (trans, sbcset,
3680 #ifdef RE_ENABLE_I18N
3681                          mbcset, &alloc,
3682 #endif /* RE_ENABLE_I18N */
3683                          class_name, 0);
3684
3685   if (BE (ret != REG_NOERROR, 0))
3686     {
3687       re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689       free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691       *err = ret;
3692       return NULL;
3693     }
3694   /* \w match '_' also.  */
3695   for (; *extra; extra++)
3696     bitset_set (sbcset, *extra);
3697
3698   /* If it is non-matching list.  */
3699   if (non_match)
3700     bitset_not (sbcset);
3701
3702 #ifdef RE_ENABLE_I18N
3703   /* Ensure only single byte characters are set.  */
3704   if (dfa->mb_cur_max > 1)
3705     bitset_mask (sbcset, dfa->sb_char);
3706 #endif
3707
3708   /* Build a tree for simple bracket.  */
3709   br_token.type = SIMPLE_BRACKET;
3710   br_token.opr.sbcset = sbcset;
3711   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712   if (BE (tree == NULL, 0))
3713     goto build_word_op_espace;
3714
3715 #ifdef RE_ENABLE_I18N
3716   if (dfa->mb_cur_max > 1)
3717     {
3718       bin_tree_t *mbc_tree;
3719       /* Build a tree for complex bracket.  */
3720       br_token.type = COMPLEX_BRACKET;
3721       br_token.opr.mbcset = mbcset;
3722       dfa->has_mb_node = 1;
3723       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724       if (BE (mbc_tree == NULL, 0))
3725         goto build_word_op_espace;
3726       /* Then join them by ALT node.  */
3727       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728       if (BE (mbc_tree != NULL, 1))
3729         return tree;
3730     }
3731   else
3732     {
3733       free_charset (mbcset);
3734       return tree;
3735     }
3736 #else /* not RE_ENABLE_I18N */
3737   return tree;
3738 #endif /* not RE_ENABLE_I18N */
3739
3740  build_word_op_espace:
3741   re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743   free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745   *err = REG_ESPACE;
3746   return NULL;
3747 }
3748
3749 /* This is intended for the expressions like "a{1,3}".
3750    Fetch a number from 'input', and return the number.
3751    Return REG_MISSING if the number field is empty like "{,1}".
3752    Return RE_DUP_MAX + 1 if the number field is too large.
3753    Return REG_ERROR if an error occurred.  */
3754
3755 static Idx
3756 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3757 {
3758   Idx num = REG_MISSING;
3759   unsigned char c;
3760   while (1)
3761     {
3762       fetch_token (token, input, syntax);
3763       c = token->opr.c;
3764       if (BE (token->type == END_OF_RE, 0))
3765         return REG_ERROR;
3766       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3767         break;
3768       num = ((token->type != CHARACTER || c < '0' || '9' < c
3769               || num == REG_ERROR)
3770              ? REG_ERROR
3771              : num == REG_MISSING
3772              ? c - '0'
3773              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3774     }
3775   return num;
3776 }
3777 \f
3778 #ifdef RE_ENABLE_I18N
3779 static void
3780 free_charset (re_charset_t *cset)
3781 {
3782   re_free (cset->mbchars);
3783 # ifdef _LIBC
3784   re_free (cset->coll_syms);
3785   re_free (cset->equiv_classes);
3786   re_free (cset->range_starts);
3787   re_free (cset->range_ends);
3788 # endif
3789   re_free (cset->char_classes);
3790   re_free (cset);
3791 }
3792 #endif /* RE_ENABLE_I18N */
3793 \f
3794 /* Functions for binary tree operation.  */
3795
3796 /* Create a tree node.  */
3797
3798 static bin_tree_t *
3799 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800              re_token_type_t type)
3801 {
3802   re_token_t t;
3803   t.type = type;
3804   return create_token_tree (dfa, left, right, &t);
3805 }
3806
3807 static bin_tree_t *
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809                    const re_token_t *token)
3810 {
3811   bin_tree_t *tree;
3812   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813     {
3814       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816       if (storage == NULL)
3817         return NULL;
3818       storage->next = dfa->str_tree_storage;
3819       dfa->str_tree_storage = storage;
3820       dfa->str_tree_storage_idx = 0;
3821     }
3822   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824   tree->parent = NULL;
3825   tree->left = left;
3826   tree->right = right;
3827   tree->token = *token;
3828   tree->token.duplicated = 0;
3829   tree->token.opt_subexp = 0;
3830   tree->first = NULL;
3831   tree->next = NULL;
3832   tree->node_idx = REG_MISSING;
3833
3834   if (left != NULL)
3835     left->parent = tree;
3836   if (right != NULL)
3837     right->parent = tree;
3838   return tree;
3839 }
3840
3841 /* Mark the tree SRC as an optional subexpression.
3842    To be called from preorder or postorder.  */
3843
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3846 {
3847   Idx idx = (uintptr_t) extra;
3848   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849     node->token.opt_subexp = 1;
3850
3851   return REG_NOERROR;
3852 }
3853
3854 /* Free the allocated memory inside NODE. */
3855
3856 static void
3857 free_token (re_token_t *node)
3858 {
3859 #ifdef RE_ENABLE_I18N
3860   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861     free_charset (node->opr.mbcset);
3862   else
3863 #endif /* RE_ENABLE_I18N */
3864     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865       re_free (node->opr.sbcset);
3866 }
3867
3868 /* Worker function for tree walking.  Free the allocated memory inside NODE
3869    and its children. */
3870
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3873 {
3874   free_token (&node->token);
3875   return REG_NOERROR;
3876 }
3877
3878
3879 /* Duplicate the node SRC, and return new node.  This is a preorder
3880    visit similar to the one implemented by the generic visitor, but
3881    we need more infrastructure to maintain two parallel trees --- so,
3882    it's easier to duplicate.  */
3883
3884 static bin_tree_t *
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886 {
3887   const bin_tree_t *node;
3888   bin_tree_t *dup_root;
3889   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891   for (node = root; ; )
3892     {
3893       /* Create a new tree and link it back to the current parent.  */
3894       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895       if (*p_new == NULL)
3896         return NULL;
3897       (*p_new)->parent = dup_node;
3898       (*p_new)->token.duplicated = 1;
3899       dup_node = *p_new;
3900
3901       /* Go to the left node, or up and to the right.  */
3902       if (node->left)
3903         {
3904           node = node->left;
3905           p_new = &dup_node->left;
3906         }
3907       else
3908         {
3909           const bin_tree_t *prev = NULL;
3910           while (node->right == prev || node->right == NULL)
3911             {
3912               prev = node;
3913               node = node->parent;
3914               dup_node = dup_node->parent;
3915               if (!node)
3916                 return dup_root;
3917             }
3918           node = node->right;
3919           p_new = &dup_node->right;
3920         }
3921     }
3922 }